# Delay strategy used between connection retries.
DelayMethod = Literal["human", "robot"]


@dataclass
class APIConfig:
    """Connection details for one version of the OpenML REST API."""

    server: str  # e.g. "https://www.openml.org/"
    base_url: str  # path prefix appended to `server`
    key: str  # API key sent with authenticated calls
    timeout: int = 10  # seconds


@dataclass
class APISettings:
    """API configurations for both supported server versions."""

    v1: APIConfig
    v2: APIConfig


@dataclass
class ConnectionConfig:
    """Retry behaviour for failed HTTP calls."""

    retries: int = 3
    delay_method: DelayMethod = "human"
    delay_time: int = 1  # seconds

    def __post_init__(self) -> None:
        # Literal annotations are not enforced at runtime, so validate here.
        if self.delay_method not in ("human", "robot"):
            raise ValueError(f"delay_method must be 'human' or 'robot', got {self.delay_method}")


@dataclass
class CacheConfig:
    """Where cached server responses live and how long they stay valid."""

    dir: str = "~/.openml/cache"
    ttl: int = 60 * 60 * 24 * 7  # one week


@dataclass
class Settings:
    """Top-level bundle of all client configuration."""

    api: APISettings
    connection: ConnectionConfig
    cache: CacheConfig


# Module-level singleton read by the HTTP client and cache mixin.
settings = Settings(
    api=APISettings(
        v1=APIConfig(
            server="https://www.openml.org/",
            base_url="api/v1/xml/",
            key="...",
        ),
        v2=APIConfig(
            server="http://127.0.0.1:8001/",
            base_url="",
            key="...",
        ),
    ),
    connection=ConnectionConfig(),
    cache=CacheConfig(),
)
class CacheMixin:
    """Filesystem-cache helpers mixed into HTTP clients.

    Only the cache-path layout is implemented; reading and writing cached
    responses are still stubs (see the TODOs below and in HTTPClient).
    """

    @property
    def dir(self) -> str:
        # Cache root directory from the global settings.
        return settings.cache.dir

    @property
    def ttl(self) -> int:
        # Cache time-to-live, in seconds.
        return settings.cache.ttl

    def _get_cache_dir(self, url: str, params: dict[str, Any]) -> Path:
        """Map a request (URL + query params) to a unique cache directory.

        Layout: <cache root>/<reversed domain>/<url path>/<encoded params>,
        e.g. https://www.openml.org/api/v1/xml/data/2 maps to
        <root>/org/openml/www/api/v1/xml/data/2.
        """
        parsed_url = urlparse(url)
        netloc_parts = parsed_url.netloc.split(".")[::-1]  # reverse domain
        path_parts = parsed_url.path.strip("/").split("/")

        # The API key must never leak into on-disk paths; the remaining
        # params are serialized into a single path component.
        filtered_params = {k: v for k, v in params.items() if k != "api_key"}
        params_part = [urlencode(filtered_params)] if filtered_params else []

        return Path(self.dir).joinpath(*netloc_parts, *path_parts, *params_part)

    def _get_cache_response(self, cache_dir: Path) -> "Response":  # noqa: ARG002
        # Stub: should load a cached response from cache_dir, honouring
        # self.ttl, and raise on a miss/expiry.
        return Response()

    def _set_cache_response(self, cache_dir: Path, response: "Response") -> None:  # noqa: ARG002
        # Stub: should persist `response` under cache_dir.
        return None


class HTTPClient(CacheMixin):
    """Thin wrapper around `requests`, bound to one APIConfig (server + key)."""

    def __init__(self, config: "APIConfig") -> None:
        self.config = config
        self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}

    @property
    def server(self) -> str:
        return self.config.server

    @property
    def base_url(self) -> str:
        return self.config.base_url

    @property
    def key(self) -> str:
        return self.config.key

    @property
    def timeout(self) -> int:
        return self.config.timeout

    def request(
        self,
        method: str,
        path: str,
        *,
        use_cache: bool = False,
        use_api_key: bool = False,
        **request_kwargs: Any,
    ) -> "Response":
        """Send an HTTP request for `path` relative to the configured server.

        Parameters
        ----------
        method : str
            HTTP verb, passed straight to `requests.request`.
        path : str
            Path joined onto `server` + `base_url`.
        use_cache : bool
            If True, answer from the (stub) cache instead of the network.
        use_api_key : bool
            If True, add the configured API key as the `api_key` query param.
        **request_kwargs
            Remaining keyword arguments forwarded to `requests.request`;
            `params`, `headers` and `timeout` are extracted and merged.

        Returns
        -------
        requests.Response
        """
        url = urljoin(self.server, urljoin(self.base_url, path))

        # Copy caller-supplied dicts so we never mutate their arguments.
        params = request_kwargs.pop("params", {}).copy()
        if use_api_key:
            params["api_key"] = self.key

        headers = request_kwargs.pop("headers", {}).copy()
        # NOTE(review): client headers overwrite caller-supplied ones (the
        # user-agent is enforced); confirm this precedence is intended.
        headers.update(self.headers)

        timeout = request_kwargs.pop("timeout", self.timeout)
        cache_dir = self._get_cache_dir(url, params)

        if use_cache:
            # TODO: handle cache misses / ttl expiry once the cache is real.
            # (The previous `try/except Exception: raise` was a no-op.)
            return self._get_cache_response(cache_dir)

        response = requests.request(
            method=method,
            url=url,
            params=params,
            headers=headers,
            timeout=timeout,
            **request_kwargs,
        )

        if use_cache:
            self._set_cache_response(cache_dir, response)

        return response

    def get(
        self,
        path: str,
        *,
        use_cache: bool = False,
        use_api_key: bool = False,
        **request_kwargs: Any,
    ) -> "Response":
        """GET `path`. Caching is accepted but force-disabled until implemented."""
        # TODO: remove this override when the cache is implemented.
        use_cache = False
        return self.request(
            method="GET",
            path=path,
            use_cache=use_cache,
            use_api_key=use_api_key,
            **request_kwargs,
        )

    def post(
        self,
        path: str,
        **request_kwargs: Any,
    ) -> "Response":
        """POST `path`; always authenticated with the API key, never cached."""
        return self.request(
            method="POST",
            path=path,
            use_cache=False,
            use_api_key=True,
            **request_kwargs,
        )

    def delete(
        self,
        path: str,
        **request_kwargs: Any,
    ) -> "Response":
        """DELETE `path`; always authenticated with the API key, never cached."""
        return self.request(
            method="DELETE",
            path=path,
            use_cache=False,
            use_api_key=True,
            **request_kwargs,
        )
a/openml/_api/resources/base.py b/openml/_api/resources/base.py new file mode 100644 index 000000000..990dda998 --- /dev/null +++ b/openml/_api/resources/base.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any +from typing_extensions import Literal + +if TYPE_CHECKING: + import pandas as pd + from requests import Response + + from openml._api.http import HTTPClient + from openml.datasets.dataset import OpenMLDataset + from openml.tasks.task import OpenMLTask + + +class ResourceAPI: + def __init__(self, http: HTTPClient): + self._http = http + + +class DatasetsAPI(ResourceAPI, ABC): + @abstractmethod + def get( + self, + dataset_id: int | str, + *, + return_response: bool = False, + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... + + @abstractmethod + def list( + self, + limit: int, + offset: int, + *, + data_id: list[int] | None = None, # type: ignore + **kwargs: Any, + ) -> pd.DataFrame: ... + + @abstractmethod + def delete(self, dataset_id: int) -> bool: ... + + @abstractmethod + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: ... + + @abstractmethod + def fork(self, data_id: int) -> int: ... + + @abstractmethod + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ... + + @abstractmethod + def list_qualities(self) -> list[str]: ... # type: ignore + + @abstractmethod + def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... 
+ + @abstractmethod + def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... + + +class TasksAPI(ResourceAPI, ABC): + @abstractmethod + def get( + self, + task_id: int, + *, + return_response: bool = False, + ) -> OpenMLTask | tuple[OpenMLTask, Response]: ... diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py new file mode 100644 index 000000000..845212b20 --- /dev/null +++ b/openml/_api/resources/datasets.py @@ -0,0 +1,650 @@ +from __future__ import annotations + +from collections import OrderedDict +from typing import TYPE_CHECKING, Any +from typing_extensions import Literal + +if TYPE_CHECKING: + from requests import Response + +import pandas as pd +import xmltodict + +import openml.utils +from openml._api.resources.base import DatasetsAPI +from openml.datasets.dataset import OpenMLDataset + + +class DatasetsV1(DatasetsAPI): + def get( + self, + dataset_id: int | str, + *, + return_response: bool = False, + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + path = f"data/{dataset_id}" + response = self._http.get(path) + xml_content = response.text + dataset = self._create_dataset_from_xml(xml_content) + + if return_response: + return dataset, response + + return dataset + + def list( + self, + limit: int, + offset: int, + *, + data_id: list[int] | None = None, # type: ignore + **kwargs: Any, + ) -> pd.DataFrame: + """ + Perform api call to return a list of all datasets. + + Parameters + ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. 
+ data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values. + + Returns + ------- + datasets : dataframe + """ + api_call = "data/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" + + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + api_call += f"/{operator}/{value}" + if data_id is not None: + api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" + + xml_string = self._http.get(api_call).text + datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) + + # Minimalistic check if the XML is useful + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], + ) + assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ + "oml:data" + ]["@xmlns:oml"] + + datasets = {} + for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: + ignore_attribute = ["oml:file_id", "oml:quality"] + dataset = { + k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute + } + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("oml:quality", []): + try: + dataset[quality["@name"]] = int(quality["#text"]) + except ValueError: + dataset[quality["@name"]] = float(quality["#text"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + + def delete(self, dataset_id: int) -> bool: + """Delete dataset with id `dataset_id` from the OpenML server. 
+ + This can only be done if you are the owner of the dataset and + no tasks are attached to the dataset. + + Parameters + ---------- + dataset_id : int + OpenML id of the dataset + + Returns + ------- + bool + True if the deletion was successful. False otherwise. + """ + return openml.utils._delete_entity("data", dataset_id) + + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + """Edits an OpenMLDataset. + + In addition to providing the dataset id of the dataset to edit (through data_id), + you must specify a value for at least one of the optional function arguments, + i.e. one value for a field to edit. + + This function allows editing of both non-critical and critical fields. + Critical fields are default_target_attribute, ignore_attribute, row_id_attribute. + + - Editing non-critical data fields is allowed for all authenticated users. + - Editing critical fields is allowed only for the owner, provided there are no tasks + associated with this dataset. + + If dataset has tasks or if the user is not the owner, the only way + to edit critical fields is to use fork_dataset followed by edit_dataset. + + Parameters + ---------- + data_id : int + ID of the dataset. + description : str + Description of the dataset. + creator : str + The person who created the dataset. + contributor : str + People who contributed to the current version of the dataset. + collection_date : str + The date the data was originally collected, given by the uploader. + language : str + Language in which the data is represented. 
+ Starts with 1 upper case letter, rest lower case, e.g. 'English'. + default_target_attribute : str + The default target attribute, if it exists. + Can have multiple values, comma separated. + ignore_attribute : str | list + Attributes that should be excluded in modelling, + such as identifiers and indexes. + citation : str + Reference(s) that should be cited when building on this data. + row_id_attribute : str, optional + The attribute that represents the row-id column, if present in the + dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not + specified, the index of the dataframe will be used as the + ``row_id_attribute``. If the name of the index is ``None``, it will + be discarded. + + .. versionadded: 0.8 + Inference of ``row_id_attribute`` from a dataframe. + original_data_url : str, optional + For derived data, the url to the original dataset. + paper_url : str, optional + Link to a paper describing the dataset. + + Returns + ------- + Dataset id + """ + if not isinstance(data_id, int): + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + + # compose data edit parameters as xml + form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE + xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' + xml["oml:data_edit_parameters"] = OrderedDict() + xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" + xml["oml:data_edit_parameters"]["oml:description"] = description + xml["oml:data_edit_parameters"]["oml:creator"] = creator + xml["oml:data_edit_parameters"]["oml:contributor"] = contributor + xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date + xml["oml:data_edit_parameters"]["oml:language"] = language + xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute + xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute + xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute + 
xml["oml:data_edit_parameters"]["oml:citation"] = citation + xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url + xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url + + # delete None inputs + for k in list(xml["oml:data_edit_parameters"]): + if not xml["oml:data_edit_parameters"][k]: + del xml["oml:data_edit_parameters"][k] + + file_elements = { + "edit_parameters": ("description.xml", xmltodict.unparse(xml)), + } # type: openml._api_calls.FILE_ELEMENTS_TYPE + result_xml = self._http.post("data/edit", data=form_data, files=file_elements).text + result = xmltodict.parse(result_xml) + data_id = result["oml:data_edit"]["oml:id"] + return int(data_id) + + def fork(self, data_id: int) -> int: + """ + Creates a new dataset version, with the authenticated user as the new owner. + The forked dataset can have distinct dataset meta-data, + but the actual data itself is shared with the original version. + + This API is intended for use when a user is unable to edit the critical fields of a dataset + through the edit_dataset API. + (Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.) + + Specifically, this happens when the user is: + 1. Not the owner of the dataset. + 2. User is the owner of the dataset, but the dataset has tasks. + + In these two cases the only way to edit critical fields is: + 1. STEP 1: Fork the dataset using fork_dataset API + 2. STEP 2: Call edit_dataset API on the forked version. 
+ + + Parameters + ---------- + data_id : int + id of the dataset to be forked + + Returns + ------- + Dataset id of the forked dataset + + """ + if not isinstance(data_id, int): + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + # compose data fork parameters + form_data = {"data_id": data_id} + result_xml = self._http.post("data/fork", data=form_data).text + result = xmltodict.parse(result_xml) + data_id = result["oml:data_fork"]["oml:id"] + return int(data_id) + + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. + Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + data_id : int + The data id of the dataset + status : str, + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} + result_xml = self._http.post("data/status/update", data=data).text + result = xmltodict.parse(result_xml) + server_data_id = result["oml:data_status_update"]["oml:id"] + server_status = result["oml:data_status_update"]["oml:status"] + if status != server_status or int(data_id) != int(server_data_id): + # This should never happen + raise ValueError("Data id/status does not collide") + + def list_qualities(self) -> list[str]: # type: ignore + """Return list of data qualities available. + + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. 
+ + Returns + ------- + list + """ + api_call = "data/qualities/list" + xml_string = self._http.get(api_call).text + qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) + # Minimalistic check if the XML is useful + if "oml:data_qualities_list" not in qualities: + raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') + + if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): + raise TypeError('Error in return XML, does not contain "oml:quality" as a list') + + return qualities["oml:data_qualities_list"]["oml:quality"] + + def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: + """Create a dataset given a xml string. + + Parameters + ---------- + xml : string + Dataset xml representation. + + Returns + ------- + OpenMLDataset + """ + description = xmltodict.parse(xml)["oml:data_set_description"] + + # TODO file path after download, cache_format default = 'pickle' + arff_file = None + features_file = None + parquet_file = None + qualities_file = None + + return OpenMLDataset( + description["oml:name"], + description.get("oml:description"), + data_format=description["oml:format"], + dataset_id=int(description["oml:id"]), + version=int(description["oml:version"]), + creator=description.get("oml:creator"), + contributor=description.get("oml:contributor"), + collection_date=description.get("oml:collection_date"), + upload_date=description.get("oml:upload_date"), + language=description.get("oml:language"), + licence=description.get("oml:licence"), + url=description["oml:url"], + default_target_attribute=description.get("oml:default_target_attribute"), + row_id_attribute=description.get("oml:row_id_attribute"), + ignore_attribute=description.get("oml:ignore_attribute"), + version_label=description.get("oml:version_label"), + citation=description.get("oml:citation"), + tag=description.get("oml:tag"), + visibility=description.get("oml:visibility"), + 
original_data_url=description.get("oml:original_data_url"), + paper_url=description.get("oml:paper_url"), + update_comment=description.get("oml:update_comment"), + md5_checksum=description.get("oml:md5_checksum"), + data_file=str(arff_file) if arff_file is not None else None, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=description.get("oml:parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, + ) + + def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + """ + An ontology describes the concept that are described in a feature. An + ontology is defined by an URL where the information is provided. Adds + an ontology (URL) to a given dataset feature (defined by a dataset id + and index). The dataset has to exists on OpenML and needs to have been + processed by the evaluation engine. + + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = { + "data_id": data_id, + "index": index, + "ontology": ontology, + } + self._http.post("data/feature/ontology/add", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + """ + Removes an existing ontology (URL) from a given dataset feature (defined + by a dataset id and index). The dataset has to exists on OpenML and needs + to have been processed by the evaluation engine. Ontology needs to be + attached to the specific fearure. 
+ + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = { + "data_id": data_id, + "index": index, + "ontology": ontology, + } + self._http.post("data/feature/ontology/remove", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + +class DatasetsV2(DatasetsAPI): + def get( + self, + dataset_id: int | str, + *, + return_response: bool = False, + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + path = f"data/{dataset_id}" + response = self._http.get(path) + json_content = response.json() + dataset = self._create_dataset_from_json(json_content) + + if return_response: + return dataset, response + + return dataset + + def list( + self, + limit: int, + offset: int, + **kwargs: Any, + ) -> pd.DataFrame: + """ + Perform api call to return a list of all datasets. + + Parameters + ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. + data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values, data_id. 
class DatasetsV2(DatasetsAPI):
    """Dataset endpoints for the v2 (JSON) OpenML API.

    Only the read operations are implemented; the write operations raise
    NotImplementedError until the v2 server supports them.
    """

    def get(
        self,
        dataset_id: "int | str",
        *,
        return_response: bool = False,
    ) -> "OpenMLDataset | tuple[OpenMLDataset, Response]":
        """Download a dataset description and parse it into an OpenMLDataset."""
        path = f"data/{dataset_id}"
        response = self._http.get(path)
        dataset = self._create_dataset_from_json(response.json())

        if return_response:
            return dataset, response

        return dataset

    def list(
        self,
        limit: int,
        offset: int,
        *,
        data_id: "list[int] | None" = None,  # type: ignore
        **kwargs: Any,
    ) -> "pd.DataFrame":
        """
        Perform api call to return a list of all datasets.

        Parameters
        ----------
        limit : int
            The maximum number of datasets to show.
        offset : int
            The number of datasets to skip, starting from the first.
        data_id : list, optional
            Restrict the listing to these dataset ids.
        kwargs : dict, optional
            Legal filter operators (keys in the dict):
            tag, status, limit, offset, data_name, data_version, number_instances,
            number_features, number_classes, number_missing_values.

        Returns
        -------
        datasets : dataframe
        """
        # `payload` (not `json`) to avoid shadowing the stdlib module name.
        payload: dict[str, Any] = {"pagination": {}}

        if limit is not None:
            payload["pagination"]["limit"] = limit
        if offset is not None:
            payload["pagination"]["offset"] = offset

        for operator, value in kwargs.items():
            if value is not None:
                payload[operator] = value
        if data_id is not None:
            payload["data_id"] = [int(i) for i in data_id]

        api_call = "datasets/list"
        datasets_list = self._http.post(api_call, json=payload).json()
        # Minimalistic check if the JSON is useful
        assert isinstance(datasets_list, list), type(datasets_list)

        datasets = {}
        for dataset_ in datasets_list:
            ignore_attribute = ["file_id", "quality"]
            dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute}
            dataset["did"] = int(dataset["did"])
            dataset["version"] = int(dataset["version"])

            # A dataset can carry any number of precomputed quality measures.
            for quality in dataset_.get("quality", []):
                try:
                    dataset[quality["name"]] = int(quality["value"])
                except ValueError:
                    dataset[quality["name"]] = float(quality["value"])
            datasets[dataset["did"]] = dataset

        return pd.DataFrame.from_dict(datasets, orient="index").astype(
            {
                "did": int,
                "version": int,
                "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]),
            }
        )

    def delete(self, dataset_id: int) -> bool:
        raise NotImplementedError()

    def edit(  # noqa: PLR0913
        self,
        data_id: int,
        description: "str | None" = None,
        creator: "str | None" = None,
        contributor: "str | None" = None,
        collection_date: "str | None" = None,
        language: "str | None" = None,
        default_target_attribute: "str | None" = None,
        ignore_attribute: "str | list[str] | None" = None,  # type: ignore
        citation: "str | None" = None,
        row_id_attribute: "str | None" = None,
        original_data_url: "str | None" = None,
        paper_url: "str | None" = None,
    ) -> int:
        raise NotImplementedError()

    def fork(self, data_id: int) -> int:
        raise NotImplementedError()

    def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None:
        """
        Updates the status of a dataset to either 'active' or 'deactivated'.
        Please see the OpenML API documentation for a description of the status
        and all legal status transitions:
        https://docs.openml.org/concepts/data/#dataset-status

        Parameters
        ----------
        data_id : int
            The data id of the dataset
        status : str,
            'active' or 'deactivated'
        """
        legal_status = {"active", "deactivated"}
        if status not in legal_status:
            raise ValueError(f"Illegal status value. Legal values: {legal_status}")

        data = {"dataset_id": data_id, "status": status}
        result = self._http.post("datasets/status/update", json=data).json()
        server_data_id = result["dataset_id"]
        server_status = result["status"]
        if status != server_status or int(data_id) != int(server_data_id):
            # This should never happen
            raise ValueError("Server returned a different data id/status than requested.")

    def list_qualities(self) -> "list[str]":  # type: ignore
        """Return list of data qualities available.

        The function performs an API call to retrieve the entire list of
        data qualities that are computed on the datasets uploaded.

        Returns
        -------
        list
        """
        api_call = "datasets/qualities/list"
        qualities = self._http.get(api_call).json()
        # Minimalistic check if the JSON is useful
        if "data_qualities_list" not in qualities:
            raise ValueError('Error in return JSON, does not contain "data_qualities_list"')

        if not isinstance(qualities["data_qualities_list"]["quality"], list):
            raise TypeError('Error in return JSON, does not contain "quality" as a list')

        return qualities["data_qualities_list"]["quality"]

    def _create_dataset_from_json(self, json_content: dict) -> "OpenMLDataset":
        """Create a dataset given a json.

        Parameters
        ----------
        json_content : dict
            Dataset dict/json representation.

        Returns
        -------
        OpenMLDataset
        """
        # TODO file path after download, cache_format default = 'pickle'
        arff_file = None
        features_file = None
        parquet_file = None
        qualities_file = None

        return OpenMLDataset(
            json_content["name"],
            json_content.get("description"),
            data_format=json_content["format"],
            dataset_id=int(json_content["id"]),
            version=int(json_content["version"]),
            creator=json_content.get("creator"),
            contributor=json_content.get("contributor"),
            collection_date=json_content.get("collection_date"),
            upload_date=json_content.get("upload_date"),
            language=json_content.get("language"),
            licence=json_content.get("licence"),
            url=json_content["url"],
            default_target_attribute=json_content.get("default_target_attribute"),
            row_id_attribute=json_content.get("row_id_attribute"),
            ignore_attribute=json_content.get("ignore_attribute"),
            version_label=json_content.get("version_label"),
            citation=json_content.get("citation"),
            tag=json_content.get("tag"),
            visibility=json_content.get("visibility"),
            original_data_url=json_content.get("original_data_url"),
            paper_url=json_content.get("paper_url"),
            update_comment=json_content.get("update_comment"),
            md5_checksum=json_content.get("md5_checksum"),
            data_file=str(arff_file) if arff_file is not None else None,
            features_file=str(features_file) if features_file is not None else None,
            qualities_file=str(qualities_file) if qualities_file is not None else None,
            parquet_url=json_content.get("parquet_url"),
            parquet_file=str(parquet_file) if parquet_file is not None else None,
        )

    def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool:
        raise NotImplementedError()

    def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool:
        raise NotImplementedError()
class TasksV1(TasksAPI):
    """Task endpoints for the v1 (XML) OpenML API."""

    def get(
        self,
        task_id: int,
        *,
        return_response: bool = False,
    ) -> "OpenMLTask | tuple[OpenMLTask, Response]":
        """Fetch a task description and parse it into an OpenMLTask subclass."""
        response = self._http.get(f"task/{task_id}")
        task = self._create_task_from_xml(response.text)
        return (task, response) if return_response else task

    def _create_task_from_xml(self, xml: str) -> "OpenMLTask":
        """Create a task given a xml string.

        Parameters
        ----------
        xml : string
            Task xml representation.

        Returns
        -------
        OpenMLTask
        """
        task_xml = xmltodict.parse(xml)["oml:task"]

        # xmltodict yields a single OrderedDict for one <oml:input> and a
        # list for several; normalise both shapes into a name -> input map.
        raw_inputs = task_xml["oml:input"]
        inputs = {}
        if isinstance(raw_inputs, list):
            for entry in raw_inputs:
                inputs[entry["@name"]] = entry
        elif isinstance(raw_inputs, dict):
            inputs[raw_inputs["@name"]] = raw_inputs

        evaluation_measures = None
        if "evaluation_measures" in inputs:
            evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
                "oml:evaluation_measure"
            ]

        task_type = TaskType(int(task_xml["oml:task_type_id"]))
        common_kwargs = {
            "task_id": task_xml["oml:task_id"],
            "task_type": task_xml["oml:task_type"],
            "task_type_id": task_type,
            "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
            "evaluation_measure": evaluation_measures,
        }

        # TODO: add OpenMLClusteringTask?
        supervised_types = (
            TaskType.SUPERVISED_CLASSIFICATION,
            TaskType.SUPERVISED_REGRESSION,
            TaskType.LEARNING_CURVE,
        )
        if task_type in supervised_types:
            # Supervised tasks additionally carry an estimation procedure
            # (with parameters), a target feature and a splits URL.
            procedure = inputs["estimation_procedure"]["oml:estimation_procedure"]
            estimation_parameters = {
                parameter["@name"]: parameter.get("#text", "")
                for parameter in procedure["oml:parameter"]
            }

            common_kwargs["estimation_procedure_type"] = procedure["oml:type"]
            common_kwargs["estimation_procedure_id"] = int(procedure["oml:id"])
            common_kwargs["estimation_parameters"] = estimation_parameters
            common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"][
                "oml:target_feature"
            ]
            common_kwargs["data_splits_url"] = procedure["oml:data_splits_url"]

        task_classes = {
            TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
            TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
            TaskType.CLUSTERING: OpenMLClusteringTask,
            TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
        }
        task_class = task_classes.get(task_type)
        if task_class is None:
            raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.")
        return task_class(**common_kwargs)  # type: ignore


class TasksV2(TasksAPI):
    """Task endpoints for the v2 (JSON) API — not implemented yet."""

    def get(
        self,
        task_id: int,
        *,
        return_response: bool = False,
    ) -> "OpenMLTask | tuple[OpenMLTask, Response]":
        raise NotImplementedError
class APIBackend:
    """Bundle of per-resource API implementations for one API version."""

    def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI):
        self.datasets = datasets
        self.tasks = tasks


# Versions accepted by build_backend / APIContext.set_version.
_SUPPORTED_VERSIONS = ("v1", "v2")


def build_backend(version: str, *, strict: bool) -> APIBackend:
    """Construct the backend serving the requested API version.

    Parameters
    ----------
    version : str
        One of ``"v1"`` or ``"v2"``.
    strict : bool
        Only meaningful for ``"v2"``: when True the pure v2 implementations
        are used; when False the complete v1 backend is served instead,
        because v2 coverage is still partial (a mixed backend based on
        FallbackProxy is planned but not implemented yet).

    Returns
    -------
    APIBackend

    Raises
    ------
    ValueError
        If ``version`` is not a supported API version (previously unknown
        versions were silently mapped to v1/v2).
    """
    if version not in _SUPPORTED_VERSIONS:
        raise ValueError(
            f"Unsupported API version {version!r}; expected one of {_SUPPORTED_VERSIONS}."
        )

    if version == "v2" and strict:
        v2_http = HTTPClient(config=settings.api.v2)
        return APIBackend(
            datasets=DatasetsV2(v2_http),
            tasks=TasksV2(v2_http),
        )

    # "v1", or non-strict "v2" (falls back to v1 until FallbackProxy exists).
    v1_http = HTTPClient(config=settings.api.v1)
    return APIBackend(
        datasets=DatasetsV1(v1_http),
        tasks=TasksV1(v1_http),
    )


class APIContext:
    """Mutable holder for the currently active APIBackend."""

    def __init__(self) -> None:
        # Default to the stable v1 backend.
        self._backend = build_backend("v1", strict=False)

    def set_version(self, version: str, *, strict: bool = False) -> None:
        """Swap the active backend; see build_backend for the semantics."""
        self._backend = build_backend(version=version, strict=strict)

    @property
    def backend(self) -> APIBackend:
        return self._backend


class FallbackProxy:
    """Planned proxy that tries a primary resource API and falls back.

    Currently only stores the two resources; dispatch is not implemented.
    """

    def __init__(self, primary: ResourceAPI, fallback: ResourceAPI):
        self._primary = primary
        self._fallback = fallback
def list_qualities() -> list[str]:
    """Return the list of data-quality names the server can compute.

    Delegates to the datasets resource of the currently active API backend.

    Returns
    -------
    list
    """
    # Imported lazily to avoid a circular import at module load time.
    from openml._api import api_context

    datasets_api = api_context.backend.datasets
    return datasets_api.list_qualities()
def status_update(data_id: int, status: Literal["active", "deactivated"]) -> None:
    """Update the status of a dataset on the server.

    Parameters
    ----------
    data_id : int
    status : str, 'active' or 'deactivated'

    Raises
    ------
    ValueError
        If ``status`` is not one of the legal values.
    """
    # Validate before importing/delegating so bad input fails fast.
    legal_status = {"active", "deactivated"}
    if status not in legal_status:
        raise ValueError(f"Illegal status value. Legal values: {legal_status}")

    from openml._api import api_context

    api_context.backend.datasets.status_update(data_id=data_id, status=status)


def edit_dataset(
    data_id: int,
    description: str | None = None,
    creator: str | None = None,
    contributor: str | None = None,
    collection_date: str | None = None,
    language: str | None = None,
    default_target_attribute: str | None = None,
    ignore_attribute: str | list[str] | None = None,
    citation: str | None = None,
    row_id_attribute: str | None = None,
    original_data_url: str | None = None,
    paper_url: str | None = None,
) -> int:
    """Edit metadata of an existing dataset; unset fields are left unchanged.

    Returns
    -------
    Dataset id

    Raises
    ------
    TypeError
        If ``data_id`` is not an int.
    """
    # Restored: the backend delegation dropped this validation, letting
    # non-int ids flow straight to the server with confusing errors.
    if not isinstance(data_id, int):
        raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")

    from openml._api import api_context

    # NOTE(review): arguments are passed positionally; confirm that
    # backend.datasets.edit declares them in exactly this order (the old
    # XML payload ordered row_id_attribute before ignore_attribute).
    return api_context.backend.datasets.edit(
        data_id,
        description,
        creator,
        contributor,
        collection_date,
        language,
        default_target_attribute,
        ignore_attribute,
        citation,
        row_id_attribute,
        original_data_url,
        paper_url,
    )


def fork_dataset(data_id: int) -> int:
    """Create a fork of the given dataset under the caller's account.

    Returns
    -------
    Dataset id of the forked dataset

    Raises
    ------
    TypeError
        If ``data_id`` is not an int.
    """
    # Restored: type validation removed by the backend-delegation rewrite.
    if not isinstance(data_id, int):
        raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")

    from openml._api import api_context

    return api_context.backend.datasets.fork(data_id=data_id)


def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool:
    """Attach an ontology term to a feature of a dataset.

    Parameters
    ----------
    data_id : int
    index : int
        Index of the feature within the dataset.
    ontology : str
        Identifier/URL of the ontology term.

    Returns
    -------
    True or throws an OpenML server exception
    """
    from openml._api import api_context

    return api_context.backend.datasets.feature_add_ontology(data_id, index, ontology)


def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool:
    """Remove an ontology term from a feature of a dataset.

    Parameters
    ----------
    data_id : int
    index : int
        Index of the feature within the dataset.
    ontology : str
        Identifier/URL of the ontology term.

    Returns
    -------
    True or throws an OpenML server exception
    """
    from openml._api import api_context

    return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology)


def delete_dataset(dataset_id: int) -> bool:
    """Delete the dataset with the given id from the server.

    Returns
    -------
    bool
        True if the deletion was successful. False otherwise.
    """
    from openml._api import api_context

    return api_context.backend.datasets.delete(dataset_id)
response = result + with xml_file.open("w", encoding="utf8") as fh: + fh.write(response.text) + else: + task = result + + return task def _create_task_from_xml(xml: str) -> OpenMLTask: