From 0159f474c6bbc15f20d52bc946bd252bd852b196 Mon Sep 17 00:00:00 2001
From: geetu040 <raoarmaghanshakir040@gmail.com>
Date: Tue, 30 Dec 2025 09:11:27 +0500
Subject: [PATCH 01/10] set up folder structure and base code

---
 openml/_api/__init__.py           |   8 +++
 openml/_api/config.py             |   5 ++
 openml/_api/http/__init__.py      |   1 +
 openml/_api/http/client.py        |  23 ++++++
 openml/_api/http/utils.py         |   0
 openml/_api/resources/__init__.py |   2 +
 openml/_api/resources/base.py     |  22 ++++++
 openml/_api/resources/datasets.py |  13 ++++
 openml/_api/resources/tasks.py    | 113 ++++++++++++++++++++++++++++++
 openml/_api/runtime/core.py       |  58 +++++++++++++++
 openml/_api/runtime/fallback.py   |   5 ++
 openml/tasks/functions.py         |   8 ++-
 12 files changed, 255 insertions(+), 3 deletions(-)
 create mode 100644 openml/_api/__init__.py
 create mode 100644 openml/_api/config.py
 create mode 100644 openml/_api/http/__init__.py
 create mode 100644 openml/_api/http/client.py
 create mode 100644 openml/_api/http/utils.py
 create mode 100644 openml/_api/resources/__init__.py
 create mode 100644 openml/_api/resources/base.py
 create mode 100644 openml/_api/resources/datasets.py
 create mode 100644 openml/_api/resources/tasks.py
 create mode 100644 openml/_api/runtime/core.py
 create mode 100644 openml/_api/runtime/fallback.py

diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py
new file mode 100644
index 000000000..5089f94dd
--- /dev/null
+++ b/openml/_api/__init__.py
@@ -0,0 +1,8 @@
+from openml._api.runtime.core import APIContext
+
+
+def set_api_version(version: str, strict=False):
+    api_context.set_version(version=version, strict=strict)
+
+
+api_context = APIContext()
diff --git a/openml/_api/config.py b/openml/_api/config.py
new file mode 100644
index 000000000..bd93c3cad
--- /dev/null
+++ b/openml/_api/config.py
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+API_V1_SERVER = "https://www.openml.org/api/v1/xml"
+API_V2_SERVER = "http://127.0.0.1:8001"
+API_KEY = "..."
diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py
new file mode 100644
index 000000000..fde2a5b0a
--- /dev/null
+++ b/openml/_api/http/__init__.py
@@ -0,0 +1 @@
+from openml._api.http.client import HTTPClient
diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py
new file mode 100644
index 000000000..81a9213e3
--- /dev/null
+++ b/openml/_api/http/client.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+import requests
+
+from openml.__version__ import __version__
+
+
+class HTTPClient:
+    def __init__(self, base_url: str):
+        self.base_url = base_url
+        self.headers = {"user-agent": f"openml-python/{__version__}"}
+
+    def get(self, path, params=None):
+        url = f"{self.base_url}/{path}"
+        return requests.get(url, params=params, headers=self.headers)
+
+    def post(self, path, data=None, files=None):
+        url = f"{self.base_url}/{path}"
+        return requests.post(url, data=data, files=files, headers=self.headers)
+
+    def delete(self, path, params=None):
+        url = f"{self.base_url}/{path}"
+        return requests.delete(url, params=params, headers=self.headers)
diff --git a/openml/_api/http/utils.py b/openml/_api/http/utils.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py
new file mode 100644
index 000000000..078fc5998
--- /dev/null
+++ b/openml/_api/resources/__init__.py
@@ -0,0 +1,2 @@
+from openml._api.resources.datasets import DatasetsV1, DatasetsV2
+from openml._api.resources.tasks import TasksV1, TasksV2
diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py
new file mode 100644
index 000000000..1fae27665
--- /dev/null
+++ b/openml/_api/resources/base.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from openml._api.http import HTTPClient
+
+
+class ResourceAPI:
+    def __init__(self, http: HTTPClient):
+        self._http = http
+
+
+class DatasetsAPI(ResourceAPI, ABC):
+    @abstractmethod
+    def get(self, id: int) -> dict: ...
+
+
+class TasksAPI(ResourceAPI, ABC):
+    @abstractmethod
+    def get(self, id: int) -> dict: ...
diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py
new file mode 100644
index 000000000..cd1bb595a
--- /dev/null
+++ b/openml/_api/resources/datasets.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from openml._api.resources.base import DatasetsAPI
+
+
+class DatasetsV1(DatasetsAPI):
+    def get(self, id):
+        pass
+
+
+class DatasetsV2(DatasetsAPI):
+    def get(self, id):
+        pass
diff --git a/openml/_api/resources/tasks.py b/openml/_api/resources/tasks.py
new file mode 100644
index 000000000..b0e9afbf8
--- /dev/null
+++ b/openml/_api/resources/tasks.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+import xmltodict
+
+from openml._api.resources.base import TasksAPI
+from openml.tasks.task import (
+    OpenMLClassificationTask,
+    OpenMLClusteringTask,
+    OpenMLLearningCurveTask,
+    OpenMLRegressionTask,
+    OpenMLTask,
+    TaskType,
+)
+
+
+class TasksV1(TasksAPI):
+    def get(self, id, return_response=False):
+        path = f"task/{id}"
+        response = self._http.get(path)
+        xml_content = response.content
+        task = self._create_task_from_xml(xml_content)
+
+        if return_response:
+            return task, response
+
+        return task
+
+    def _create_task_from_xml(self, xml: str) -> OpenMLTask:
+        """Create a task given an XML string.
+
+        Parameters
+        ----------
+        xml : string
+            Task xml representation.
+
+        Returns
+        -------
+        OpenMLTask
+        """
+        dic = xmltodict.parse(xml)["oml:task"]
+        estimation_parameters = {}
+        inputs = {}
+        # Due to the unordered structure we obtain, we first have to extract
+        # the possible keys of oml:input; dic["oml:input"] is a list of
+        # OrderedDicts
+
+        # Check if there is a list of inputs
+        if isinstance(dic["oml:input"], list):
+            for input_ in dic["oml:input"]:
+                name = input_["@name"]
+                inputs[name] = input_
+        # Single input case
+        elif isinstance(dic["oml:input"], dict):
+            name = dic["oml:input"]["@name"]
+            inputs[name] = dic["oml:input"]
+
+        evaluation_measures = None
+        if "evaluation_measures" in inputs:
+            evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
+                "oml:evaluation_measure"
+            ]
+
+        task_type = TaskType(int(dic["oml:task_type_id"]))
+        common_kwargs = {
+            "task_id": dic["oml:task_id"],
+            "task_type": dic["oml:task_type"],
+            "task_type_id": task_type,
+            "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
+            "evaluation_measure": evaluation_measures,
+        }
+        # TODO: add OpenMLClusteringTask?
+        if task_type in (
+            TaskType.SUPERVISED_CLASSIFICATION,
+            TaskType.SUPERVISED_REGRESSION,
+            TaskType.LEARNING_CURVE,
+        ):
+            # Convert some more parameters
+            for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][
+                "oml:parameter"
+            ]:
+                name = parameter["@name"]
+                text = parameter.get("#text", "")
+                estimation_parameters[name] = text
+
+            common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
+                "oml:estimation_procedure"
+            ]["oml:type"]
+            common_kwargs["estimation_procedure_id"] = int(
+                inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
+            )
+
+            common_kwargs["estimation_parameters"] = estimation_parameters
+            common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"][
+                "oml:target_feature"
+            ]
+            common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
+                "oml:estimation_procedure"
+            ]["oml:data_splits_url"]
+
+        cls = {
+            TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+            TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+            TaskType.CLUSTERING: OpenMLClusteringTask,
+            TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+        }.get(task_type)
+        if cls is None:
+            raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.")
+        return cls(**common_kwargs)  # type: ignore
+
+
+class TasksV2(TasksAPI):
+    def get(self, id):
+        pass
diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py
new file mode 100644
index 000000000..80f35587c
--- /dev/null
+++ b/openml/_api/runtime/core.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from openml._api.config import (
+    API_V1_SERVER,
+    API_V2_SERVER,
+)
+from openml._api.http.client import HTTPClient
+from openml._api.resources import (
+    DatasetsV1,
+    DatasetsV2,
+    TasksV1,
+    TasksV2,
+)
+from openml._api.runtime.fallback import FallbackProxy
+
+
+class APIBackend:
+    def __init__(self, *, datasets, tasks):
+        self.datasets = datasets
+        self.tasks = tasks
+
+
+def build_backend(version: str, strict: bool) -> APIBackend:
+    v1_http = HTTPClient(API_V1_SERVER)
+    v2_http = HTTPClient(API_V2_SERVER)
+
+    v1 = APIBackend(
+        datasets=DatasetsV1(v1_http),
+        tasks=TasksV1(v1_http),
+    )
+
+    if version == "v1":
+        return v1
+
+    v2 = APIBackend(
+        datasets=DatasetsV2(v2_http),
+        tasks=TasksV2(v2_http),
+    )
+
+    if strict:
+        return v2
+
+    return APIBackend(
+        datasets=FallbackProxy(v2.datasets, v1.datasets),
+        tasks=FallbackProxy(v2.tasks, v1.tasks),
+    )
+
+
+class APIContext:
+    def __init__(self):
+        self._backend = build_backend("v1", strict=False)
+
+    def set_version(self, version: str, strict: bool = False):
+        self._backend = build_backend(version, strict)
+
+    @property
+    def backend(self):
+        return self._backend
diff --git a/openml/_api/runtime/fallback.py b/openml/_api/runtime/fallback.py
new file mode 100644
index 000000000..56e96a966
--- /dev/null
+++ b/openml/_api/runtime/fallback.py
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+
+class FallbackProxy:
+    pass
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index d2bf5e946..91be65965 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -12,6 +12,7 @@
 
 import openml._api_calls
 import openml.utils
+from openml._api import api_context
 from openml.datasets import get_dataset
 from openml.exceptions import OpenMLCacheException
 
@@ -442,11 +443,12 @@ def _get_task_description(task_id: int) -> OpenMLTask:
     except OpenMLCacheException:
         _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
         xml_file = _cache_dir / "task.xml"
-        task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get")
+        task, response = api_context.backend.tasks.get(task_id, return_response=True)
 
         with xml_file.open("w", encoding="utf8") as fh:
-            fh.write(task_xml)
-        return _create_task_from_xml(task_xml)
+            fh.write(response.text)
+
+        return task
 
 
 def _create_task_from_xml(xml: str) -> OpenMLTask:

From 52ef37999fad8509e5e85b8512e442bd9dc69e04 Mon Sep 17 00:00:00 2001
From: geetu040 <raoarmaghanshakir040@gmail.com>
Date: Mon, 5 Jan 2026 12:48:58 +0500
Subject: [PATCH 02/10] fix pre-commit

---
 openml/_api/__init__.py           |  2 +-
 openml/_api/http/__init__.py      |  2 ++
 openml/_api/http/client.py        | 32 +++++++++++++++++++++++--------
 openml/_api/resources/__init__.py |  2 ++
 openml/_api/resources/base.py     | 13 +++++++++++--
 openml/_api/resources/datasets.py | 15 +++++++++++----
 openml/_api/resources/tasks.py    | 25 +++++++++++++++++++-----
 openml/_api/runtime/__init__.py   |  0
 openml/_api/runtime/core.py       | 23 +++++++++++-----------
 openml/_api/runtime/fallback.py   |  9 ++++++++-
 openml/tasks/functions.py         | 12 ++++++++----
 11 files changed, 99 insertions(+), 36 deletions(-)
 create mode 100644 openml/_api/runtime/__init__.py

diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py
index 5089f94dd..881f40671 100644
--- a/openml/_api/__init__.py
+++ b/openml/_api/__init__.py
@@ -1,7 +1,7 @@
 from openml._api.runtime.core import APIContext
 
 
-def set_api_version(version: str, strict=False):
+def set_api_version(version: str, *, strict: bool = False) -> None:
     api_context.set_version(version=version, strict=strict)
 
 
diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py
index fde2a5b0a..8e6d1e4ce 100644
--- a/openml/_api/http/__init__.py
+++ b/openml/_api/http/__init__.py
@@ -1 +1,3 @@
 from openml._api.http.client import HTTPClient
+
+__all__ = ["HTTPClient"]
diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py
index 81a9213e3..dea5de809 100644
--- a/openml/_api/http/client.py
+++ b/openml/_api/http/client.py
@@ -1,23 +1,39 @@
 from __future__ import annotations
 
+from typing import Any, Mapping
+
 import requests
+from requests import Response
 
 from openml.__version__ import __version__
 
 
 class HTTPClient:
-    def __init__(self, base_url: str):
+    def __init__(self, base_url: str) -> None:
         self.base_url = base_url
-        self.headers = {"user-agent": f"openml-python/{__version__}"}
+        self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}
 
-    def get(self, path, params=None):
+    def get(
+        self,
+        path: str,
+        params: Mapping[str, Any] | None = None,
+    ) -> Response:
         url = f"{self.base_url}/{path}"
-        return requests.get(url, params=params, headers=self.headers)
+        return requests.get(url, params=params, headers=self.headers, timeout=10)
 
-    def post(self, path, data=None, files=None):
+    def post(
+        self,
+        path: str,
+        data: Mapping[str, Any] | None = None,
+        files: Any = None,
+    ) -> Response:
         url = f"{self.base_url}/{path}"
-        return requests.post(url, data=data, files=files, headers=self.headers)
+        return requests.post(url, data=data, files=files, headers=self.headers, timeout=10)
 
-    def delete(self, path, params=None):
+    def delete(
+        self,
+        path: str,
+        params: Mapping[str, Any] | None = None,
+    ) -> Response:
         url = f"{self.base_url}/{path}"
-        return requests.delete(url, params=params, headers=self.headers)
+        return requests.delete(url, params=params, headers=self.headers, timeout=10)
diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py
index 078fc5998..b1af3c1a8 100644
--- a/openml/_api/resources/__init__.py
+++ b/openml/_api/resources/__init__.py
@@ -1,2 +1,4 @@
 from openml._api.resources.datasets import DatasetsV1, DatasetsV2
 from openml._api.resources.tasks import TasksV1, TasksV2
+
+__all__ = ["DatasetsV1", "DatasetsV2", "TasksV1", "TasksV2"]
diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py
index 1fae27665..6fbf8977d 100644
--- a/openml/_api/resources/base.py
+++ b/openml/_api/resources/base.py
@@ -4,7 +4,11 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from requests import Response
+
     from openml._api.http import HTTPClient
+    from openml.datasets.dataset import OpenMLDataset
+    from openml.tasks.task import OpenMLTask
 
 
 class ResourceAPI:
@@ -14,9 +18,14 @@ def __init__(self, http: HTTPClient):
 
 class DatasetsAPI(ResourceAPI, ABC):
     @abstractmethod
-    def get(self, id: int) -> dict: ...
+    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ...
 
 
 class TasksAPI(ResourceAPI, ABC):
     @abstractmethod
-    def get(self, id: int) -> dict: ...
+    def get(
+        self,
+        task_id: int,
+        *,
+        return_response: bool = False,
+    ) -> OpenMLTask | tuple[OpenMLTask, Response]: ...
diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py
index cd1bb595a..9ff1ec278 100644
--- a/openml/_api/resources/datasets.py
+++ b/openml/_api/resources/datasets.py
@@ -1,13 +1,20 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from openml._api.resources.base import DatasetsAPI
 
+if TYPE_CHECKING:
+    from responses import Response
+
+    from openml.datasets.dataset import OpenMLDataset
+
 
 class DatasetsV1(DatasetsAPI):
-    def get(self, id):
-        pass
+    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
+        raise NotImplementedError
 
 
 class DatasetsV2(DatasetsAPI):
-    def get(self, id):
-        pass
+    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
+        raise NotImplementedError
diff --git a/openml/_api/resources/tasks.py b/openml/_api/resources/tasks.py
index b0e9afbf8..f494fb9a3 100644
--- a/openml/_api/resources/tasks.py
+++ b/openml/_api/resources/tasks.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import xmltodict
 
 from openml._api.resources.base import TasksAPI
@@ -12,12 +14,20 @@
     TaskType,
 )
 
+if TYPE_CHECKING:
+    from requests import Response
+
 
 class TasksV1(TasksAPI):
-    def get(self, id, return_response=False):
-        path = f"task/{id}"
+    def get(
+        self,
+        task_id: int,
+        *,
+        return_response: bool = False,
+    ) -> OpenMLTask | tuple[OpenMLTask, Response]:
+        path = f"task/{task_id}"
         response = self._http.get(path)
-        xml_content = response.content
+        xml_content = response.text
         task = self._create_task_from_xml(xml_content)
 
         if return_response:
@@ -109,5 +119,10 @@ def _create_task_from_xml(self, xml: str) -> OpenMLTask:
 
 
 class TasksV2(TasksAPI):
-    def get(self, id):
-        pass
+    def get(
+        self,
+        task_id: int,
+        *,
+        return_response: bool = False,
+    ) -> OpenMLTask | tuple[OpenMLTask, Response]:
+        raise NotImplementedError
diff --git a/openml/_api/runtime/__init__.py b/openml/_api/runtime/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py
index 80f35587c..aa09a69db 100644
--- a/openml/_api/runtime/core.py
+++ b/openml/_api/runtime/core.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from openml._api.config import (
     API_V1_SERVER,
     API_V2_SERVER,
@@ -11,16 +13,18 @@
     TasksV1,
     TasksV2,
 )
-from openml._api.runtime.fallback import FallbackProxy
+
+if TYPE_CHECKING:
+    from openml._api.resources.base import DatasetsAPI, TasksAPI
 
 
 class APIBackend:
-    def __init__(self, *, datasets, tasks):
+    def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI):
         self.datasets = datasets
         self.tasks = tasks
 
 
-def build_backend(version: str, strict: bool) -> APIBackend:
+def build_backend(version: str, *, strict: bool) -> APIBackend:
     v1_http = HTTPClient(API_V1_SERVER)
     v2_http = HTTPClient(API_V2_SERVER)
 
@@ -40,19 +44,16 @@ def build_backend(version: str, strict: bool) -> APIBackend:
     if strict:
         return v2
 
-    return APIBackend(
-        datasets=FallbackProxy(v2.datasets, v1.datasets),
-        tasks=FallbackProxy(v2.tasks, v1.tasks),
-    )
+    return v1
 
 
 class APIContext:
-    def __init__(self):
+    def __init__(self) -> None:
         self._backend = build_backend("v1", strict=False)
 
-    def set_version(self, version: str, strict: bool = False):
-        self._backend = build_backend(version, strict)
+    def set_version(self, version: str, *, strict: bool = False) -> None:
+        self._backend = build_backend(version=version, strict=strict)
 
     @property
-    def backend(self):
+    def backend(self) -> APIBackend:
         return self._backend
diff --git a/openml/_api/runtime/fallback.py b/openml/_api/runtime/fallback.py
index 56e96a966..1bc99d270 100644
--- a/openml/_api/runtime/fallback.py
+++ b/openml/_api/runtime/fallback.py
@@ -1,5 +1,12 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from openml._api.resources.base import ResourceAPI
+
 
 class FallbackProxy:
-    pass
+    def __init__(self, primary: ResourceAPI, fallback: ResourceAPI):
+        self._primary = primary
+        self._fallback = fallback
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index ef67f75bf..a794ad56d 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -445,10 +445,14 @@ def _get_task_description(task_id: int) -> OpenMLTask:
     except OpenMLCacheException:
         _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
         xml_file = _cache_dir / "task.xml"
-        task, response = api_context.backend.tasks.get(task_id, return_response=True)
-
-        with xml_file.open("w", encoding="utf8") as fh:
-            fh.write(response.text)
+        result = api_context.backend.tasks.get(task_id, return_response=True)
+
+        if isinstance(result, tuple):
+            task, response = result
+            with xml_file.open("w", encoding="utf8") as fh:
+                fh.write(response.text)
+        else:
+            task = result
 
         return task
 

From f7ba710a9a3c457ec7c48ec45fa174c9194eeb98 Mon Sep 17 00:00:00 2001
From: JATAYU000 <shrivaths44kunju@gmail.com>
Date: Tue, 6 Jan 2026 16:24:35 +0530
Subject: [PATCH 03/10] Merge base migration pr, ruff

---
 openml/_api/http/client.py        |   5 +-
 openml/_api/resources/base.py     |  70 ++++-
 openml/_api/resources/datasets.py | 440 +++++++++++++++++++++++++++++-
 3 files changed, 504 insertions(+), 11 deletions(-)

diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py
index dea5de809..b0d3c911f 100644
--- a/openml/_api/http/client.py
+++ b/openml/_api/http/client.py
@@ -25,10 +25,13 @@ def post(
         self,
         path: str,
         data: Mapping[str, Any] | None = None,
+        json: dict | None = None,
         files: Any = None,
     ) -> Response:
         url = f"{self.base_url}/{path}"
-        return requests.post(url, data=data, files=files, headers=self.headers, timeout=10)
+        return requests.post(
+            url, data=data, json=json, files=files, headers=self.headers, timeout=10
+        )
 
     def delete(
         self,
diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py
index 6fbf8977d..9d480b06a 100644
--- a/openml/_api/resources/base.py
+++ b/openml/_api/resources/base.py
@@ -4,6 +4,7 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
+    import pandas as pd
     from requests import Response
 
     from openml._api.http import HTTPClient
@@ -18,7 +19,74 @@ def __init__(self, http: HTTPClient):
 
 class DatasetsAPI(ResourceAPI, ABC):
     @abstractmethod
-    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ...
+    def get(
+        self, dataset_id: int, *, return_response: bool
+    ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ...
+
+    @abstractmethod
+    def list(  # noqa: PLR0913
+        self,
+        data_id: list[int] | None = None,
+        offset: int | None = None,
+        size: int | None = None,
+        status: str | None = None,
+        tag: str | None = None,
+        data_name: str | None = None,
+        data_version: int | None = None,
+        number_instances: int | str | None = None,
+        number_features: int | str | None = None,
+        number_classes: int | str | None = None,
+        number_missing_values: int | str | None = None,
+    ) -> pd.DataFrame: ...
+
+    def _name_to_id(
+        self,
+        dataset_name: str,
+        version: int | None = None,
+        error_if_multiple: bool = False,  # noqa: FBT001, FBT002
+    ) -> int:
+        """Attempt to find the dataset id of the dataset with the given name.
+
+        If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``,
+        then return the least recent still active dataset.
+
+        Raises an error if no dataset with the name is found.
+        Raises an error if a version is specified but it could not be found.
+
+        Parameters
+        ----------
+        dataset_name : str
+            The name of the dataset for which to find its id.
+        version : int, optional
+            Version to retrieve. If not specified, the oldest active version is returned.
+        error_if_multiple : bool (default=False)
+            If `False`, if multiple datasets match, return the least recent active dataset.
+            If `True`, if multiple datasets match, raise an error.
+        download_qualities : bool, optional (default=True)
+            If `True`, also download the qualities.xml file. If `False`, skip it.
+
+        Returns
+        -------
+        int
+            The id of the dataset.
+        """
+        status = None if version is not None else "active"
+        candidates = self.list(
+            data_name=dataset_name,
+            status=status,
+            data_version=version,
+        )
+        if error_if_multiple and len(candidates) > 1:
+            msg = f"Multiple active datasets exist with name '{dataset_name}'."
+            raise ValueError(msg)
+
+        if candidates.empty:
+            no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
+            and_version = f" and version '{version}'." if version is not None else "."
+            raise RuntimeError(no_dataset_for_name + and_version)
+
+        # Dataset ids are chronological so we can just sort based on ids (instead of version)
+        return candidates["did"].min()  # type: ignore
 
 
 class TasksAPI(ResourceAPI, ABC):
diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py
index 9ff1ec278..f985cd75a 100644
--- a/openml/_api/resources/datasets.py
+++ b/openml/_api/resources/datasets.py
@@ -1,20 +1,442 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
-
-from openml._api.resources.base import DatasetsAPI
+from functools import partial
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
-    from responses import Response
+    from requests import Response
 
-    from openml.datasets.dataset import OpenMLDataset
+import pandas as pd
+import xmltodict
+
+import openml.utils
+from openml._api.resources.base import DatasetsAPI
+from openml.datasets.dataset import OpenMLDataset
 
 
 class DatasetsV1(DatasetsAPI):
-    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
-        raise NotImplementedError
+    def get(
+        self, dataset_id: int, *, return_response: bool = False
+    ) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
+        path = f"data/{dataset_id}"
+        response = self._http.get(path)
+        xml_content = response.text  # .text returns str, .content returns bytes
+        dataset = self._create_dataset_from_xml(xml_content)
+
+        if return_response:
+            return dataset, response
+
+        return dataset
+
+    def list(  # noqa: PLR0913
+        self,
+        data_id: list[int] | None = None,
+        offset: int | None = None,
+        size: int | None = None,
+        status: str | None = None,
+        tag: str | None = None,
+        data_name: str | None = None,
+        data_version: int | None = None,
+        number_instances: int | str | None = None,
+        number_features: int | str | None = None,
+        number_classes: int | str | None = None,
+        number_missing_values: int | str | None = None,
+    ) -> pd.DataFrame:
+        """Return a dataframe of all datasets which are on OpenML.
+
+        Supports a large number of results.
+
+        Parameters
+        ----------
+        data_id : list, optional
+            A list of data ids, to specify which datasets should be
+            listed
+        offset : int, optional
+            The number of datasets to skip, starting from the first.
+        size : int, optional
+            The maximum number of datasets to show.
+        status : str, optional
+            Should be {active, in_preparation, deactivated}. By
+            default active datasets are returned, but also datasets
+            from another status can be requested.
+        tag : str, optional
+        data_name : str, optional
+        data_version : int, optional
+        number_instances : int | str, optional
+        number_features : int | str, optional
+        number_classes : int | str, optional
+        number_missing_values : int | str, optional
+
+        Returns
+        -------
+        datasets: dataframe
+            Each row maps to a dataset
+            Each column contains the following information:
+            - dataset id
+            - name
+            - format
+            - status
+            If qualities are calculated for the dataset, some of
+            these are also included as columns.
+        """
+        listing_call = partial(
+            self._list_datasets,
+            data_id=data_id,
+            status=status,
+            tag=tag,
+            data_name=data_name,
+            data_version=data_version,
+            number_instances=number_instances,
+            number_features=number_features,
+            number_classes=number_classes,
+            number_missing_values=number_missing_values,
+        )
+        batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+        if len(batches) == 0:
+            return pd.DataFrame()
+
+        return pd.concat(batches)
+
+    def _list_datasets(
+        self,
+        limit: int,
+        offset: int,
+        *,
+        data_id: list[int] | None = None,  # type: ignore
+        **kwargs: Any,
+    ) -> pd.DataFrame:
+        """
+        Perform api call to return a list of all datasets.
+
+        Parameters
+        ----------
+        The arguments that are lists are separated from the single value
+        ones which are put into the kwargs: ``data_id`` is an explicit
+        keyword-only argument, while every other filter is passed through
+        ``kwargs``.
+
+        limit : int
+            The maximum number of datasets to show.
+        offset : int
+            The number of datasets to skip, starting from the first.
+        data_id : list, optional
+
+        kwargs : dict, optional
+            Legal filter operators (keys in the dict):
+            tag, status, limit, offset, data_name, data_version, number_instances,
+            number_features, number_classes, number_missing_values.
+
+        Returns
+        -------
+        datasets : dataframe
+        """
+        api_call = "data/list"
+
+        if limit is not None:
+            api_call += f"/limit/{limit}"
+        if offset is not None:
+            api_call += f"/offset/{offset}"
+
+        if kwargs is not None:
+            for operator, value in kwargs.items():
+                if value is not None:
+                    api_call += f"/{operator}/{value}"
+        if data_id is not None:
+            api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}"
+        return self.__list_datasets(api_call=api_call)
+
+    def __list_datasets(self, api_call: str) -> pd.DataFrame:
+        xml_string = self._http.get(api_call).text
+        datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
+
+        # Minimal sanity check that the XML has the expected structure
+        assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type(
+            datasets_dict["oml:data"],
+        )
+        assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[
+            "oml:data"
+        ]["@xmlns:oml"]
+
+        datasets = {}
+        for dataset_ in datasets_dict["oml:data"]["oml:dataset"]:
+            ignore_attribute = ["oml:file_id", "oml:quality"]
+            dataset = {
+                k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute
+            }
+            dataset["did"] = int(dataset["did"])
+            dataset["version"] = int(dataset["version"])
+
+            # The number of qualities can range from 0 to infinity
+            for quality in dataset_.get("oml:quality", []):
+                try:
+                    dataset[quality["@name"]] = int(quality["#text"])
+                except ValueError:
+                    dataset[quality["@name"]] = float(quality["#text"])
+            datasets[dataset["did"]] = dataset
+
+        return pd.DataFrame.from_dict(datasets, orient="index").astype(
+            {
+                "did": int,
+                "version": int,
+                "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]),
+            }
+        )
+
+    def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset:
+        """Create a dataset from an XML string.
+
+        Parameters
+        ----------
+        xml : str
+            Dataset XML representation.
+
+        Returns
+        -------
+        OpenMLDataset
+        """
+        description = xmltodict.parse(xml)["oml:data_set_description"]
+
+        # TODO file path after download, cache_format default = 'pickle'
+        arff_file = None
+        features_file = None
+        parquet_file = None
+        qualities_file = None
+
+        return OpenMLDataset(
+            description["oml:name"],
+            description.get("oml:description"),
+            data_format=description["oml:format"],
+            dataset_id=int(description["oml:id"]),
+            version=int(description["oml:version"]),
+            creator=description.get("oml:creator"),
+            contributor=description.get("oml:contributor"),
+            collection_date=description.get("oml:collection_date"),
+            upload_date=description.get("oml:upload_date"),
+            language=description.get("oml:language"),
+            licence=description.get("oml:licence"),
+            url=description["oml:url"],
+            default_target_attribute=description.get("oml:default_target_attribute"),
+            row_id_attribute=description.get("oml:row_id_attribute"),
+            ignore_attribute=description.get("oml:ignore_attribute"),
+            version_label=description.get("oml:version_label"),
+            citation=description.get("oml:citation"),
+            tag=description.get("oml:tag"),
+            visibility=description.get("oml:visibility"),
+            original_data_url=description.get("oml:original_data_url"),
+            paper_url=description.get("oml:paper_url"),
+            update_comment=description.get("oml:update_comment"),
+            md5_checksum=description.get("oml:md5_checksum"),
+            data_file=str(arff_file) if arff_file is not None else None,
+            features_file=str(features_file) if features_file is not None else None,
+            qualities_file=str(qualities_file) if qualities_file is not None else None,
+            parquet_url=description.get("oml:parquet_url"),
+            parquet_file=str(parquet_file) if parquet_file is not None else None,
+        )
 
 
 class DatasetsV2(DatasetsAPI):
-    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
-        raise NotImplementedError
+    def get(
+        self, dataset_id: int, *, return_response: bool = False
+    ) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
+        path = f"datasets/{dataset_id}"
+        response = self._http.get(path)
+        json_content = response.json()
+        dataset = self._create_dataset_from_json(json_content)
+
+        if return_response:
+            return dataset, response
+
+        return dataset
+
+    def list(  # noqa: PLR0913
+        self,
+        data_id: list[int] | None = None,
+        offset: int | None = None,
+        size: int | None = None,
+        status: str | None = None,
+        tag: str | None = None,
+        data_name: str | None = None,
+        data_version: int | None = None,
+        number_instances: int | str | None = None,
+        number_features: int | str | None = None,
+        number_classes: int | str | None = None,
+        number_missing_values: int | str | None = None,
+    ) -> pd.DataFrame:
+        """Return a dataframe of all datasets which are on OpenML.
+
+        Supports a large number of results.
+
+        Parameters
+        ----------
+        data_id : list, optional
+            A list of data ids, to specify which datasets should be
+            listed
+        offset : int, optional
+            The number of datasets to skip, starting from the first.
+        size : int, optional
+            The maximum number of datasets to show.
+        status : str, optional
+            Should be {active, in_preparation, deactivated}. By
+            default active datasets are returned, but also datasets
+            from another status can be requested.
+        tag : str, optional
+        data_name : str, optional
+        data_version : int, optional
+        number_instances : int | str, optional
+        number_features : int | str, optional
+        number_classes : int | str, optional
+        number_missing_values : int | str, optional
+
+        Returns
+        -------
+        datasets: dataframe
+            Each row maps to a dataset
+            Each column contains the following information:
+            - dataset id
+            - name
+            - format
+            - status
+            If qualities are calculated for the dataset, some of
+            these are also included as columns.
+        """
+        listing_call = partial(
+            self._list_datasets,
+            data_id=data_id,
+            status=status,
+            tag=tag,
+            data_name=data_name,
+            data_version=data_version,
+            number_instances=number_instances,
+            number_features=number_features,
+            number_classes=number_classes,
+            number_missing_values=number_missing_values,
+        )
+        batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+        if len(batches) == 0:
+            return pd.DataFrame()
+
+        return pd.concat(batches)
+
+    def _list_datasets(
+        self,
+        limit: int,
+        offset: int,
+        **kwargs: Any,
+    ) -> pd.DataFrame:
+        """
+        Perform api call to return a list of all datasets.
+
+        Parameters
+        ----------
+        All filters are single values or lists that are put into the
+        kwargs; unlike the v1 implementation, ``data_id`` is not a
+        separate argument here and is passed through ``kwargs`` as well.
+        Filters whose value is ``None`` are left out of the request body.
+
+        limit : int
+            The maximum number of datasets to show.
+        offset : int
+            The number of datasets to skip, starting from the first.
+        data_id : list, optional
+
+        kwargs : dict, optional
+            Legal filter operators (keys in the dict):
+            tag, status, limit, offset, data_name, data_version, number_instances,
+            number_features, number_classes, number_missing_values, data_id.
+
+        Returns
+        -------
+        datasets : dataframe
+        """
+        json: dict[str, Any] = {"pagination": {}}
+
+        if limit is not None:
+            json["pagination"]["limit"] = limit
+        if offset is not None:
+            json["pagination"]["offset"] = offset
+
+        if kwargs is not None:
+            for operator, value in kwargs.items():
+                if value is not None:
+                    json[operator] = value
+
+        return self.__list_datasets(json=json)
+
+    def __list_datasets(self, json: dict) -> pd.DataFrame:
+        api_call = "datasets/list"
+        datasets_list = self._http.post(api_call, json=json).json()
+
+        # Minimal sanity check that the JSON has the expected structure
+        assert isinstance(datasets_list, list), type(datasets_list)
+
+        datasets = {}
+        for dataset_ in datasets_list:
+            ignore_attribute = ["file_id", "quality"]
+            dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute}
+            dataset["did"] = int(dataset["did"])
+            dataset["version"] = int(dataset["version"])
+
+            # The number of qualities can range from 0 to infinity
+            for quality in dataset_.get("quality", []):
+                try:
+                    dataset[quality["name"]] = int(quality["text"])
+                except ValueError:
+                    dataset[quality["name"]] = float(quality["text"])
+            datasets[dataset["did"]] = dataset
+
+        return pd.DataFrame.from_dict(datasets, orient="index").astype(
+            {
+                "did": int,
+                "version": int,
+                "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]),
+            }
+        )
+
+    def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset:
+        """Create a dataset from a parsed JSON object.
+
+        Parameters
+        ----------
+        json_content : dict
+            Dataset dict/JSON representation.
+
+        Returns
+        -------
+        OpenMLDataset
+        """
+        # TODO file path after download, cache_format default = 'pickle'
+        arff_file = None
+        features_file = None
+        parquet_file = None
+        qualities_file = None
+
+        return OpenMLDataset(
+            json_content["name"],
+            json_content.get("description"),
+            data_format=json_content["format"],
+            dataset_id=int(json_content["id"]),
+            version=int(json_content["version"]),
+            creator=json_content.get("creator"),
+            contributor=json_content.get("contributor"),
+            collection_date=json_content.get("collection_date"),
+            upload_date=json_content.get("upload_date"),
+            language=json_content.get("language"),
+            licence=json_content.get("licence"),
+            url=json_content["url"],
+            default_target_attribute=json_content.get("default_target_attribute"),
+            row_id_attribute=json_content.get("row_id_attribute"),
+            ignore_attribute=json_content.get("ignore_attribute"),
+            version_label=json_content.get("version_label"),
+            citation=json_content.get("citation"),
+            tag=json_content.get("tag"),
+            visibility=json_content.get("visibility"),
+            original_data_url=json_content.get("original_data_url"),
+            paper_url=json_content.get("paper_url"),
+            update_comment=json_content.get("update_comment"),
+            md5_checksum=json_content.get("md5_checksum"),
+            data_file=str(arff_file) if arff_file is not None else None,
+            features_file=str(features_file) if features_file is not None else None,
+            qualities_file=str(qualities_file) if qualities_file is not None else None,
+            parquet_url=json_content.get("parquet_url"),
+            parquet_file=str(parquet_file) if parquet_file is not None else None,
+        )

From 5dfcbce55a027d19cd502ea7bb3d521c2b1bca29 Mon Sep 17 00:00:00 2001
From: geetu040 <raoarmaghanshakir040@gmail.com>
Date: Wed, 7 Jan 2026 22:14:31 +0500
Subject: [PATCH 04/10] refactor

---
 openml/_api/config.py       | 62 +++++++++++++++++++++++++++++++++++--
 openml/_api/http/client.py  | 18 +++++++----
 openml/_api/runtime/core.py |  9 ++----
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/openml/_api/config.py b/openml/_api/config.py
index bd93c3cad..1431f66b1 100644
--- a/openml/_api/config.py
+++ b/openml/_api/config.py
@@ -1,5 +1,61 @@
 from __future__ import annotations
 
-API_V1_SERVER = "https://www.openml.org/api/v1/xml"
-API_V2_SERVER = "http://127.0.0.1:8001"
-API_KEY = "..."
+from dataclasses import dataclass
+from typing import Literal
+
+DelayMethod = Literal["human", "robot"]
+
+
+@dataclass
+class APIConfig:
+    server: str
+    base_url: str
+    key: str
+
+
+@dataclass
+class APISettings:
+    v1: APIConfig
+    v2: APIConfig
+
+
+@dataclass
+class ConnectionConfig:
+    retries: int = 3
+    delay_method: DelayMethod = "human"
+    delay_time: int = 1  # seconds
+
+    def __post_init__(self) -> None:
+        if self.delay_method not in ("human", "robot"):
+            raise ValueError(f"delay_method must be 'human' or 'robot', got {self.delay_method}")
+
+
+@dataclass
+class CacheConfig:
+    dir: str = "~/.openml/cache"
+    ttl: int = 60 * 60 * 24 * 7  # one week
+
+
+@dataclass
+class Settings:
+    api: APISettings
+    connection: ConnectionConfig
+    cache: CacheConfig
+
+
+settings = Settings(
+    api=APISettings(
+        v1=APIConfig(
+            server="https://www.openml.org/",
+            base_url="api/v1/xml/",
+            key="...",
+        ),
+        v2=APIConfig(
+            server="http://127.0.0.1:8001/",
+            base_url="",
+            key="...",
+        ),
+    ),
+    connection=ConnectionConfig(),
+    cache=CacheConfig(),
+)
diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py
index dea5de809..74e08c709 100644
--- a/openml/_api/http/client.py
+++ b/openml/_api/http/client.py
@@ -1,24 +1,30 @@
 from __future__ import annotations
 
-from typing import Any, Mapping
+from typing import TYPE_CHECKING, Any, Mapping
 
 import requests
 from requests import Response
 
 from openml.__version__ import __version__
 
+if TYPE_CHECKING:
+    from openml._api.config import APIConfig
+
 
 class HTTPClient:
-    def __init__(self, base_url: str) -> None:
-        self.base_url = base_url
+    def __init__(self, config: APIConfig) -> None:
+        self.config = config
         self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}
 
+    def _create_url(self, path: str) -> str:
+        return self.config.server + self.config.base_url + path
+
     def get(
         self,
         path: str,
         params: Mapping[str, Any] | None = None,
     ) -> Response:
-        url = f"{self.base_url}/{path}"
+        url = self._create_url(path)
         return requests.get(url, params=params, headers=self.headers, timeout=10)
 
     def post(
@@ -27,7 +33,7 @@ def post(
         data: Mapping[str, Any] | None = None,
         files: Any = None,
     ) -> Response:
-        url = f"{self.base_url}/{path}"
+        url = self._create_url(path)
         return requests.post(url, data=data, files=files, headers=self.headers, timeout=10)
 
     def delete(
@@ -35,5 +41,5 @@ def delete(
         path: str,
         params: Mapping[str, Any] | None = None,
     ) -> Response:
-        url = f"{self.base_url}/{path}"
+        url = self._create_url(path)
         return requests.delete(url, params=params, headers=self.headers, timeout=10)
diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py
index aa09a69db..98b587411 100644
--- a/openml/_api/runtime/core.py
+++ b/openml/_api/runtime/core.py
@@ -2,10 +2,7 @@
 
 from typing import TYPE_CHECKING
 
-from openml._api.config import (
-    API_V1_SERVER,
-    API_V2_SERVER,
-)
+from openml._api.config import settings
 from openml._api.http.client import HTTPClient
 from openml._api.resources import (
     DatasetsV1,
@@ -25,8 +22,8 @@ def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI):
 
 
 def build_backend(version: str, *, strict: bool) -> APIBackend:
-    v1_http = HTTPClient(API_V1_SERVER)
-    v2_http = HTTPClient(API_V2_SERVER)
+    v1_http = HTTPClient(config=settings.api.v1)
+    v2_http = HTTPClient(config=settings.api.v2)
 
     v1 = APIBackend(
         datasets=DatasetsV1(v1_http),

From 2acbe9992cf95bfc103ff4fa0c360a58c1842870 Mon Sep 17 00:00:00 2001
From: geetu040 <raoarmaghanshakir040@gmail.com>
Date: Wed, 7 Jan 2026 22:24:03 +0500
Subject: [PATCH 05/10] implement cache_dir

---
 openml/_api/http/client.py | 74 +++++++++++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 8 deletions(-)

diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py
index 74e08c709..49b05c88e 100644
--- a/openml/_api/http/client.py
+++ b/openml/_api/http/client.py
@@ -1,36 +1,93 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Mapping
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+from urllib.parse import urlencode, urljoin, urlparse
 
 import requests
 from requests import Response
 
 from openml.__version__ import __version__
+from openml._api.config import settings
 
 if TYPE_CHECKING:
     from openml._api.config import APIConfig
 
 
-class HTTPClient:
+class CacheMixin:
+    @property
+    def dir(self) -> str:
+        return settings.cache.dir
+
+    @property
+    def ttl(self) -> int:
+        return settings.cache.ttl
+
+    def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path:
+        parsed_url = urlparse(url)
+        netloc_parts = parsed_url.netloc.split(".")[::-1]  # reverse domain
+        path_parts = parsed_url.path.strip("/").split("/")
+
+        # remove api_key and serialize params if any
+        filtered_params = {k: v for k, v in params.items() if k != "api_key"}
+        params_part = [urlencode(filtered_params)] if filtered_params else []
+
+        return Path(self.dir).joinpath(*netloc_parts, *path_parts, *params_part)
+
+    def _get_cache_response(self, url: str, params: dict[str, Any]) -> Response | None:  # noqa: ARG002
+        return None
+
+    def _set_cache_response(self, url: str, params: dict[str, Any], response: Response) -> None:  # noqa: ARG002
+        return None
+
+
+class HTTPClient(CacheMixin):
     def __init__(self, config: APIConfig) -> None:
         self.config = config
         self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}
 
-    def _create_url(self, path: str) -> str:
-        return self.config.server + self.config.base_url + path
+    @property
+    def server(self) -> str:
+        return self.config.server
+
+    @property
+    def base_url(self) -> str:
+        return self.config.base_url
+
+    def _create_url(self, path: str) -> Any:
+        return urljoin(self.server, urljoin(self.base_url, path))
 
     def get(
         self,
         path: str,
-        params: Mapping[str, Any] | None = None,
+        *,
+        params: dict[str, Any] | None = None,
+        use_cache: bool = False,
+        use_api_key: bool = False,
     ) -> Response:
         url = self._create_url(path)
-        return requests.get(url, params=params, headers=self.headers, timeout=10)
+        params = dict(params) if params is not None else {}
+
+        if use_api_key:
+            params["api_key"] = self.config.key
+
+        if use_cache:
+            response = self._get_cache_response(url, params)
+            if response:
+                return response
+
+        response = requests.get(url, params=params, headers=self.headers, timeout=10)
+
+        if use_cache:
+            self._set_cache_response(url, params, response)
+
+        return response
 
     def post(
         self,
         path: str,
-        data: Mapping[str, Any] | None = None,
+        *,
+        data: dict[str, Any] | None = None,
         files: Any = None,
     ) -> Response:
         url = self._create_url(path)
@@ -39,7 +96,8 @@ def post(
     def delete(
         self,
         path: str,
-        params: Mapping[str, Any] | None = None,
+        *,
+        params: dict[str, Any] | None = None,
     ) -> Response:
         url = self._create_url(path)
         return requests.delete(url, params=params, headers=self.headers, timeout=10)

From af99880a9e16a49833c63084c9e9267c112b6b91 Mon Sep 17 00:00:00 2001
From: geetu040 <raoarmaghanshakir040@gmail.com>
Date: Wed, 7 Jan 2026 23:42:17 +0500
Subject: [PATCH 06/10] refactor

---
 openml/_api/config.py      |   1 +
 openml/_api/http/client.py | 100 +++++++++++++++++++++++++++----------
 2 files changed, 75 insertions(+), 26 deletions(-)

diff --git a/openml/_api/config.py b/openml/_api/config.py
index 1431f66b1..848fe8da1 100644
--- a/openml/_api/config.py
+++ b/openml/_api/config.py
@@ -11,6 +11,7 @@ class APIConfig:
     server: str
     base_url: str
     key: str
+    timeout: int = 10  # seconds
 
 
 @dataclass
diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py
index 49b05c88e..a90e93933 100644
--- a/openml/_api/http/client.py
+++ b/openml/_api/http/client.py
@@ -23,7 +23,7 @@ def dir(self) -> str:
     def ttl(self) -> int:
         return settings.cache.ttl
 
-    def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path:
+    def _get_cache_dir(self, url: str, params: dict[str, Any]) -> Path:
         parsed_url = urlparse(url)
         netloc_parts = parsed_url.netloc.split(".")[::-1]  # reverse domain
         path_parts = parsed_url.path.strip("/").split("/")
@@ -34,10 +34,10 @@ def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path:
 
         return Path(self.dir).joinpath(*netloc_parts, *path_parts, *params_part)
 
-    def _get_cache_response(self, url: str, params: dict[str, Any]) -> Response | None:  # noqa: ARG002
-        return None
+    def _get_cache_response(self, cache_dir: Path) -> Response:  # noqa: ARG002
+        return Response()
 
-    def _set_cache_response(self, url: str, params: dict[str, Any], response: Response) -> None:  # noqa: ARG002
+    def _set_cache_response(self, cache_dir: Path, response: Response) -> None:  # noqa: ARG002
         return None
 
 
@@ -54,50 +54,98 @@ def server(self) -> str:
     def base_url(self) -> str:
         return self.config.base_url
 
-    def _create_url(self, path: str) -> Any:
-        return urljoin(self.server, urljoin(self.base_url, path))
+    @property
+    def key(self) -> str:
+        return self.config.key
 
-    def get(
+    @property
+    def timeout(self) -> int:
+        return self.config.timeout
+
+    def request(
         self,
+        method: str,
         path: str,
         *,
-        params: dict[str, Any] | None = None,
         use_cache: bool = False,
         use_api_key: bool = False,
+        **request_kwargs: Any,
     ) -> Response:
-        url = self._create_url(path)
-        params = dict(params) if params is not None else {}
+        url = urljoin(self.server, urljoin(self.base_url, path))
 
+        params = request_kwargs.pop("params", {})
+        params = params.copy()
         if use_api_key:
-            params["api_key"] = self.config.key
+            params["api_key"] = self.key
 
-        if use_cache:
-            response = self._get_cache_response(url, params)
-            if response:
-                return response
+        headers = request_kwargs.pop("headers", {})
+        headers = headers.copy()
+        headers.update(self.headers)
+
+        timeout = request_kwargs.pop("timeout", self.timeout)
+        cache_dir = self._get_cache_dir(url, params)
 
-        response = requests.get(url, params=params, headers=self.headers, timeout=10)
+        if use_cache:
+            try:
+                return self._get_cache_response(cache_dir)
+            # TODO: handle ttl expired error
+            except Exception:
+                raise
+
+        response = requests.request(
+            method=method,
+            url=url,
+            params=params,
+            headers=headers,
+            timeout=timeout,
+            **request_kwargs,
+        )
 
         if use_cache:
-            self._set_cache_response(url, params, response)
+            self._set_cache_response(cache_dir, response)
 
         return response
 
-    def post(
+    def get(
         self,
         path: str,
         *,
-        data: dict[str, Any] | None = None,
-        files: Any = None,
+        use_cache: bool = False,
+        use_api_key: bool = False,
+        **request_kwargs: Any,
     ) -> Response:
-        url = self._create_url(path)
-        return requests.post(url, data=data, files=files, headers=self.headers, timeout=10)
+        # TODO: remove override when cache is implemented
+        use_cache = False
+        return self.request(
+            method="GET",
+            path=path,
+            use_cache=use_cache,
+            use_api_key=use_api_key,
+            **request_kwargs,
+        )
+
+    def post(
+        self,
+        path: str,
+        **request_kwargs: Any,
+    ) -> Response:
+        return self.request(
+            method="POST",
+            path=path,
+            use_cache=False,
+            use_api_key=True,
+            **request_kwargs,
+        )
 
     def delete(
         self,
         path: str,
-        *,
-        params: dict[str, Any] | None = None,
+        **request_kwargs: Any,
     ) -> Response:
-        url = self._create_url(path)
-        return requests.delete(url, params=params, headers=self.headers, timeout=10)
+        return self.request(
+            method="DELETE",
+            path=path,
+            use_cache=False,
+            use_api_key=True,
+            **request_kwargs,
+        )

From 8964517d5fa9b656dc1473adfc09e9a56c524073 Mon Sep 17 00:00:00 2001
From: JATAYU000 <shrivaths44kunju@gmail.com>
Date: Thu, 8 Jan 2026 09:14:49 +0530
Subject: [PATCH 07/10] edit, fork, delete updated

---
 openml/_api/resources/base.py     |  36 +++-
 openml/_api/resources/datasets.py | 278 +++++++++++++++++++++++++++++-
 2 files changed, 304 insertions(+), 10 deletions(-)

diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py
index 9d480b06a..5a74239d1 100644
--- a/openml/_api/resources/base.py
+++ b/openml/_api/resources/base.py
@@ -2,6 +2,7 @@
 
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
+from typing_extensions import Literal
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -20,13 +21,18 @@ def __init__(self, http: HTTPClient):
 class DatasetsAPI(ResourceAPI, ABC):
     @abstractmethod
     def get(
-        self, dataset_id: int, *, return_response: bool
+        self,
+        dataset_id: int | str,
+        version: int | None = None,
+        error_if_multiple: bool = False,  # noqa: FBT002, FBT001
+        *,
+        return_response: bool = False,
     ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ...
 
     @abstractmethod
     def list(  # noqa: PLR0913
         self,
-        data_id: list[int] | None = None,
+        data_id: list[int] | None = None,  # type: ignore
         offset: int | None = None,
         size: int | None = None,
         status: str | None = None,
@@ -39,6 +45,32 @@ def list(  # noqa: PLR0913
         number_missing_values: int | str | None = None,
     ) -> pd.DataFrame: ...
 
+    @abstractmethod
+    def delete(self, dataset_id: int) -> bool: ...
+
+    @abstractmethod
+    def edit(  # noqa: PLR0913
+        self,
+        data_id: int,
+        description: str | None = None,
+        creator: str | None = None,
+        contributor: str | None = None,
+        collection_date: str | None = None,
+        language: str | None = None,
+        default_target_attribute: str | None = None,
+        ignore_attribute: str | list[str] | None = None,  # type: ignore
+        citation: str | None = None,
+        row_id_attribute: str | None = None,
+        original_data_url: str | None = None,
+        paper_url: str | None = None,
+    ) -> int: ...
+
+    @abstractmethod
+    def fork(self, data_id: int) -> int: ...
+
+    @abstractmethod
+    def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ...
+
     def _name_to_id(
         self,
         dataset_name: str,
diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py
index f985cd75a..5414fba43 100644
--- a/openml/_api/resources/datasets.py
+++ b/openml/_api/resources/datasets.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+from collections import OrderedDict
 from functools import partial
 from typing import TYPE_CHECKING, Any
+from typing_extensions import Literal
 
 if TYPE_CHECKING:
     from requests import Response
@@ -16,11 +18,23 @@
 
 class DatasetsV1(DatasetsAPI):
     def get(
-        self, dataset_id: int, *, return_response: bool = False
+        self,
+        dataset_id: int | str,
+        version: int | None = None,
+        error_if_multiple: bool = False,  # noqa: FBT002, FBT001
+        *,
+        return_response: bool = False,
     ) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
-        path = f"data/{dataset_id}"
+        if isinstance(dataset_id, int):
+            resolved_id = dataset_id
+        elif dataset_id.isdigit():
+            resolved_id = int(dataset_id)
+        else:
+            resolved_id = self._name_to_id(dataset_id, version, error_if_multiple)
+
+        path = f"data/{resolved_id}"
         response = self._http.get(path)
-        xml_content = response.text  # .text returns str, .content returns bytes
+        xml_content = response.text
         dataset = self._create_dataset_from_xml(xml_content)
 
         if return_response:
@@ -97,6 +111,194 @@ def list(  # noqa: PLR0913
 
         return pd.concat(batches)
 
+    def delete(self, dataset_id: int) -> bool:
+        """Delete dataset with id `dataset_id` from the OpenML server.
+
+        This can only be done if you are the owner of the dataset and
+        no tasks are attached to the dataset.
+
+        Parameters
+        ----------
+        dataset_id : int
+            OpenML id of the dataset
+
+        Returns
+        -------
+        bool
+            True if the deletion was successful. False otherwise.
+        """
+        return openml.utils._delete_entity("data", dataset_id)
+
+    def edit(  # noqa: PLR0913
+        self,
+        data_id: int,
+        description: str | None = None,
+        creator: str | None = None,
+        contributor: str | None = None,
+        collection_date: str | None = None,
+        language: str | None = None,
+        default_target_attribute: str | None = None,
+        ignore_attribute: str | list[str] | None = None,  # type: ignore
+        citation: str | None = None,
+        row_id_attribute: str | None = None,
+        original_data_url: str | None = None,
+        paper_url: str | None = None,
+    ) -> int:
+        """Edits an OpenMLDataset.
+
+        In addition to providing the dataset id of the dataset to edit (through data_id),
+        you must specify a value for at least one of the optional function arguments,
+        i.e. one value for a field to edit.
+
+        This function allows editing of both non-critical and critical fields.
+        Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.
+
+        - Editing non-critical data fields is allowed for all authenticated users.
+        - Editing critical fields is allowed only for the owner, provided there are no tasks
+        associated with this dataset.
+
+        If dataset has tasks or if the user is not the owner, the only way
+        to edit critical fields is to use fork_dataset followed by edit_dataset.
+
+        Parameters
+        ----------
+        data_id : int
+            ID of the dataset.
+        description : str
+            Description of the dataset.
+        creator : str
+            The person who created the dataset.
+        contributor : str
+            People who contributed to the current version of the dataset.
+        collection_date : str
+            The date the data was originally collected, given by the uploader.
+        language : str
+            Language in which the data is represented.
+            Starts with 1 upper case letter, rest lower case, e.g. 'English'.
+        default_target_attribute : str
+            The default target attribute, if it exists.
+            Can have multiple values, comma separated.
+        ignore_attribute : str | list
+            Attributes that should be excluded in modelling,
+            such as identifiers and indexes.
+        citation : str
+            Reference(s) that should be cited when building on this data.
+        row_id_attribute : str, optional
+            The attribute that represents the row-id column, if present in the
+            dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
+            specified, the index of the dataframe will be used as the
+            ``row_id_attribute``. If the name of the index is ``None``, it will
+            be discarded.
+
+            .. versionadded:: 0.8
+                Inference of ``row_id_attribute`` from a dataframe.
+        original_data_url : str, optional
+            For derived data, the url to the original dataset.
+        paper_url : str, optional
+            Link to a paper describing the dataset.
+
+        Returns
+        -------
+        Dataset id
+        """
+        if not isinstance(data_id, int):
+            raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
+
+        # compose data edit parameters as xml
+        form_data = {"data_id": data_id}  # type: openml._api_calls.DATA_TYPE
+        xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
+        xml["oml:data_edit_parameters"] = OrderedDict()
+        xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
+        xml["oml:data_edit_parameters"]["oml:description"] = description
+        xml["oml:data_edit_parameters"]["oml:creator"] = creator
+        xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
+        xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
+        xml["oml:data_edit_parameters"]["oml:language"] = language
+        xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute
+        xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute
+        xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute
+        xml["oml:data_edit_parameters"]["oml:citation"] = citation
+        xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
+        xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url
+
+        # delete None inputs
+        for k in list(xml["oml:data_edit_parameters"]):
+            if not xml["oml:data_edit_parameters"][k]:
+                del xml["oml:data_edit_parameters"][k]
+
+        file_elements = {
+            "edit_parameters": ("description.xml", xmltodict.unparse(xml)),
+        }  # type: openml._api_calls.FILE_ELEMENTS_TYPE
+        result_xml = self._http.post("data/edit", data=form_data, files=file_elements).text
+        result = xmltodict.parse(result_xml)
+        data_id = result["oml:data_edit"]["oml:id"]
+        return int(data_id)
+
+    def fork(self, data_id: int) -> int:
+        """
+        Creates a new dataset version, with the authenticated user as the new owner.
+        The forked dataset can have distinct dataset meta-data,
+        but the actual data itself is shared with the original version.
+
+        This API is intended for use when a user is unable to edit the critical fields of a dataset
+        through the edit_dataset API.
+        (Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.)
+
+        Specifically, this happens when the user is:
+                1. Not the owner of the dataset.
+                2. User is the owner of the dataset, but the dataset has tasks.
+
+        In these two cases the only way to edit critical fields is:
+                1. STEP 1: Fork the dataset using fork_dataset API
+                2. STEP 2: Call edit_dataset API on the forked version.
+
+
+        Parameters
+        ----------
+        data_id : int
+            id of the dataset to be forked
+
+        Returns
+        -------
+        Dataset id of the forked dataset
+
+        """
+        if not isinstance(data_id, int):
+            raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
+        # compose data fork parameters
+        form_data = {"data_id": data_id}
+        result_xml = self._http.post("data/fork", data=form_data).text
+        result = xmltodict.parse(result_xml)
+        data_id = result["oml:data_fork"]["oml:id"]
+        return int(data_id)
+
+    def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None:
+        """
+        Updates the status of a dataset to either 'active' or 'deactivated'.
+        Please see the OpenML API documentation for a description of the status
+        and all legal status transitions:
+        https://docs.openml.org/concepts/data/#dataset-status
+
+        Parameters
+        ----------
+        data_id : int
+            The data id of the dataset
+        status : str,
+            'active' or 'deactivated'
+        """
+        legal_status = {"active", "deactivated"}
+        if status not in legal_status:
+            raise ValueError(f"Illegal status value. Legal values: {legal_status}")
+
+        data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status}
+        result_xml = self._http.post("data/status/update", data=data).text
+        result = xmltodict.parse(result_xml)
+        server_data_id = result["oml:data_status_update"]["oml:id"]
+        server_status = result["oml:data_status_update"]["oml:status"]
+        if status != server_status or int(data_id) != int(server_data_id):
+            # This should never happen
+            raise ValueError("Data id/status does not collide")
+
     def _list_datasets(
         self,
         limit: int,
@@ -236,9 +438,21 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset:
 
 class DatasetsV2(DatasetsAPI):
     def get(
-        self, dataset_id: int, *, return_response: bool = False
+        self,
+        dataset_id: int | str,
+        version: int | None = None,
+        error_if_multiple: bool = False,  # noqa: FBT002, FBT001
+        *,
+        return_response: bool = False,
     ) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
-        path = f"datasets/{dataset_id}"
+        if isinstance(dataset_id, int):
+            resolved_id = dataset_id
+        elif dataset_id.isdigit():
+            resolved_id = int(dataset_id)
+        else:
+            resolved_id = self._name_to_id(dataset_id, version, error_if_multiple)
+
+        path = f"data/{resolved_id}"
         response = self._http.get(path)
         json_content = response.json()
         dataset = self._create_dataset_from_json(json_content)
@@ -317,6 +531,55 @@ def list(  # noqa: PLR0913
 
         return pd.concat(batches)
 
+    def delete(self, dataset_id: int) -> bool:
+        raise NotImplementedError()
+
+    def edit(  # noqa: PLR0913
+        self,
+        data_id: int,
+        description: str | None = None,
+        creator: str | None = None,
+        contributor: str | None = None,
+        collection_date: str | None = None,
+        language: str | None = None,
+        default_target_attribute: str | None = None,
+        ignore_attribute: str | list[str] | None = None,  # type: ignore
+        citation: str | None = None,
+        row_id_attribute: str | None = None,
+        original_data_url: str | None = None,
+        paper_url: str | None = None,
+    ) -> int:
+        raise NotImplementedError()
+
+    def fork(self, data_id: int) -> int:
+        raise NotImplementedError()
+
+    def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None:
+        """
+        Updates the status of a dataset to either 'active' or 'deactivated'.
+        Please see the OpenML API documentation for a description of the status
+        and all legal status transitions:
+        https://docs.openml.org/concepts/data/#dataset-status
+
+        Parameters
+        ----------
+        data_id : int
+            The data id of the dataset
+        status : str,
+            'active' or 'deactivated'
+        """
+        legal_status = {"active", "deactivated"}
+        if status not in legal_status:
+            raise ValueError(f"Illegal status value. Legal values: {legal_status}")
+
+        data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status}
+        result = self._http.post("datasets/status/update", json=data).json()
+        server_data_id = result["dataset_id"]
+        server_status = result["status"]
+        if status != server_status or int(data_id) != int(server_data_id):
+            # This should never happen
+            raise ValueError("Data id/status does not collide")
+
     def _list_datasets(
         self,
         limit: int,
@@ -365,7 +628,6 @@ def _list_datasets(
     def __list_datasets(self, json: dict) -> pd.DataFrame:
         api_call = "datasets/list"
         datasets_list = self._http.post(api_call, json=json).json()
-
         # Minimalistic check if the JSON is useful
         assert isinstance(datasets_list, list), type(datasets_list)
 
@@ -379,9 +641,9 @@ def __list_datasets(self, json: dict) -> pd.DataFrame:
             # The number of qualities can range from 0 to infinity
             for quality in dataset_.get("quality", []):
                 try:
-                    dataset[quality["name"]] = int(quality["text"])
+                    dataset[quality["name"]] = int(quality["value"])
                 except ValueError:
-                    dataset[quality["name"]] = float(quality["text"])
+                    dataset[quality["name"]] = float(quality["value"])
             datasets[dataset["did"]] = dataset
 
         return pd.DataFrame.from_dict(datasets, orient="index").astype(

From 1c2fa9996aa0024af93ab1819877836b6ab803f2 Mon Sep 17 00:00:00 2001
From: JATAYU000 <shrivaths44kunju@gmail.com>
Date: Thu, 8 Jan 2026 15:57:09 +0530
Subject: [PATCH 08/10] Added features, updated list

---
 openml/_api/resources/base.py     |  76 +----
 openml/_api/resources/datasets.py | 494 +++++++++++++-----------------
 2 files changed, 234 insertions(+), 336 deletions(-)

diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py
index 5a74239d1..990dda998 100644
--- a/openml/_api/resources/base.py
+++ b/openml/_api/resources/base.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from typing_extensions import Literal
 
 if TYPE_CHECKING:
@@ -23,26 +23,18 @@ class DatasetsAPI(ResourceAPI, ABC):
     def get(
         self,
         dataset_id: int | str,
-        version: int | None = None,
-        error_if_multiple: bool = False,  # noqa: FBT002, FBT001
         *,
         return_response: bool = False,
     ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ...
 
     @abstractmethod
-    def list(  # noqa: PLR0913
+    def list(
         self,
+        limit: int,
+        offset: int,
+        *,
         data_id: list[int] | None = None,  # type: ignore
-        offset: int | None = None,
-        size: int | None = None,
-        status: str | None = None,
-        tag: str | None = None,
-        data_name: str | None = None,
-        data_version: int | None = None,
-        number_instances: int | str | None = None,
-        number_features: int | str | None = None,
-        number_classes: int | str | None = None,
-        number_missing_values: int | str | None = None,
+        **kwargs: Any,
     ) -> pd.DataFrame: ...
 
     @abstractmethod
@@ -71,54 +63,14 @@ def fork(self, data_id: int) -> int: ...
     @abstractmethod
     def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ...
 
-    def _name_to_id(
-        self,
-        dataset_name: str,
-        version: int | None = None,
-        error_if_multiple: bool = False,  # noqa: FBT001, FBT002
-    ) -> int:
-        """Attempt to find the dataset id of the dataset with the given name.
-
-        If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``,
-        then return the least recent still active dataset.
-
-        Raises an error if no dataset with the name is found.
-        Raises an error if a version is specified but it could not be found.
-
-        Parameters
-        ----------
-        dataset_name : str
-            The name of the dataset for which to find its id.
-        version : int, optional
-            Version to retrieve. If not specified, the oldest active version is returned.
-        error_if_multiple : bool (default=False)
-            If `False`, if multiple datasets match, return the least recent active dataset.
-            If `True`, if multiple datasets match, raise an error.
-        download_qualities : bool, optional (default=True)
-            If `True`, also download qualities.xml file. If False it skip the qualities.xml.
-
-        Returns
-        -------
-        int
-        The id of the dataset.
-        """
-        status = None if version is not None else "active"
-        candidates = self.list(
-            data_name=dataset_name,
-            status=status,
-            data_version=version,
-        )
-        if error_if_multiple and len(candidates) > 1:
-            msg = f"Multiple active datasets exist with name '{dataset_name}'."
-            raise ValueError(msg)
-
-        if candidates.empty:
-            no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
-            and_version = f" and version '{version}'." if version is not None else "."
-            raise RuntimeError(no_dataset_for_name + and_version)
-
-        # Dataset ids are chronological so we can just sort based on ids (instead of version)
-        return candidates["did"].min()  # type: ignore
+    @abstractmethod
+    def list_qualities(self) -> list[str]: ...  # type: ignore
+
+    @abstractmethod
+    def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: ...
+
+    @abstractmethod
+    def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: ...
 
 
 class TasksAPI(ResourceAPI, ABC):
diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py
index 5414fba43..845212b20 100644
--- a/openml/_api/resources/datasets.py
+++ b/openml/_api/resources/datasets.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 from collections import OrderedDict
-from functools import partial
 from typing import TYPE_CHECKING, Any
 from typing_extensions import Literal
 
@@ -20,19 +19,10 @@ class DatasetsV1(DatasetsAPI):
     def get(
         self,
         dataset_id: int | str,
-        version: int | None = None,
-        error_if_multiple: bool = False,  # noqa: FBT002, FBT001
         *,
         return_response: bool = False,
     ) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
-        if isinstance(dataset_id, int):
-            resolved_id = dataset_id
-        elif dataset_id.isdigit():
-            resolved_id = int(dataset_id)
-        else:
-            resolved_id = self._name_to_id(dataset_id, version, error_if_multiple)
-
-        path = f"data/{resolved_id}"
+        path = f"data/{dataset_id}"
         response = self._http.get(path)
         xml_content = response.text
         dataset = self._create_dataset_from_xml(xml_content)
@@ -42,74 +32,88 @@ def get(
 
         return dataset
 
-    def list(  # noqa: PLR0913
+    def list(
         self,
-        data_id: list[int] | None = None,
-        offset: int | None = None,
-        size: int | None = None,
-        status: str | None = None,
-        tag: str | None = None,
-        data_name: str | None = None,
-        data_version: int | None = None,
-        number_instances: int | str | None = None,
-        number_features: int | str | None = None,
-        number_classes: int | str | None = None,
-        number_missing_values: int | str | None = None,
+        limit: int,
+        offset: int,
+        *,
+        data_id: list[int] | None = None,  # type: ignore
+        **kwargs: Any,
     ) -> pd.DataFrame:
-        """Return a dataframe of all dataset which are on OpenML.
-
-        Supports large amount of results.
+        """
+        Perform api call to return a list of all datasets.
 
         Parameters
         ----------
-        data_id : list, optional
-            A list of data ids, to specify which datasets should be
-            listed
-        offset : int, optional
-            The number of datasets to skip, starting from the first.
-        size : int, optional
+        The arguments that are lists are separated from the single value
+        ones which are put into the kwargs.
+        display_errors is also separated from the kwargs since it has a
+        default value.
+
+        limit : int
             The maximum number of datasets to show.
-        status : str, optional
-            Should be {active, in_preparation, deactivated}. By
-            default active datasets are returned, but also datasets
-            from another status can be requested.
-        tag : str, optional
-        data_name : str, optional
-        data_version : int, optional
-        number_instances : int | str, optional
-        number_features : int | str, optional
-        number_classes : int | str, optional
-        number_missing_values : int | str, optional
+        offset : int
+            The number of datasets to skip, starting from the first.
+        data_id : list, optional
+
+        kwargs : dict, optional
+            Legal filter operators (keys in the dict):
+            tag, status, limit, offset, data_name, data_version, number_instances,
+            number_features, number_classes, number_missing_values.
 
         Returns
         -------
-        datasets: dataframe
-            Each row maps to a dataset
-            Each column contains the following information:
-            - dataset id
-            - name
-            - format
-            - status
-            If qualities are calculated for the dataset, some of
-            these are also included as columns.
+        datasets : dataframe
         """
-        listing_call = partial(
-            self._list_datasets,
-            data_id=data_id,
-            status=status,
-            tag=tag,
-            data_name=data_name,
-            data_version=data_version,
-            number_instances=number_instances,
-            number_features=number_features,
-            number_classes=number_classes,
-            number_missing_values=number_missing_values,
+        api_call = "data/list"
+
+        if limit is not None:
+            api_call += f"/limit/{limit}"
+        if offset is not None:
+            api_call += f"/offset/{offset}"
+
+        if kwargs is not None:
+            for operator, value in kwargs.items():
+                if value is not None:
+                    api_call += f"/{operator}/{value}"
+        if data_id is not None:
+            api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}"
+
+        xml_string = self._http.get(api_call).text
+        datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
+
+        # Minimalistic check if the XML is useful
+        assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type(
+            datasets_dict["oml:data"],
         )
-        batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
-        if len(batches) == 0:
-            return pd.DataFrame()
+        assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[
+            "oml:data"
+        ]["@xmlns:oml"]
+
+        datasets = {}
+        for dataset_ in datasets_dict["oml:data"]["oml:dataset"]:
+            ignore_attribute = ["oml:file_id", "oml:quality"]
+            dataset = {
+                k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute
+            }
+            dataset["did"] = int(dataset["did"])
+            dataset["version"] = int(dataset["version"])
+
+            # The number of qualities can range from 0 to infinity
+            for quality in dataset_.get("oml:quality", []):
+                try:
+                    dataset[quality["@name"]] = int(quality["#text"])
+                except ValueError:
+                    dataset[quality["@name"]] = float(quality["#text"])
+            datasets[dataset["did"]] = dataset
 
-        return pd.concat(batches)
+        return pd.DataFrame.from_dict(datasets, orient="index").astype(
+            {
+                "did": int,
+                "version": int,
+                "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]),
+            }
+        )
 
     def delete(self, dataset_id: int) -> bool:
         """Delete dataset with id `dataset_id` from the OpenML server.
@@ -299,90 +303,27 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"])
             # This should never happen
             raise ValueError("Data id/status does not collide")
 
-    def _list_datasets(
-        self,
-        limit: int,
-        offset: int,
-        *,
-        data_id: list[int] | None = None,  # type: ignore
-        **kwargs: Any,
-    ) -> pd.DataFrame:
-        """
-        Perform api call to return a list of all datasets.
-
-        Parameters
-        ----------
-        The arguments that are lists are separated from the single value
-        ones which are put into the kwargs.
-        display_errors is also separated from the kwargs since it has a
-        default value.
+    def list_qualities(self) -> list[str]:  # type: ignore
+        """Return list of data qualities available.
 
-        limit : int
-            The maximum number of datasets to show.
-        offset : int
-            The number of datasets to skip, starting from the first.
-        data_id : list, optional
-
-        kwargs : dict, optional
-            Legal filter operators (keys in the dict):
-            tag, status, limit, offset, data_name, data_version, number_instances,
-            number_features, number_classes, number_missing_values.
+        The function performs an API call to retrieve the entire list of
+        data qualities that are computed on the datasets uploaded.
 
         Returns
         -------
-        datasets : dataframe
+        list
         """
-        api_call = "data/list"
-
-        if limit is not None:
-            api_call += f"/limit/{limit}"
-        if offset is not None:
-            api_call += f"/offset/{offset}"
-
-        if kwargs is not None:
-            for operator, value in kwargs.items():
-                if value is not None:
-                    api_call += f"/{operator}/{value}"
-        if data_id is not None:
-            api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}"
-        return self.__list_datasets(api_call=api_call)
-
-    def __list_datasets(self, api_call: str) -> pd.DataFrame:
+        api_call = "data/qualities/list"
         xml_string = self._http.get(api_call).text
-        datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
-
+        qualities = xmltodict.parse(xml_string, force_list=("oml:quality"))
         # Minimalistic check if the XML is useful
-        assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type(
-            datasets_dict["oml:data"],
-        )
-        assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[
-            "oml:data"
-        ]["@xmlns:oml"]
-
-        datasets = {}
-        for dataset_ in datasets_dict["oml:data"]["oml:dataset"]:
-            ignore_attribute = ["oml:file_id", "oml:quality"]
-            dataset = {
-                k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute
-            }
-            dataset["did"] = int(dataset["did"])
-            dataset["version"] = int(dataset["version"])
+        if "oml:data_qualities_list" not in qualities:
+            raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"')
 
-            # The number of qualities can range from 0 to infinity
-            for quality in dataset_.get("oml:quality", []):
-                try:
-                    dataset[quality["@name"]] = int(quality["#text"])
-                except ValueError:
-                    dataset[quality["@name"]] = float(quality["#text"])
-            datasets[dataset["did"]] = dataset
+        if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list):
+            raise TypeError('Error in return XML, does not contain "oml:quality" as a list')
 
-        return pd.DataFrame.from_dict(datasets, orient="index").astype(
-            {
-                "did": int,
-                "version": int,
-                "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]),
-            }
-        )
+        return qualities["oml:data_qualities_list"]["oml:quality"]
 
     def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset:
         """Create a dataset given a xml string.
@@ -435,24 +376,74 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset:
             parquet_file=str(parquet_file) if parquet_file is not None else None,
         )
 
+    def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool:
+        """
+        An ontology describes the concepts that are described in a feature. An
+        ontology is defined by a URL where the information is provided. Adds
+        an ontology (URL) to a given dataset feature (defined by a dataset id
+        and index). The dataset has to exist on OpenML and needs to have been
+        processed by the evaluation engine.
+
+        Parameters
+        ----------
+        data_id : int
+            id of the dataset to which the feature belongs
+        index : int
+            index of the feature in dataset (0-based)
+        ontology : str
+            URL to ontology (max. 256 characters)
+
+        Returns
+        -------
+        True or throws an OpenML server exception
+        """
+        upload_data: dict[str, int | str] = {
+            "data_id": data_id,
+            "index": index,
+            "ontology": ontology,
+        }
+        self._http.post("data/feature/ontology/add", data=upload_data)
+        # an error will be thrown in case the request was unsuccessful
+        return True
+
+    def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool:
+        """
+        Removes an existing ontology (URL) from a given dataset feature (defined
+        by a dataset id and index). The dataset has to exist on OpenML and needs
+        to have been processed by the evaluation engine. The ontology needs to be
+        attached to the specific feature.
+
+        Parameters
+        ----------
+        data_id : int
+            id of the dataset to which the feature belongs
+        index : int
+            index of the feature in dataset (0-based)
+        ontology : str
+            URL to ontology (max. 256 characters)
+
+        Returns
+        -------
+        True or throws an OpenML server exception
+        """
+        upload_data: dict[str, int | str] = {
+            "data_id": data_id,
+            "index": index,
+            "ontology": ontology,
+        }
+        self._http.post("data/feature/ontology/remove", data=upload_data)
+        # an error will be thrown in case the request was unsuccessful
+        return True
+
 
 class DatasetsV2(DatasetsAPI):
     def get(
         self,
         dataset_id: int | str,
-        version: int | None = None,
-        error_if_multiple: bool = False,  # noqa: FBT002, FBT001
         *,
         return_response: bool = False,
     ) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
-        if isinstance(dataset_id, int):
-            resolved_id = dataset_id
-        elif dataset_id.isdigit():
-            resolved_id = int(dataset_id)
-        else:
-            resolved_id = self._name_to_id(dataset_id, version, error_if_multiple)
-
-        path = f"data/{resolved_id}"
+        path = f"data/{dataset_id}"
         response = self._http.get(path)
         json_content = response.json()
         dataset = self._create_dataset_from_json(json_content)
@@ -462,125 +453,7 @@ def get(
 
         return dataset
 
-    def list(  # noqa: PLR0913
-        self,
-        data_id: list[int] | None = None,
-        offset: int | None = None,
-        size: int | None = None,
-        status: str | None = None,
-        tag: str | None = None,
-        data_name: str | None = None,
-        data_version: int | None = None,
-        number_instances: int | str | None = None,
-        number_features: int | str | None = None,
-        number_classes: int | str | None = None,
-        number_missing_values: int | str | None = None,
-    ) -> pd.DataFrame:
-        """Return a dataframe of all dataset which are on OpenML.
-
-        Supports large amount of results.
-
-        Parameters
-        ----------
-        data_id : list, optional
-            A list of data ids, to specify which datasets should be
-            listed
-        offset : int, optional
-            The number of datasets to skip, starting from the first.
-        size : int, optional
-            The maximum number of datasets to show.
-        status : str, optional
-            Should be {active, in_preparation, deactivated}. By
-            default active datasets are returned, but also datasets
-            from another status can be requested.
-        tag : str, optional
-        data_name : str, optional
-        data_version : int, optional
-        number_instances : int | str, optional
-        number_features : int | str, optional
-        number_classes : int | str, optional
-        number_missing_values : int | str, optional
-
-        Returns
-        -------
-        datasets: dataframe
-            Each row maps to a dataset
-            Each column contains the following information:
-            - dataset id
-            - name
-            - format
-            - status
-            If qualities are calculated for the dataset, some of
-            these are also included as columns.
-        """
-        listing_call = partial(
-            self._list_datasets,
-            data_id=data_id,
-            status=status,
-            tag=tag,
-            data_name=data_name,
-            data_version=data_version,
-            number_instances=number_instances,
-            number_features=number_features,
-            number_classes=number_classes,
-            number_missing_values=number_missing_values,
-        )
-        batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
-        if len(batches) == 0:
-            return pd.DataFrame()
-
-        return pd.concat(batches)
-
-    def delete(self, dataset_id: int) -> bool:
-        raise NotImplementedError()
-
-    def edit(  # noqa: PLR0913
-        self,
-        data_id: int,
-        description: str | None = None,
-        creator: str | None = None,
-        contributor: str | None = None,
-        collection_date: str | None = None,
-        language: str | None = None,
-        default_target_attribute: str | None = None,
-        ignore_attribute: str | list[str] | None = None,  # type: ignore
-        citation: str | None = None,
-        row_id_attribute: str | None = None,
-        original_data_url: str | None = None,
-        paper_url: str | None = None,
-    ) -> int:
-        raise NotImplementedError()
-
-    def fork(self, data_id: int) -> int:
-        raise NotImplementedError()
-
-    def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None:
-        """
-        Updates the status of a dataset to either 'active' or 'deactivated'.
-        Please see the OpenML API documentation for a description of the status
-        and all legal status transitions:
-        https://docs.openml.org/concepts/data/#dataset-status
-
-        Parameters
-        ----------
-        data_id : int
-            The data id of the dataset
-        status : str,
-            'active' or 'deactivated'
-        """
-        legal_status = {"active", "deactivated"}
-        if status not in legal_status:
-            raise ValueError(f"Illegal status value. Legal values: {legal_status}")
-
-        data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status}
-        result = self._http.post("datasets/status/update", json=data).json()
-        server_data_id = result["dataset_id"]
-        server_status = result["status"]
-        if status != server_status or int(data_id) != int(server_data_id):
-            # This should never happen
-            raise ValueError("Data id/status does not collide")
-
-    def _list_datasets(
+    def list(
         self,
         limit: int,
         offset: int,
@@ -623,9 +496,6 @@ def _list_datasets(
                 if value is not None:
                     json[operator] = value
 
-        return self.__list_datasets(json=json)
-
-    def __list_datasets(self, json: dict) -> pd.DataFrame:
         api_call = "datasets/list"
         datasets_list = self._http.post(api_call, json=json).json()
         # Minimalistic check if the JSON is useful
@@ -654,6 +524,76 @@ def __list_datasets(self, json: dict) -> pd.DataFrame:
             }
         )
 
+    def delete(self, dataset_id: int) -> bool:
+        raise NotImplementedError()  # stub: this class has no delete logic yet
+
+    def edit(  # noqa: PLR0913
+        self,
+        data_id: int,
+        description: str | None = None,
+        creator: str | None = None,
+        contributor: str | None = None,
+        collection_date: str | None = None,
+        language: str | None = None,
+        default_target_attribute: str | None = None,
+        ignore_attribute: str | list[str] | None = None,  # type: ignore
+        citation: str | None = None,
+        row_id_attribute: str | None = None,
+        original_data_url: str | None = None,
+        paper_url: str | None = None,
+    ) -> int:
+        raise NotImplementedError()  # stub: this class has no edit logic yet
+
+    def fork(self, data_id: int) -> int:
+        raise NotImplementedError()  # stub: this class has no fork logic yet
+
+    def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None:
+        """
+        Updates the status of a dataset to either 'active' or 'deactivated'.
+        Please see the OpenML API documentation for a description of the status
+        and all legal status transitions:
+        https://docs.openml.org/concepts/data/#dataset-status
+
+        Parameters
+        ----------
+        data_id : int
+            The data id of the dataset
+        status : {'active', 'deactivated'}
+            The status to set; any other value raises ValueError
+        """
+        legal_status = {"active", "deactivated"}
+        if status not in legal_status:
+            raise ValueError(f"Illegal status value. Legal values: {legal_status}")
+
+        data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status}
+        result = self._http.post("datasets/status/update", json=data).json()
+        server_data_id = result["dataset_id"]
+        server_status = result["status"]
+        if status != server_status or int(data_id) != int(server_data_id):
+            # The response echoed a different id/status than requested; should never happen
+            raise ValueError("Data id/status does not collide")
+
+    def list_qualities(self) -> list[str]:  # type: ignore
+        """Return list of data qualities available.
+
+        Performs one API call to retrieve the entire list of data qualities; raises
+        ValueError or TypeError when the JSON response lacks the expected structure.
+
+        Returns
+        -------
+        list
+        """
+        api_call = "datasets/qualities/list"
+        qualities = self._http.get(api_call).json()
+        # Minimalistic check if the JSON is useful
+        if "data_qualities_list" not in qualities:
+            raise ValueError('Error in return JSON, does not contain "data_qualities_list"')
+
+        # .get(): a missing "quality" key becomes None and fails the list check below
+        quality_names = qualities["data_qualities_list"].get("quality")
+        if not isinstance(quality_names, list):
+            raise TypeError('Error in return JSON, does not contain "quality" as a list')
+        return quality_names
+
     def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset:
         """Create a dataset given a json.
 
@@ -702,3 +642,9 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset:
             parquet_url=json_content.get("parquet_url"),
             parquet_file=str(parquet_file) if parquet_file is not None else None,
         )
+
+    def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool:
+        raise NotImplementedError()  # stub: this class has no ontology-add logic yet
+
+    def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool:
+        raise NotImplementedError()  # stub: this class has no ontology-remove logic yet

From 9bcbcb32c232bb35b34e90ad7739de6c938ee5f3 Mon Sep 17 00:00:00 2001
From: JATAYU000 <shrivaths44kunju@gmail.com>
Date: Fri, 9 Jan 2026 13:01:34 +0530
Subject: [PATCH 09/10] Refactor functions, except get

---
 openml/datasets/functions.py | 181 ++++-------------------------------
 1 file changed, 21 insertions(+), 160 deletions(-)

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index ac5466a44..23cdefdd2 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -5,7 +5,6 @@
 import logging
 import os
 import warnings
-from collections import OrderedDict
 from functools import partial
 from pathlib import Path
 from pyexpat import ExpatError
@@ -22,6 +21,7 @@
 
 import openml._api_calls
 import openml.utils
+from openml._api import api_context
 from openml.config import OPENML_SKIP_PARQUET_ENV_VAR
 from openml.exceptions import (
     OpenMLHashException,
@@ -65,17 +65,7 @@ def list_qualities() -> list[str]:
     -------
     list
     """
-    api_call = "data/qualities/list"
-    xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    qualities = xmltodict.parse(xml_string, force_list=("oml:quality"))
-    # Minimalistic check if the XML is useful
-    if "oml:data_qualities_list" not in qualities:
-        raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"')
-
-    if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list):
-        raise TypeError('Error in return XML, does not contain "oml:quality" as a list')
-
-    return qualities["oml:data_qualities_list"]["oml:quality"]
+    return api_context.backend.datasets.list_qualities()
 
 
 def list_datasets(
@@ -129,7 +119,7 @@ def list_datasets(
         these are also included as columns.
     """
     listing_call = partial(
-        _list_datasets,
+        api_context.backend.datasets.list,
         data_id=data_id,
         status=status,
         tag=tag,
@@ -147,92 +137,6 @@ def list_datasets(
     return pd.concat(batches)
 
 
-def _list_datasets(
-    limit: int,
-    offset: int,
-    *,
-    data_id: list[int] | None = None,
-    **kwargs: Any,
-) -> pd.DataFrame:
-    """
-    Perform api call to return a list of all datasets.
-
-    Parameters
-    ----------
-    The arguments that are lists are separated from the single value
-    ones which are put into the kwargs.
-    display_errors is also separated from the kwargs since it has a
-    default value.
-
-    limit : int
-        The maximum number of datasets to show.
-    offset : int
-        The number of datasets to skip, starting from the first.
-    data_id : list, optional
-
-    kwargs : dict, optional
-        Legal filter operators (keys in the dict):
-        tag, status, limit, offset, data_name, data_version, number_instances,
-        number_features, number_classes, number_missing_values.
-
-    Returns
-    -------
-    datasets : dataframe
-    """
-    api_call = "data/list"
-
-    if limit is not None:
-        api_call += f"/limit/{limit}"
-    if offset is not None:
-        api_call += f"/offset/{offset}"
-
-    if kwargs is not None:
-        for operator, value in kwargs.items():
-            if value is not None:
-                api_call += f"/{operator}/{value}"
-    if data_id is not None:
-        api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}"
-    return __list_datasets(api_call=api_call)
-
-
-def __list_datasets(api_call: str) -> pd.DataFrame:
-    xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
-
-    # Minimalistic check if the XML is useful
-    assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type(
-        datasets_dict["oml:data"],
-    )
-    assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[
-        "oml:data"
-    ]["@xmlns:oml"]
-
-    datasets = {}
-    for dataset_ in datasets_dict["oml:data"]["oml:dataset"]:
-        ignore_attribute = ["oml:file_id", "oml:quality"]
-        dataset = {
-            k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute
-        }
-        dataset["did"] = int(dataset["did"])
-        dataset["version"] = int(dataset["version"])
-
-        # The number of qualities can range from 0 to infinity
-        for quality in dataset_.get("oml:quality", []):
-            try:
-                dataset[quality["@name"]] = int(quality["#text"])
-            except ValueError:
-                dataset[quality["@name"]] = float(quality["#text"])
-        datasets[dataset["did"]] = dataset
-
-    return pd.DataFrame.from_dict(datasets, orient="index").astype(
-        {
-            "did": int,
-            "version": int,
-            "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]),
-        }
-    )
-
-
 def _expand_parameter(parameter: str | list[str] | None) -> list[str]:
     expanded_parameter = []
     if isinstance(parameter, str):
@@ -808,14 +712,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non
     if status not in legal_status:
         raise ValueError(f"Illegal status value. Legal values: {legal_status}")
 
-    data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status}
-    result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data)
-    result = xmltodict.parse(result_xml)
-    server_data_id = result["oml:data_status_update"]["oml:id"]
-    server_status = result["oml:data_status_update"]["oml:status"]
-    if status != server_status or int(data_id) != int(server_data_id):
-        # This should never happen
-        raise ValueError("Data id/status does not collide")
+    api_context.backend.datasets.status_update(data_id=data_id, status=status)
 
 
 def edit_dataset(
@@ -889,43 +786,20 @@ def edit_dataset(
     -------
     Dataset id
     """
-    if not isinstance(data_id, int):
-        raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
-
-    # compose data edit parameters as xml
-    form_data = {"data_id": data_id}  # type: openml._api_calls.DATA_TYPE
-    xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
-    xml["oml:data_edit_parameters"] = OrderedDict()
-    xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
-    xml["oml:data_edit_parameters"]["oml:description"] = description
-    xml["oml:data_edit_parameters"]["oml:creator"] = creator
-    xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
-    xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
-    xml["oml:data_edit_parameters"]["oml:language"] = language
-    xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute
-    xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute
-    xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute
-    xml["oml:data_edit_parameters"]["oml:citation"] = citation
-    xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
-    xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url
-
-    # delete None inputs
-    for k in list(xml["oml:data_edit_parameters"]):
-        if not xml["oml:data_edit_parameters"][k]:
-            del xml["oml:data_edit_parameters"][k]
-
-    file_elements = {
-        "edit_parameters": ("description.xml", xmltodict.unparse(xml)),
-    }  # type: openml._api_calls.FILE_ELEMENTS_TYPE
-    result_xml = openml._api_calls._perform_api_call(
-        "data/edit",
-        "post",
-        data=form_data,
-        file_elements=file_elements,
+    return api_context.backend.datasets.edit(
+        data_id,
+        description,
+        creator,
+        contributor,
+        collection_date,
+        language,
+        default_target_attribute,
+        ignore_attribute,
+        citation,
+        row_id_attribute,
+        original_data_url,
+        paper_url,
     )
-    result = xmltodict.parse(result_xml)
-    data_id = result["oml:data_edit"]["oml:id"]
-    return int(data_id)
 
 
 def fork_dataset(data_id: int) -> int:
@@ -957,14 +831,7 @@ def fork_dataset(data_id: int) -> int:
     Dataset id of the forked dataset
 
     """
-    if not isinstance(data_id, int):
-        raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
-    # compose data fork parameters
-    form_data = {"data_id": data_id}  # type: openml._api_calls.DATA_TYPE
-    result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data)
-    result = xmltodict.parse(result_xml)
-    data_id = result["oml:data_fork"]["oml:id"]
-    return int(data_id)
+    return api_context.backend.datasets.fork(data_id=data_id)
 
 
 def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool:
@@ -988,10 +855,7 @@ def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool:
     -------
     True or throws an OpenML server exception
     """
-    upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology}
-    openml._api_calls._perform_api_call("data/feature/ontology/add", "post", data=upload_data)
-    # an error will be thrown in case the request was unsuccessful
-    return True
+    return api_context.backend.datasets.feature_add_ontology(data_id, index, ontology)
 
 
 def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool:
@@ -1014,10 +878,7 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo
     -------
     True or throws an OpenML server exception
     """
-    upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology}
-    openml._api_calls._perform_api_call("data/feature/ontology/remove", "post", data=upload_data)
-    # an error will be thrown in case the request was unsuccessful
-    return True
+    return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology)
 
 
 def _topic_add_dataset(data_id: int, topic: str) -> int:
@@ -1460,4 +1321,4 @@ def delete_dataset(dataset_id: int) -> bool:
     bool
         True if the deletion was successful. False otherwise.
     """
-    return openml.utils._delete_entity("data", dataset_id)
+    return api_context.backend.datasets.delete(dataset_id)

From 96df5e30b46ea80633cb9593ceacf36ff10c8308 Mon Sep 17 00:00:00 2001
From: JATAYU000 <shrivaths44kunju@gmail.com>
Date: Fri, 9 Jan 2026 15:38:07 +0530
Subject: [PATCH 10/10] Remove circular import using lazy import

---
 openml/datasets/functions.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 23cdefdd2..6ede42ea9 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -21,7 +21,6 @@
 
 import openml._api_calls
 import openml.utils
-from openml._api import api_context
 from openml.config import OPENML_SKIP_PARQUET_ENV_VAR
 from openml.exceptions import (
     OpenMLHashException,
@@ -65,6 +64,8 @@ def list_qualities() -> list[str]:
     -------
     list
     """
+    from openml._api import api_context
+
     return api_context.backend.datasets.list_qualities()
 
 
@@ -118,6 +119,8 @@ def list_datasets(
         If qualities are calculated for the dataset, some of
         these are also included as columns.
     """
+    from openml._api import api_context
+
     listing_call = partial(
         api_context.backend.datasets.list,
         data_id=data_id,
@@ -708,6 +711,8 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non
     status : str,
         'active' or 'deactivated'
     """
+    from openml._api import api_context
+
     legal_status = {"active", "deactivated"}
     if status not in legal_status:
         raise ValueError(f"Illegal status value. Legal values: {legal_status}")
@@ -786,6 +791,8 @@ def edit_dataset(
     -------
     Dataset id
     """
+    from openml._api import api_context
+
     return api_context.backend.datasets.edit(
         data_id,
         description,
@@ -831,6 +838,8 @@ def fork_dataset(data_id: int) -> int:
     Dataset id of the forked dataset
 
     """
+    from openml._api import api_context
+
     return api_context.backend.datasets.fork(data_id=data_id)
 
 
@@ -855,6 +864,8 @@ def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool:
     -------
     True or throws an OpenML server exception
     """
+    from openml._api import api_context
+
     return api_context.backend.datasets.feature_add_ontology(data_id, index, ontology)
 
 
@@ -878,6 +889,8 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo
     -------
     True or throws an OpenML server exception
     """
+    from openml._api import api_context
+
     return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology)
 
 
@@ -1321,4 +1334,6 @@ def delete_dataset(dataset_id: int) -> bool:
     bool
         True if the deletion was successful. False otherwise.
     """
+    from openml._api import api_context
+
     return api_context.backend.datasets.delete(dataset_id)