diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b55b8d..f17a9ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.7.0 + +- Fixing issues with using R and Python interfaces to the same cache directory. +- List resources now returns a `BiocFrame` object. + ## Version 0.6.1 - 0.6.2 - Generate rid's that match with R's cache. diff --git a/README.md b/README.md index 81cb9ec..b362678 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ resource = cache.add("myfile", "path/to/file.txt") resource = cache.get("myfile") # Use the cached file -print(resource.rpath) # Path to cached file +print(resource["rpath"]) # Path to cached file ``` ## Advanced Usage diff --git a/setup.cfg b/setup.cfg index 60b83e4..7a70996 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,10 +10,10 @@ author_email = jayaram.kancherla@gmail.com license = MIT long_description = file: README.md long_description_content_type = text/markdown; charset=UTF-8; variant=GFM -url = https://github.com/epiviz/pyBiocFileCache +url = https://github.com/biocpy/pyBiocFileCache # Add here related links, for example: project_urls = - Documentation = https://pyscaffold.org/ + Documentation = https://github.com/biocpy/pyBiocFileCache # Source = https://github.com/pyscaffold/pyscaffold/ # Changelog = https://pyscaffold.org/en/latest/changelog.html # Tracker = https://github.com/pyscaffold/pyscaffold/issues @@ -48,6 +48,7 @@ python_requires = >=3.9 install_requires = importlib-metadata; python_version<"3.8" sqlalchemy + biocframe [options.packages.find] where = src diff --git a/src/pybiocfilecache/cache.py b/src/pybiocfilecache/cache.py index d8ee3ef..3c13bda 100644 --- a/src/pybiocfilecache/cache.py +++ b/src/pybiocfilecache/cache.py @@ -5,6 +5,7 @@ from time import sleep, time from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union +from biocframe import BiocFrame from sqlalchemy import create_engine, func, text from sqlalchemy.orm import Session, sessionmaker 
from sqlalchemy.pool import QueuePool @@ -14,6 +15,7 @@ from .models import Base, Metadata, Resource from .utils import ( calculate_file_hash, + convert_to_columnar, copy_or_move, create_tmp_dir, download_web_file, @@ -104,15 +106,19 @@ def _setup_database(self) -> None: return SCHEMA_VERSION - def _get_detached_resource( - self, session: Session, obj: Union[Resource, Metadata] - ) -> Optional[Union[Resource, Metadata]]: + def _get_detached_resource(self, session: Session, obj: Union[Resource, Metadata]) -> Optional[dict]: """Get a detached copy of a resource.""" if obj is None: return None session.refresh(obj) session.expunge(obj) - return obj + obj_dict = obj.to_dict() + + if isinstance(obj, Resource): + if obj_dict["rtype"] == "relative": + obj_dict["rpath"] = f"{self.config.cache_dir}/{obj_dict['rpath']}" + + return obj_dict def __enter__(self) -> "BiocFileCache": return self @@ -137,6 +143,10 @@ def get_session(self) -> Iterator[Session]: finally: session.close() + ######################### + ######>> cleanup <<###### + ######################### + # def _validate_rname(self, rname: str) -> None: # """Validate resource name format.""" # if not validate_rname(rname, self.config.rname_pattern): @@ -191,7 +201,11 @@ def cleanup(self) -> int: self._last_cleanup = datetime.now() return removed - def get(self, rname: str = None, rid: str = None) -> Optional[Resource]: + ############################### + ######>> get resources <<###### + ############################### + + def get(self, rname: str = None, rid: str = None) -> Optional[dict]: """Get resource by name from cache. 
Args: @@ -215,7 +229,7 @@ def get(self, rname: str = None, rid: str = None) -> Optional[Resource]: # Check if path exists with timeout start = time() timeout = 30 - while not Path(str(resource.rpath)).exists(): + while not Path(str(self.config.cache_dir / resource.rpath)).exists(): if time() - start >= timeout: raise TimeoutError(f"For resource: '{rname}' the rpath does not exist after {timeout} seconds.") sleep(0.1) @@ -236,7 +250,7 @@ def add( expires: Optional[datetime] = None, download: bool = True, ext: bool = True, - ) -> Resource: + ) -> dict: """Add a resource to the cache. Args: @@ -268,7 +282,7 @@ def add( Defaults to `True`. Returns: - The `Resource` object added to the cache. + The `Resource` object added to the cache as a dictionary. """ # self._validate_rname(rname) fpath = Path(fpath) if rtype != "web" else fpath @@ -289,7 +303,7 @@ def add( # Generate paths and check size rid = generate_id(size=len(self)) uuid = generate_uuid() - rpath = self.config.cache_dir / f"{uuid}_{outpath.name if ext else outpath.stem}" if action != "asis" else fpath + rpath = f"{uuid}_{outpath.name if ext else outpath.stem}" if action != "asis" else fpath # Create resource record resource = Resource( @@ -307,10 +321,10 @@ def add( session.commit() try: - copy_or_move(outpath, rpath, rname, action, False) + copy_or_move(outpath, self.config.cache_dir / rpath, rname, action, False) # Calculate and store checksum - resource.etag = calculate_file_hash(rpath, self.config.hash_algorithm) + resource.etag = calculate_file_hash(self.config.cache_dir / rpath, self.config.hash_algorithm) session.commit() result = self._get_detached_resource(session, resource) return result @@ -320,7 +334,7 @@ def add( session.commit() raise Exception("Failed to add resource") from e - def add_batch(self, resources: List[Dict[str, Any]]) -> List[Resource]: + def add_batch(self, resources: List[Dict[str, Any]]) -> BiocFrame: """Add multiple resources in a single transaction.
Args: @@ -344,7 +358,7 @@ def update( rname: str, fpath: Union[str, Path], action: Literal["copy", "move", "asis"] = "copy", - ) -> Resource: + ) -> dict: """Update an existing resource. Args: @@ -359,7 +373,7 @@ def update( Defaults to ``copy``. Returns: - Updated `Resource` object. + Updated `Resource` object as a dictionary. """ fpath = Path(fpath) @@ -416,7 +430,7 @@ def remove(self, rname: str) -> None: session.rollback() raise Exception(f"Failed to remove resource '{rname}'") from e - def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = None) -> List[Resource]: + def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = None) -> BiocFrame: """List resources in the cache with optional filtering. Args: @@ -432,7 +446,7 @@ def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = Note: Resources with no expiration are always considered non-expired. Returns: - List of Resource objects matching the filters + A `BiocFrame` of the resources matching the filters. """ with self.get_session() as session: query = session.query(Resource) @@ -452,7 +466,7 @@ def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = ) resources = query.all() - return [self._get_detached_resource(session, r) for r in resources] + return BiocFrame(convert_to_columnar([self._get_detached_resource(session, r) for r in resources])) def validate_resource(self, resource: Resource) -> bool: """Validate resource integrity. @@ -521,7 +535,7 @@ def verify_cache(self) -> Tuple[int, int]: invalid += 1 return valid, invalid - def search(self, query: str, field: str = "rname", exact: bool = False) -> List[Resource]: + def search(self, query: str, field: str = "rname", exact: bool = False) -> BiocFrame: """Search for resources by field value.
Args: @@ -543,7 +557,7 @@ def search(self, query: str, field: str = "rname", exact: bool = False) -> List[ else: resources = session.query(Resource).filter(Resource[field].ilike(f"%{query}%")).all() - return [self._get_detached_resource(session, r) for r in resources] + return BiocFrame(convert_to_columnar([self._get_detached_resource(session, r) for r in resources])) def get_stats(self) -> Dict[str, Any]: """Get statistics about the cache.""" @@ -669,7 +683,7 @@ def add_metadata(self, key: str, value: str): except Exception as e: session.delete(meta) session.commit() - raise Exception("Failed to add metadata") from e + raise Exception("Failed to add metadata", str(e)) from e else: raise Exception(f"'key'={key} already exists in metadata.") diff --git a/src/pybiocfilecache/models.py b/src/pybiocfilecache/models.py index 340bf37..aa414f4 100644 --- a/src/pybiocfilecache/models.py +++ b/src/pybiocfilecache/models.py @@ -19,6 +19,9 @@ class Metadata(Base): def __repr__(self) -> str: return f"" + def to_dict(self) -> dict: + return {"key": self.key, "value": self.value} + class Resource(Base): """Resource information stored in cache. 
@@ -74,3 +77,18 @@ class Resource(Base): def __repr__(self) -> str: return f"" + + def to_dict(self) -> dict: + return { + "id": self.id, + "rid": self.rid, + "rname": self.rname, + "create_time": self.create_time, + "access_time": self.access_time, + "rpath": self.rpath, + "rtype": self.rtype, + "fpath": self.fpath, + "last_modified_time": self.last_modified_time, + "etag": self.etag, + "expires": self.expires, + } diff --git a/src/pybiocfilecache/utils.py b/src/pybiocfilecache/utils.py index 738cb17..4295af6 100644 --- a/src/pybiocfilecache/utils.py +++ b/src/pybiocfilecache/utils.py @@ -7,7 +7,7 @@ import zlib from pathlib import Path from shutil import copy2, move -from typing import Literal +from typing import List, Literal __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -97,3 +97,16 @@ def download_web_file(url: str, filename: str, download: bool): open(str(outpath), "a").close() return outpath + + +def convert_to_columnar(list_of_dicts: List[dict]): + if not list_of_dicts: + return {} + + column_names = list_of_dicts[0].keys() + result = {col: [] for col in column_names} + + for row in list_of_dicts: + for col in column_names: + result[col].append(row.get(col)) + return result diff --git a/tests/test_cache.py b/tests/test_cache.py index 64b5488..93c2467 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -34,19 +34,19 @@ def test_add_get_list_operations(): rec2 = bfc.get("test2") assert rec2 is not None - frec1 = open(rec1.rpath, "r").read().strip() + frec1 = open(rec1["rpath"], "r").read().strip() assert frec1 == "test1" - frec2 = open(rec2.rpath, "r").read().strip() + frec2 = open(rec2["rpath"], "r").read().strip() assert frec2 == "test2" shutil.copy(os.getcwd() + "/tests/data/test2.txt", os.getcwd() + "/tests/data/test3.txt") bfc.add("test3_asis", os.getcwd() + "/tests/data/test3.txt", action="asis") rec3 = bfc.get("test3_asis") assert rec3 is not None - assert rec3.rpath == os.getcwd() + "/tests/data/test3.txt" + assert 
rec3["rpath"] == os.getcwd() + "/tests/data/test3.txt" - frec3 = open(rec3.rpath, "r").read().strip() + frec3 = open(rec3["rpath"], "r").read().strip() assert frec3 == "test2" rtrip = bfc.list_resources() @@ -55,8 +55,8 @@ def test_add_get_list_operations(): downurl = "https://bioconductor.org/packages/stats/bioc/BiocFileCache/BiocFileCache_2024_stats.tab" add_url = bfc.add(rname="download_link", fpath=downurl, rtype="web") - row = bfc.get(rid=add_url.rid) - assert row.fpath == downurl + row = bfc.get(rid=add_url["rid"]) + assert row["fpath"] == downurl rtrip = bfc.list_resources() assert len(rtrip) == 4 @@ -99,10 +99,10 @@ def test_meta_operations(): add_url = bfc.add(rname="download_link", fpath=downurl, rtype="web") rec = bfc.get_metadata("schema_version") - assert rec.value == "0.99.4" + assert rec["value"] == "0.99.4" rec = bfc.get_metadata("language") - assert rec.value == "python" + assert rec["value"] == "python" rtrip = bfc.list_resources() assert len(rtrip) == 2