Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## Version 0.7.0

- Fixed issues that occurred when the R and Python interfaces were used with the same cache directory.
- `list_resources` now returns a `BiocFrame` object.

## Version 0.6.1 - 0.6.2

- Generate rids that match R's cache.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ resource = cache.add("myfile", "path/to/file.txt")
resource = cache.get("myfile")

# Use the cached file
print(resource.rpath) # Path to cached file
print(resource["rpath"]) # Path to cached file
```

## Advanced Usage
Expand Down
5 changes: 3 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ author_email = jayaram.kancherla@gmail.com
license = MIT
long_description = file: README.md
long_description_content_type = text/markdown; charset=UTF-8; variant=GFM
url = https://github.com/epiviz/pyBiocFileCache
url = https://github.com/biocpy/pyBiocFileCache
# Add here related links, for example:
project_urls =
Documentation = https://pyscaffold.org/
Documentation = https://github.com/biocpy/pyBiocFileCache
# Source = https://github.com/pyscaffold/pyscaffold/
# Changelog = https://pyscaffold.org/en/latest/changelog.html
# Tracker = https://github.com/pyscaffold/pyscaffold/issues
Expand Down Expand Up @@ -48,6 +48,7 @@ python_requires = >=3.9
install_requires =
importlib-metadata; python_version<"3.8"
sqlalchemy
biocframe

[options.packages.find]
where = src
Expand Down
54 changes: 34 additions & 20 deletions src/pybiocfilecache/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from time import sleep, time
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union

from biocframe import BiocFrame
from sqlalchemy import create_engine, func, text
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.pool import QueuePool
Expand All @@ -14,6 +15,7 @@
from .models import Base, Metadata, Resource
from .utils import (
calculate_file_hash,
convert_to_columnar,
copy_or_move,
create_tmp_dir,
download_web_file,
Expand Down Expand Up @@ -104,15 +106,19 @@ def _setup_database(self) -> None:

return SCHEMA_VERSION

def _get_detached_resource(
self, session: Session, obj: Union[Resource, Metadata]
) -> Optional[Union[Resource, Metadata]]:
def _get_detached_resource(self, session: Session, obj: Union[Resource, Metadata]) -> Optional[dict]:
"""Get a detached copy of a resource."""
if obj is None:
return None
session.refresh(obj)
session.expunge(obj)
return obj
obj_dict = obj.to_dict()

if isinstance(obj, Resource):
if obj_dict["rtype"] == "relative":
obj_dict["rpath"] = f"{self.config.cache_dir}/{obj_dict['rpath']}"

return obj_dict

def __enter__(self) -> "BiocFileCache":
return self
Expand All @@ -137,6 +143,10 @@ def get_session(self) -> Iterator[Session]:
finally:
session.close()

#########################
######>> cleanup <<######
#########################

# def _validate_rname(self, rname: str) -> None:
# """Validate resource name format."""
# if not validate_rname(rname, self.config.rname_pattern):
Expand Down Expand Up @@ -191,7 +201,11 @@ def cleanup(self) -> int:
self._last_cleanup = datetime.now()
return removed

def get(self, rname: str = None, rid: str = None) -> Optional[Resource]:
###############################
######>> get resources <<######
###############################

def get(self, rname: str = None, rid: str = None) -> Optional[dict]:
"""Get resource by name from cache.

Args:
Expand All @@ -215,7 +229,7 @@ def get(self, rname: str = None, rid: str = None) -> Optional[Resource]:
# Check if path exists with timeout
start = time()
timeout = 30
while not Path(str(resource.rpath)).exists():
while not Path(str(self.config.cache_dir / resource.rpath)).exists():
if time() - start >= timeout:
raise TimeoutError(f"For resource: '{rname}' the rpath does not exist after {timeout} seconds.")
sleep(0.1)
Expand All @@ -236,7 +250,7 @@ def add(
expires: Optional[datetime] = None,
download: bool = True,
ext: bool = True,
) -> Resource:
) -> dict:
"""Add a resource to the cache.

Args:
Expand Down Expand Up @@ -268,7 +282,7 @@ def add(
Defaults to `True`.

Returns:
The `Resource` object added to the cache.
The `Resource` object added to the cache as dictionary.
"""
# self._validate_rname(rname)
fpath = Path(fpath) if rtype != "web" else fpath
Expand All @@ -289,7 +303,7 @@ def add(
# Generate paths and check size
rid = generate_id(size=len(self))
uuid = generate_uuid()
rpath = self.config.cache_dir / f"{uuid}_{outpath.name if ext else outpath.stem}" if action != "asis" else fpath
rpath = f"{uuid}_{outpath.name if ext else outpath.stem}" if action != "asis" else fpath

# Create resource record
resource = Resource(
Expand All @@ -307,10 +321,10 @@ def add(
session.commit()

try:
copy_or_move(outpath, rpath, rname, action, False)
copy_or_move(outpath, self.config.cache_dir / rpath, rname, action, False)

# Calculate and store checksum
resource.etag = calculate_file_hash(rpath, self.config.hash_algorithm)
resource.etag = calculate_file_hash(self.config.cache_dir / rpath, self.config.hash_algorithm)
session.commit()
result = self._get_detached_resource(session, resource)
return result
Expand All @@ -320,7 +334,7 @@ def add(
session.commit()
raise Exception("Failed to add resource") from e

def add_batch(self, resources: List[Dict[str, Any]]) -> List[Resource]:
def add_batch(self, resources: List[Dict[str, Any]]) -> BiocFrame:
"""Add multiple resources in a single transaction.

Args:
Expand All @@ -344,7 +358,7 @@ def update(
rname: str,
fpath: Union[str, Path],
action: Literal["copy", "move", "asis"] = "copy",
) -> Resource:
) -> dict:
"""Update an existing resource.

Args:
Expand All @@ -359,7 +373,7 @@ def update(
Defaults to ``copy``.

Returns:
Updated `Resource` object.
Updated `Resource` object as dictionary.

"""
fpath = Path(fpath)
Expand Down Expand Up @@ -416,7 +430,7 @@ def remove(self, rname: str) -> None:
session.rollback()
raise Exception(f"Failed to remove resource '{rname}'") from e

def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = None) -> List[Resource]:
def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = None) -> BiocFrame:
"""List resources in the cache with optional filtering.

Args:
Expand All @@ -432,7 +446,7 @@ def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] =
Note: Resources with no expiration are always considered non-expired.

Returns:
List of Resource objects matching the filters
List of Resource objects matching the filters.
"""
with self.get_session() as session:
query = session.query(Resource)
Expand All @@ -452,7 +466,7 @@ def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] =
)

resources = query.all()
return [self._get_detached_resource(session, r) for r in resources]
return BiocFrame(convert_to_columnar([self._get_detached_resource(session, r) for r in resources]))

def validate_resource(self, resource: Resource) -> bool:
"""Validate resource integrity.
Expand Down Expand Up @@ -521,7 +535,7 @@ def verify_cache(self) -> Tuple[int, int]:
invalid += 1
return valid, invalid

def search(self, query: str, field: str = "rname", exact: bool = False) -> List[Resource]:
def search(self, query: str, field: str = "rname", exact: bool = False) -> BiocFrame:
"""Search for resources by field value.

Args:
Expand All @@ -543,7 +557,7 @@ def search(self, query: str, field: str = "rname", exact: bool = False) -> List[
else:
resources = session.query(Resource).filter(Resource[field].ilike(f"%{query}%")).all()

return [self._get_detached_resource(session, r) for r in resources]
return BiocFrame(convert_to_columnar([self._get_detached_resource(session, r) for r in resources]))

def get_stats(self) -> Dict[str, Any]:
"""Get statistics about the cache."""
Expand Down Expand Up @@ -669,7 +683,7 @@ def add_metadata(self, key: str, value: str):
except Exception as e:
session.delete(meta)
session.commit()
raise Exception("Failed to add metadata") from e
raise Exception("Failed to add metadata", str(e)) from e
else:
raise Exception(f"'key'={key} already exists in metadata.")

Expand Down
18 changes: 18 additions & 0 deletions src/pybiocfilecache/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class Metadata(Base):
def __repr__(self) -> str:
return f"<Metadata(key='{self.key}', value='{self.value}')>"

def to_dict(self) -> dict:
    """Return this metadata entry as a plain dictionary.

    Returns:
        A dict with the entries ``"key"`` and ``"value"``, in that order,
        holding the values of the attributes of the same names.
    """
    field_names = ("key", "value")
    return {name: getattr(self, name) for name in field_names}


class Resource(Base):
"""Resource information stored in cache.
Expand Down Expand Up @@ -74,3 +77,18 @@ class Resource(Base):

def __repr__(self) -> str:
return f"<Resource(rid='{self.rid}', rname='{self.rname}', rpath='{self.rpath}')>"

def to_dict(self) -> dict:
    """Return this resource record as a plain dictionary.

    Returns:
        A dict mapping each exported attribute name to its current value.
        Keys appear in a fixed order: ``id``, ``rid``, ``rname``,
        ``create_time``, ``access_time``, ``rpath``, ``rtype``, ``fpath``,
        ``last_modified_time``, ``etag``, ``expires``.
    """
    exported = (
        "id",
        "rid",
        "rname",
        "create_time",
        "access_time",
        "rpath",
        "rtype",
        "fpath",
        "last_modified_time",
        "etag",
        "expires",
    )
    return {attr: getattr(self, attr) for attr in exported}
15 changes: 14 additions & 1 deletion src/pybiocfilecache/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import zlib
from pathlib import Path
from shutil import copy2, move
from typing import Literal
from typing import List, Literal

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
Expand Down Expand Up @@ -97,3 +97,16 @@ def download_web_file(url: str, filename: str, download: bool):
open(str(outpath), "a").close()

return outpath


def convert_to_columnar(list_of_dicts: List[dict]) -> dict:
    """Convert a list of row dictionaries into a dictionary of columns.

    Column names are the union of the keys of all rows, ordered by first
    appearance. Using only the first row's keys would silently drop any key
    that first shows up in a later row.

    Args:
        list_of_dicts:
            Rows to convert; each row maps column name to value.

    Returns:
        A dict mapping each column name to a list with one entry per row,
        in row order. Rows that lack a column contribute ``None`` for it,
        so every column has the same length. An empty (or ``None``) input
        yields an empty dict.
    """
    if not list_of_dicts:
        return {}

    # dict.fromkeys de-duplicates while preserving first-seen order.
    column_names = list(dict.fromkeys(key for row in list_of_dicts for key in row))

    return {col: [row.get(col) for row in list_of_dicts] for col in column_names}
16 changes: 8 additions & 8 deletions tests/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,19 @@ def test_add_get_list_operations():
rec2 = bfc.get("test2")
assert rec2 is not None

frec1 = open(rec1.rpath, "r").read().strip()
frec1 = open(rec1["rpath"], "r").read().strip()
assert frec1 == "test1"

frec2 = open(rec2.rpath, "r").read().strip()
frec2 = open(rec2["rpath"], "r").read().strip()
assert frec2 == "test2"

shutil.copy(os.getcwd() + "/tests/data/test2.txt", os.getcwd() + "/tests/data/test3.txt")
bfc.add("test3_asis", os.getcwd() + "/tests/data/test3.txt", action="asis")
rec3 = bfc.get("test3_asis")
assert rec3 is not None
assert rec3.rpath == os.getcwd() + "/tests/data/test3.txt"
assert rec3["rpath"] == os.getcwd() + "/tests/data/test3.txt"

frec3 = open(rec3.rpath, "r").read().strip()
frec3 = open(rec3["rpath"], "r").read().strip()
assert frec3 == "test2"

rtrip = bfc.list_resources()
Expand All @@ -55,8 +55,8 @@ def test_add_get_list_operations():
downurl = "https://bioconductor.org/packages/stats/bioc/BiocFileCache/BiocFileCache_2024_stats.tab"
add_url = bfc.add(rname="download_link", fpath=downurl, rtype="web")

row = bfc.get(rid=add_url.rid)
assert row.fpath == downurl
row = bfc.get(rid=add_url["rid"])
assert row["fpath"] == downurl

rtrip = bfc.list_resources()
assert len(rtrip) == 4
Expand Down Expand Up @@ -99,10 +99,10 @@ def test_meta_operations():
add_url = bfc.add(rname="download_link", fpath=downurl, rtype="web")

rec = bfc.get_metadata("schema_version")
assert rec.value == "0.99.4"
assert rec["value"] == "0.99.4"

rec = bfc.get_metadata("language")
assert rec.value == "python"
assert rec["value"] == "python"

rtrip = bfc.list_resources()
assert len(rtrip) == 2
Expand Down
Loading