From d3c14681cca6287b46705272c8d77c107c01706e Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 19 Dec 2024 16:25:11 -0800 Subject: [PATCH 1/6] Revert addition of new fields and some minor updates --- .gitignore | 3 ++ pyproject.toml | 4 ++ src/pybiocfilecache/cache.py | 98 +++++++++++++++-------------------- src/pybiocfilecache/config.py | 9 ---- src/pybiocfilecache/models.py | 20 +++---- tests/test_cache.py | 2 +- 6 files changed, 55 insertions(+), 81 deletions(-) diff --git a/.gitignore b/.gitignore index 19d4b9b..35e1381 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,6 @@ MANIFEST # Per-project virtualenvs .venv*/ .conda*/ + +# remove cache +cache \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a7cea75..00aa968 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,10 @@ extend-ignore = ["F821"] [tool.ruff.pydocstyle] convention = "google" +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 20 + [tool.ruff.per-file-ignores] "__init__.py" = ["E402", "F401"] diff --git a/src/pybiocfilecache/cache.py b/src/pybiocfilecache/cache.py index 6179e80..9642123 100644 --- a/src/pybiocfilecache/cache.py +++ b/src/pybiocfilecache/cache.py @@ -6,14 +6,14 @@ from time import sleep, time from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union -from sqlalchemy import create_engine, func +from sqlalchemy import create_engine, func, text from sqlalchemy.orm import Session, sessionmaker from sqlalchemy.pool import QueuePool from .config import CacheConfig +from .const import SCHEMA_VERSION from .exceptions import ( BiocCacheError, - CacheSizeLimitError, InvalidRnameError, NoFpathError, RnameExistsError, @@ -25,7 +25,6 @@ copy_or_move, create_tmp_dir, generate_id, - get_file_size, validate_rname, ) @@ -42,9 +41,7 @@ class BiocFileCache: Features: - Resource validation and integrity checking - Cache size management - - File compression - - Resource tagging - - Automatic cleanup of expired resources + - Cleanup of expired resources """ def __init__(self, cache_dir: Optional[Union[str, Path]] = None, config: Optional[CacheConfig] = None): @@ -65,15 +62,22 @@ def __init__(self, cache_dir: Optional[Union[str, Path]] = None, config: Optiona config = CacheConfig(cache_dir=cache_dir) self.config = config - self._setup_cache_dir() - self._setup_database() + new_db = self._setup_cache_dir() + db_schema_version = self._setup_database(new_db) + + if db_schema_version != SCHEMA_VERSION: + raise RuntimeError(f"Database version is not {SCHEMA_VERSION}.") + self._last_cleanup = datetime.now() - def _setup_cache_dir(self) -> None: + def _setup_cache_dir(self) -> bool: if not self.config.cache_dir.exists(): self.config.cache_dir.mkdir(parents=True, exist_ok=True) + return True - def _setup_database(self) -> None: + return False + + def _setup_database(self, new_database_directory: bool = True) -> None: db_path = self.config.cache_dir / "BiocFileCache.sqlite" self.engine = create_engine( f"sqlite:///{db_path}", @@ -87,6 +91,29 @@ def _setup_database(self) -> None: Base.metadata.create_all(self.engine) self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine) + if new_database_directory is True: + with self.engine.connect() as conn: + result = conn.execute( + text(""" + SELECT value FROM metadata + WHERE key = 'schema_version' + """) + ) + row = result.fetchone() + + if row is not None: + return row[0] + + conn.execute( + text(""" + INSERT INTO metadata (key, value) + VALUES ('schema_version', :version); + """), + {"version": SCHEMA_VERSION}, + ) + + return SCHEMA_VERSION + def _get_detached_resource(self, session: Session, resource: Resource) -> Optional[Resource]: """Get a detached copy of a resource.""" if resource is None: @@ -118,17 +145,6 @@ def get_session(self) -> Iterator[Session]: finally: session.close() - def _check_cache_size(self, new_size: int) -> None: - """Verify cache size limit won't be exceeded.""" - if self.config.max_size_bytes is None: - return - - current_size = self.get_cache_size() - if current_size + new_size > self.config.max_size_bytes: - raise CacheSizeLimitError( - f"Adding {new_size} bytes would exceed cache limit of " f"{self.config.max_size_bytes} bytes" - ) - def _validate_rname(self, rname: str) -> None: """Validate resource name format.""" if not validate_rname(rname, self.config.rname_pattern): @@ -218,7 +234,6 @@ def add( fpath: Union[str, Path], rtype: Literal["local", "web", "relative"] = "local", action: Literal["copy", "move", "asis"] = "copy", - tags: Optional[List[str]] = None, expires: Optional[datetime] = None, ext: bool = False, ) -> Resource: @@ -240,9 +255,6 @@ def add( How to handle the file ("copy", "move", or "asis"). Defaults to ``copy``. - tags: - Optional list of tags for categorization. - expires: Optional expiration datetime. If None, resource never expires. @@ -267,8 +279,6 @@ def add( rid = generate_id() rpath = self.config.cache_dir / f"{rid}{fpath.suffix if ext else ''}" if action != "asis" else fpath - self._check_cache_size(get_file_size(fpath)) - # Create resource record resource = Resource( rid=rid, @@ -276,9 +286,7 @@ def add( rpath=str(rpath), rtype=rtype, fpath=str(fpath), - tags=",".join(tags) if tags else None, expires=expires, - size_bytes=get_file_size(fpath), ) # Store file and update database @@ -287,7 +295,7 @@ def add( session.commit() try: - copy_or_move(fpath, rpath, rname, action, self.config.compression) + copy_or_move(fpath, rpath, rname, action, False) # Calculate and store checksum resource.etag = calculate_file_hash(rpath, self.config.hash_algorithm) @@ -324,7 +332,6 @@ def update( rname: str, fpath: Union[str, Path], action: Literal["copy", "move", "asis"] = "copy", - tags: Optional[List[str]] = None, ) -> Resource: """Update an existing resource. @@ -339,9 +346,6 @@ def update( Either ``copy``, ``move`` or ``asis``. Defaults to ``copy``. - tags: - Optional new list of tags. - Returns: Updated `Resource` object. @@ -358,14 +362,10 @@ def update( old_path = Path(resource.rpath) try: - copy_or_move(fpath, old_path, rname, action, self.config.compression) + copy_or_move(fpath, old_path, rname, action, False) resource.last_modified_time = datetime.now() resource.etag = calculate_file_hash(old_path, self.config.hash_algorithm) - resource.size_bytes = get_file_size(old_path) - - if tags is not None: - resource.tags = ",".join(tags) session.commit() return self._get_detached_resource(session, resource) @@ -404,14 +404,10 @@ def remove(self, rname: str) -> None: session.rollback() raise BiocCacheError(f"Failed to remove resource '{rname}'") from e - def list_resources( - self, tag: Optional[str] = None, rtype: Optional[str] = None, expired: Optional[bool] = None - ) -> List[Resource]: + def list_resources(self, rtype: Optional[str] = None, expired: Optional[bool] = None) -> List[Resource]: """List resources in the cache with optional filtering. Args: - tag: - Filter resources by tag. rtype: Filter resources by type. @@ -429,8 +425,6 @@ def list_resources( with self.get_session() as session: query = session.query(Resource) - if tag: - query = query.filter(Resource.tags.like(f"%{tag}%")) if rtype: query = query.filter(Resource.rtype == rtype) if expired is not None: @@ -468,11 +462,6 @@ def validate_resource(self, resource: Resource) -> bool: logger.error(f"Failed to validate resource: {resource.rname}", exc_info=e) return False - def get_cache_size(self) -> int: - """Get total size of cached files in bytes.""" - with self.get_session() as session: - return session.query(func.sum(Resource.size_bytes)).scalar() or 0 - def export_metadata(self, path: Path) -> None: """Export cache metadata to JSON file.""" data = { @@ -480,14 +469,11 @@ def export_metadata(self, path: Path) -> None: { "rname": r.rname, "rtype": r.rtype, - "tags": r.tags, "expires": r.expires.isoformat() if r.expires else None, "etag": r.etag, - "size_bytes": r.size_bytes, } for r in self.list_resources() ], - "cache_size": self.get_cache_size(), "export_time": datetime.now().isoformat(), } @@ -503,7 +489,6 @@ def import_metadata(self, path: Path) -> None: for resource_data in data["resources"]: resource = self._get(session, resource_data["rname"]) if resource: - resource.tags = resource_data["tags"] resource.expires = ( datetime.fromisoformat(resource_data["expires"]) if resource_data["expires"] else None ) @@ -532,7 +517,7 @@ def search(self, query: str, field: str = "rname", exact: bool = False) -> List[ Search string. field: - Resource field to search ("rname", "rtype", "tags", etc.). + Resource field to search ("rname", "rtype", etc.). exact: Whether to require exact match. @@ -565,7 +550,6 @@ def get_stats(self) -> Dict[str, Any]: return { "total_resources": total, "expired_resources": expired, - "cache_size_bytes": self.get_cache_size(), "resource_types": types, "last_cleanup": self._last_cleanup.isoformat(), "cleanup_enabled": self.config.cleanup_interval is not None, diff --git a/src/pybiocfilecache/config.py b/src/pybiocfilecache/config.py index 9f120ad..e2dc88e 100644 --- a/src/pybiocfilecache/config.py +++ b/src/pybiocfilecache/config.py @@ -16,10 +16,6 @@ class CacheConfig: cache_dir: Directory to store cached files. - max_size_bytes: - Maximum total size of cache. - None for unlimited. - cleanup_interval: How often to run expired resource cleanup. None for no cleanup. @@ -29,14 +25,9 @@ class CacheConfig: hash_algorithm: Algorithm to use for file checksums. - - compression: - Whether to compress cached files. """ cache_dir: Path - max_size_bytes: Optional[int] = None cleanup_interval: Optional[timedelta] = None # timedelta(days=30) rname_pattern: str = r"^[a-zA-Z0-9_-]+$" hash_algorithm: str = "md5" - compression: bool = False diff --git a/src/pybiocfilecache/models.py b/src/pybiocfilecache/models.py index d944575..8040b99 100644 --- a/src/pybiocfilecache/models.py +++ b/src/pybiocfilecache/models.py @@ -13,7 +13,7 @@ class Metadata(Base): __tablename__ = "metadata" - key = Column(Text(), primary_key=True, index=True) + key = Column(Text(), primary_key=True, index=True, unique=True) value = Column(Text()) def __repr__(self) -> str: @@ -56,29 +56,21 @@ class Resource(Base): expires: When the resource should be considered expired. - - tags: - Optional comma-separated tags for categorization. - - size_bytes: - Size of the resource in bytes. """ __tablename__ = "resource" id = Column(Integer, primary_key=True, index=True, autoincrement=True) - rid = Column(Text(), index=True) - rname = Column(Text(), index=True, unique=True) + rid = Column(Text()) + rname = Column(Text()) create_time = Column(DateTime, server_default=func.now()) access_time = Column(DateTime, server_default=func.now()) rpath = Column(Text()) rtype = Column(Text()) fpath = Column(Text()) - last_modified_time = Column(DateTime, onupdate=func.now()) - etag = Column(Text()) - expires = Column(DateTime) - tags = Column(Text()) - size_bytes = Column(Integer) + last_modified_time = Column(DateTime, onupdate=func.now(), default=None) + etag = Column(Text(), default=None) + expires = Column(DateTime, default=None) def __repr__(self) -> str: return f"" diff --git a/tests/test_cache.py b/tests/test_cache.py index 8ce25ce..23924d2 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -17,7 +17,7 @@ def test_create_cache(): bfc.purge() -def test_add_get_operations(): +def test_add_get_list_operations(): bfc = BiocFileCache(CACHE_DIR) rtrip = bfc.add("test1", os.getcwd() + "/tests/data/test1.txt") From 5d2e96713706f1c6b0185d84ec065fdd133888db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 00:25:47 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 35e1381..d2d26c6 100644 --- a/.gitignore +++ b/.gitignore @@ -53,4 +53,4 @@ MANIFEST .conda*/ # remove cache -cache \ No newline at end of file +cache From 395c6f7270f3b570dd24451b40039d39e40b479e Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 19 Dec 2024 16:29:29 -0800 Subject: [PATCH 3/6] fix tests and add 3.13 to tests --- .github/workflows/pypi-test.yml | 2 +- src/pybiocfilecache/cache.py | 47 +++++++++++++++------------------ 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml index 9dc019a..5c43c4c 100644 --- a/.github/workflows/pypi-test.yml +++ b/.github/workflows/pypi-test.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ] + python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12', '3.13' ] name: Python ${{ matrix.python-version }} steps: diff --git a/src/pybiocfilecache/cache.py b/src/pybiocfilecache/cache.py index 9642123..0f7a806 100644 --- a/src/pybiocfilecache/cache.py +++ b/src/pybiocfilecache/cache.py @@ -62,10 +62,11 @@ def __init__(self, cache_dir: Optional[Union[str, Path]] = None, config: Optiona config = CacheConfig(cache_dir=cache_dir) self.config = config - new_db = self._setup_cache_dir() - db_schema_version = self._setup_database(new_db) + self._setup_cache_dir() + db_schema_version = self._setup_database() if db_schema_version != SCHEMA_VERSION: + print(db_schema_version) raise RuntimeError(f"Database version is not {SCHEMA_VERSION}.") self._last_cleanup = datetime.now() @@ -73,11 +74,8 @@ def __init__(self, cache_dir: Optional[Union[str, Path]] = None, config: Optiona def _setup_cache_dir(self) -> bool: if not self.config.cache_dir.exists(): self.config.cache_dir.mkdir(parents=True, exist_ok=True) - return True - - return False - def _setup_database(self, new_database_directory: bool = True) -> None: + def _setup_database(self) -> None: db_path = self.config.cache_dir / "BiocFileCache.sqlite" self.engine = create_engine( f"sqlite:///{db_path}", @@ -91,28 +89,27 @@ def _setup_database(self, new_database_directory: bool = True) -> None: Base.metadata.create_all(self.engine) self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine) - if new_database_directory is True: - with self.engine.connect() as conn: - result = conn.execute( - text(""" - SELECT value FROM metadata - WHERE key = 'schema_version' - """) - ) - row = result.fetchone() + with self.engine.connect() as conn: + result = conn.execute( + text(""" + SELECT value FROM metadata + WHERE key = 'schema_version' + """) + ) + row = result.fetchone() - if row is not None: - return row[0] + if row is not None: + return row[0] - conn.execute( - text(""" - INSERT INTO metadata (key, value) - VALUES ('schema_version', :version); - """), - {"version": SCHEMA_VERSION}, - ) + conn.execute( + text(""" + INSERT INTO metadata (key, value) + VALUES ('schema_version', :version); + """), + {"version": SCHEMA_VERSION}, + ) - return SCHEMA_VERSION + return SCHEMA_VERSION def _get_detached_resource(self, session: Session, resource: Resource) -> Optional[Resource]: """Get a detached copy of a resource.""" From 6cbd5f3a3377a4e3c711a93bec082723dd02559c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 00:52:18 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/pypi-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml index 4e9342f..90aa16a 100644 --- a/.github/workflows/pypi-test.yml +++ b/.github/workflows/pypi-test.yml @@ -30,4 +30,4 @@ jobs: - name: Test with tox run: | - tox \ No newline at end of file + tox From 5dd67cfde69afcb24242aedd7ead58d992f8c068 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 19 Dec 2024 16:54:13 -0800 Subject: [PATCH 5/6] update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f9083c2..e342114 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.6.0 + +- Reverting schema changes that break compatibility with the R/BiocFileCache implementation. +- Added support for Python 3.13 + ## Version 0.5.5 - chore: Remove Python 3.8 (EOL). From fb9c28f731d29e8c0ed0eb0a5195b8300f85c49b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 00:54:41 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e342114..e01ca64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## Version 0.6.0 -- Reverting schema changes that break compatibility with the R/BiocFileCache implementation. +- Reverting schema changes that break compatibility with the R/BiocFileCache implementation. - Added support for Python 3.13 ## Version 0.5.5