Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
6be7217
Update photodownloader.py
Tadusko Oct 31, 2024
d5d840e
Added tags and license to the schema
Tadusko Oct 31, 2024
d4725d5
Also queries tags and license
Tadusko Oct 31, 2024
130064e
Use psycopg2-binary to not build from source
Tadusko Nov 1, 2024
31f4ab9
Catch KeyError when 'photos' is missing from results dict
Tadusko Nov 3, 2024
1802b9d
Check that response isn't empty.
Tadusko Nov 4, 2024
e7e055e
Rest of the additions.
Tadusko Mar 31, 2025
0b32215
Return the number of workers to cpu_count + 1
Tadusko Mar 31, 2025
73b19e9
linted
christophfink Mar 31, 2025
6f3a6ae
Merge remote-tracking branch 'upstream/main'
christophfink Apr 14, 2025
f332d7e
pydocstyle, changed local def
christophfink Apr 14, 2025
05f097d
move nul-cleaner to sqlalchemy validator
christophfink Apr 14, 2025
366051b
major refactoring, WIP
christophfink Apr 14, 2025
d9034cf
cleaning up a bit
christophfink Apr 14, 2025
b87321a
database schema updates (maybe incomplete?)
christophfink Apr 14, 2025
c7f0195
clean redundant licenses, lint throughout
christophfink Apr 14, 2025
1400c32
...
christophfink Apr 14, 2025
4ac47bd
progress, remaining issues:
christophfink Apr 15, 2025
0ac70a0
linted
christophfink Apr 15, 2025
4725b32
Merge remote-tracking branch 'upstream'
christophfink Apr 15, 2025
f46b5ff
typos
christophfink Apr 15, 2025
475261f
do not break out if key missing, but move on to next page
christophfink Apr 15, 2025
08ff0ea
avoid MAGIC NUMBERS
christophfink Apr 15, 2025
3b58815
cleaner string formatting
christophfink Apr 15, 2025
4fa31a0
Merge remote-tracking branch 'upstream/main'
christophfink Apr 15, 2025
7d3a594
refactoring
christophfink Apr 15, 2025
4f5cae7
api parameter
christophfink Apr 15, 2025
8d1a4b3
database concurrency issues
christophfink Apr 15, 2025
03d6fbd
Merge remote-tracking branch 'upstream/main'
christophfink Apr 15, 2025
3c9acbe
update photos that lack tags, accuracy, or license
christophfink Apr 15, 2025
20e5712
prepare for v0.3.0
christophfink Apr 15, 2025
f26ddfc
linted
christophfink Apr 15, 2025
4341592
v0.3.0.dev0
christophfink Apr 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
- **0.3.0** (2025-04-15):
- now collecting more data: tags, license, geo-accuracy
- re-downloading these data for existing records
- database schema versioning
- moved to PyPI Trusted Publisher auth
- migrated to pyproject.toml-only style

- **0.2.1** (2024-01-08):
- migrate to Digital Geography Lab’s GitHub

Expand Down
2 changes: 1 addition & 1 deletion src/flickrhistory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@
except ImportError:
pass

__version__ = "0.2.1"
__version__ = "0.3.0.dev0"
14 changes: 13 additions & 1 deletion src/flickrhistory/basicflickrhistorydownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
from .cache import Cache
from .cacheupdaterthread import CacheUpdaterThread
from .config import Config
from .licensedownloader import LicenseDownloader
from .photodownloaderthread import PhotoDownloaderThread
from .photoupdaterthread import PhotoUpdaterThread
from .sigtermreceivedexception import SigTermReceivedException
from .timespan import TimeSpan
from .userprofileupdaterthread import UserProfileUpdaterThread
Expand All @@ -30,7 +32,7 @@
class BasicFlickrHistoryDownloader:
"""Download (all) georeferenced flickr posts."""

NUM_WORKERS = multiprocessing.cpu_count() + 1 # 1 == user_profile_updater
NUM_WORKERS = multiprocessing.cpu_count()
NUM_MANAGERS = 2 # main thread + cache_updater

# if output into pipe (e.g. logger, systemd), then
Expand All @@ -54,6 +56,8 @@ def __init__(self):

def download(self):
"""Download all georeferenced flickr posts."""
LicenseDownloader(self._api_key_manager).update_licenses()

for gap in self.gaps_in_download_history:
self._todo_deque.append(gap)

Expand All @@ -74,6 +78,14 @@ def download(self):
worker.start()
self._worker_threads.append(worker)

# start photo record updaters
for i in range(self.NUM_WORKERS):
worker = PhotoUpdaterThread(
self._api_key_manager, (i + 1, self.NUM_WORKERS)
)
worker.start()
self._worker_threads.append(worker)

# start cache updater
self._cache_updater_thread = CacheUpdaterThread(self._done_queue)
self._cache_updater_thread.start()
Expand Down
5 changes: 4 additions & 1 deletion src/flickrhistory/cacheupdaterthread.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ def run(self):
try:
newly_downloaded = self._done_queue.get(timeout=0.1)
with Cache() as cache:
cache["already downloaded"] += newly_downloaded
try:
cache["already downloaded"] += newly_downloaded
except KeyError:
cache["already downloaded"] = newly_downloaded
self.status = f"added {newly_downloaded}"
except queue.Empty:
if self.shutdown.is_set():
Expand Down
19 changes: 19 additions & 0 deletions src/flickrhistory/database/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env python3


"""Database-related classes."""


__all__ = [
"License",
"Photo",
"PhotoSaver",
"Session",
"User",
"UserSaver",
]

from .engine import Session
from .models import License, Photo, User
from .photo_saver import PhotoSaver
from .user_saver import UserSaver
118 changes: 118 additions & 0 deletions src/flickrhistory/database/databaseschemaupdater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""Update the database schema if necessary."""


__all__ = ["DatabaseSchemaUpdater"]


import sys

import sqlalchemy

from .engine import engine


# For now, schema updates are SQL only and work on PostgreSQL, only.
# GeoAlchemy2 doesn’t really support SQLite, anyway.
#
# Each key is the schema version that its SQL script upgrades the database
# *to*, starting from the version directly below it.
SCHEMA_UPDATES = {
    # 0 -> 1
    #
    # `licenses.id` has to be a PRIMARY KEY: PostgreSQL only accepts a
    # foreign key (`photos.license`, below) that points at columns covered by
    # a primary key or unique constraint
    1: """
        ALTER TABLE
            photos
        ADD COLUMN IF NOT EXISTS
            geo_accuracy SMALLINT;

        CREATE TABLE IF NOT EXISTS
            licenses (
                id INTEGER PRIMARY KEY,
                name TEXT,
                url TEXT
            );

        ALTER TABLE
            photos
        ADD COLUMN IF NOT EXISTS
            license INTEGER REFERENCES licenses(id);
    """,
}


class DatabaseSchemaUpdater:
    """Update the database schema if necessary.

    The installed schema version is tracked in the table `schema_versions`;
    `SCHEMA_UPDATES` maps each version onto the SQL script that upgrades the
    database to that version.
    """

    LATEST = "LATEST"  # ‘magic’, see def set_schema_version

    def __init__(self):
        """Create the table that tracks schema versions, unless it exists."""
        with engine.begin() as connection:
            connection.execute(
                sqlalchemy.text(
                    """
                    CREATE TABLE IF NOT EXISTS
                        schema_versions
                        (
                            update TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
                            version INTEGER PRIMARY KEY
                        );
                    """
                )
            )

    @property
    def installed_version(self):
        """Return the highest recorded schema version, 0 if none is recorded."""
        with engine.connect() as connection:
            installed_version = connection.execute(
                sqlalchemy.text(
                    """
                    SELECT
                        COALESCE(
                            MAX(version),
                            0
                        ) AS version
                    FROM
                        schema_versions;
                    """
                )
            ).scalar_one_or_none()
        return installed_version

    def update_to_latest(self):
        """Update to the latest schema version, one version at a time.

        Every update script is executed in the same transaction that records
        its version number: either both are committed, or neither is. (The
        scripts target PostgreSQL, which runs DDL transactionally.)

        Raises
        ------
        KeyError
            if `SCHEMA_UPDATES` has no script for an intermediate version
        """
        latest_version = max(SCHEMA_UPDATES.keys())
        installed_version = self.installed_version
        while installed_version < latest_version:
            next_version = installed_version + 1
            print(
                "Updating database schema "
                f"(db version {installed_version:d}->{next_version:d})",
                file=sys.stderr,
                flush=True,  # so that we don’t seem without work
            )
            with engine.begin() as connection:
                connection.execute(sqlalchemy.text(SCHEMA_UPDATES[next_version]))
                # same connection, same transaction: the version is only
                # recorded if the script above is committed, and vice versa
                self._record_schema_version(connection, next_version)
            # read back what was actually committed
            installed_version = self.installed_version

    @staticmethod
    def _record_schema_version(connection, version):
        """Insert `version` into `schema_versions` using `connection`."""
        connection.execute(
            sqlalchemy.text(
                """
                INSERT INTO
                    schema_versions (version)
                VALUES (
                    :version
                );
                """
            ),
            {"version": version},
        )

    @classmethod
    def set_schema_version(cls, version):
        """Set the schema version (without running update scripts).

        Arguments
        ---------
        version : int | str
            the version number to record, or `DatabaseSchemaUpdater.LATEST`
            to record the highest version known to `SCHEMA_UPDATES`
        """
        if version == cls.LATEST:
            version = max(SCHEMA_UPDATES.keys())
        with engine.begin() as connection:
            cls._record_schema_version(connection, version)
42 changes: 42 additions & 0 deletions src/flickrhistory/database/engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""An SQLAlchemy engine and sessionmaker."""


__all__ = ["engine", "Session"]


import multiprocessing

import sqlalchemy
import sqlalchemy.orm

from ..config import Config


# Upper bound for pooled connections; the same number is allowed once more
# as temporary overflow connections (see `max_overflow` below)
POOL_SIZE = multiprocessing.cpu_count() * 10


# The connection string is read from the package configuration
with Config() as config:
    engine = sqlalchemy.create_engine(
        config["database_connection_string"],
        pool_size=POOL_SIZE,
        max_overflow=POOL_SIZE,
    )


if engine.dialect.name == "postgresql":
    # engine.begin() commits when the block exits without an error. With a
    # plain engine.connect(), SQLAlchemy 2.0 rolls back everything that was
    # not committed explicitly, and the extension would never be installed.
    with engine.begin() as connection:
        connection.execute(
            sqlalchemy.text(
                """
                CREATE EXTENSION IF NOT EXISTS
                    postgis;
                """
            )
        )


# Session factory bound to the shared engine; sessions do not autoflush
Session = sqlalchemy.orm.sessionmaker(engine, autoflush=False)
31 changes: 31 additions & 0 deletions src/flickrhistory/database/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""ORM models for flickr entities."""


__all__ = [
"License",
"Photo",
"User",
"Tag",
]


import sqlalchemy

from ..engine import engine
from .base import Base
from ..databaseschemaupdater import DatabaseSchemaUpdater
from .license import License
from .photo import Photo
from .tag import Tag
from .user import User


if sqlalchemy.inspect(engine).has_table(Photo.__table__.name):  # data exist
    # Existing database: first bring the existing tables up-to-date using
    # the versioned SQL update scripts ...
    DatabaseSchemaUpdater().update_to_latest()
    # ... then create the tables that the update scripts do not cover
    # (presumably e.g. the one backing `Tag` - NOTE(review): confirm against
    # the model definitions). create_all() checks for existence first and
    # neither alters nor drops tables that are already there.
    Base.metadata.create_all(engine)
else:
    # Fresh database: the models already describe the latest schema, so
    # create everything from them and record that no update is outstanding
    Base.metadata.create_all(engine)
    DatabaseSchemaUpdater().set_schema_version(DatabaseSchemaUpdater.LATEST)
54 changes: 54 additions & 0 deletions src/flickrhistory/database/models/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""A common sqlalchemy declarative_base() to share between models."""


__all__ = ["Base"]


import json
import re

import sqlalchemy.ext.declarative
import sqlalchemy.orm


# Matches every upper-case letter in front of which an underscore belongs:
CAMEL_CASE_TO_SNAKE_CASE_RE = re.compile(
    "("
    # either a capital that directly follows a lower-case letter or a digit,
    "(?<=[a-z0-9])[A-Z]"
    "|"
    # or a capital that is followed by a lower-case letter, unless it is the
    # first character or already preceded by an underscore
    "(?!^)(?<!_)[A-Z](?=[a-z])"
    ")"
)


def camel_case_to_snake_case(camel_case):
    """Return `camel_case` converted to `snake_case`.

    An underscore is inserted in front of every capital letter that starts a
    new word, then the entire string is lower-cased.
    """
    with_underscores = CAMEL_CASE_TO_SNAKE_CASE_RE.sub(
        lambda match: f"_{match.group(1)}",
        camel_case,
    )
    return with_underscores.lower()


class Base:
    """Template for sqlalchemy declarative_base() to add shared functionality."""

    def __str__(self):
        """Return a str representation, listing the primary key value(s)."""
        primary_keys = {}
        for pk in self.__mapper__.primary_key:
            try:
                primary_keys[pk.name] = getattr(self, pk.name)
            except AttributeError:  # (not yet set)
                pass
        return f"<{self.__class__.__name__}({json.dumps(primary_keys)})>"

    @sqlalchemy.orm.declared_attr
    def __tablename__(cls):
        """Return a table name derived from the class name.

        The class name is converted to snake_case and an ‘s’ is appended,
        e.g., `License` is stored in the table `licenses`.
        """
        snake_case = camel_case_to_snake_case(cls.__name__)
        return f"{snake_case}s"

    def update(self, **kwargs):
        """Update the values of this ORM object from keyword arguments.

        Every keyword is set as an attribute of the same name.
        """
        for key, value in kwargs.items():
            setattr(self, key, value)


# Replace the plain template class with a declarative base derived from it.
# `declarative_base` lives in `sqlalchemy.orm` since SQLAlchemy 1.4 (which
# this module needs for `sqlalchemy.orm.declared_attr`, anyway); its previous
# home, `sqlalchemy.ext.declarative`, is deprecated.
Base = sqlalchemy.orm.declarative_base(cls=Base)
23 changes: 23 additions & 0 deletions src/flickrhistory/database/models/license.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""ORM class to represent a flickr license."""


__all__ = ["License"]


import sqlalchemy
import sqlalchemy.orm

from .base import Base


class License(Base):
    """ORM class to represent a flickr license.

    Does not set a table name of its own, that is left to `Base`.
    """

    # integer primary key
    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    # free-text name of the license
    name = sqlalchemy.Column(sqlalchemy.Text)
    # free-text URL of the license
    url = sqlalchemy.Column(sqlalchemy.Text)
    # relationship to the `Photo` model, referred to by name
    # NOTE(review): `Photo` is defined in another module and presumably
    # carries the foreign key to this table - confirm
    photos = sqlalchemy.orm.relationship("Photo")
Loading