From 6be721732be8d6f8a122748816942339efa58d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatu=20Lepp=C3=A4m=C3=A4ki?= <43028964+Tadusko@users.noreply.github.com> Date: Thu, 31 Oct 2024 23:29:31 +0100 Subject: [PATCH 01/29] Update photodownloader.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changed lower limit for raising downloadbatchtoolarge – the API seems to return somewhere around 3500 photos even if the batch is not complete. --- flickrhistory/photodownloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flickrhistory/photodownloader.py b/flickrhistory/photodownloader.py index 1b06241..cc9d664 100644 --- a/flickrhistory/photodownloader.py +++ b/flickrhistory/photodownloader.py @@ -86,12 +86,12 @@ def photos(self): except TypeError: num_photos = 0 - if num_photos > 4000 and self._timespan.duration > datetime.timedelta( + if num_photos > 3000 and self._timespan.duration > datetime.timedelta( seconds=1 ): raise DownloadBatchIsTooLargeError( ( - "More than 4000 rows returned ({:d}), " + "More than 3000 rows returned ({:d}), " + "please specify a shorter time span." ).format(num_photos) ) From d5d840ee19792f0ba6c10f65589d840d843e28a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatu=20Lepp=C3=A4m=C3=A4ki?= <43028964+Tadusko@users.noreply.github.com> Date: Fri, 1 Nov 2024 00:19:23 +0100 Subject: [PATCH 02/29] Added tags and license to the schema --- flickrhistory/databaseobjects.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/flickrhistory/databaseobjects.py b/flickrhistory/databaseobjects.py index 6b62c89..bbe3d6e 100644 --- a/flickrhistory/databaseobjects.py +++ b/flickrhistory/databaseobjects.py @@ -170,6 +170,10 @@ class FlickrPhoto(Base): user_id = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=False) user_farm = sqlalchemy.Column(sqlalchemy.SmallInteger, nullable=False) + # New fields for tags and license + tags = sqlalchemy.Column(sqlalchemy.Text) + license = sqlalchemy.Column(sqlalchemy.Integer) + user = sqlalchemy.orm.relationship("FlickrUser", back_populates="photos") __table_args__ = ( @@ -254,6 +258,16 @@ def from_raw_api_data_flickrphotossearch(cls, data): ): pass + try: + photo_data["tags"] = data["tags"] + except KeyError: + pass + + try: + photo_data["license"] = int(data["license"]) + except (ValueError, KeyError): + pass + # finally, the user # (let’s just delegate that to the FlickrUser constructor) photo_data["user"] = FlickrUser.from_raw_api_data_flickrphotossearch(data) From d4725d50381573e247ff7db4ca5d4f82d13a7111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatu=20Lepp=C3=A4m=C3=A4ki?= <43028964+Tadusko@users.noreply.github.com> Date: Fri, 1 Nov 2024 00:19:40 +0100 Subject: [PATCH 03/29] Also queries tags and license --- flickrhistory/photodownloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flickrhistory/photodownloader.py b/flickrhistory/photodownloader.py index cc9d664..2e8632f 100644 --- a/flickrhistory/photodownloader.py +++ b/flickrhistory/photodownloader.py @@ -52,7 +52,7 @@ def photos(self): "per_page": 500, "has_geo": 1, "extras": ", ".join( - ["description", "date_upload", "date_taken", "geo", "owner_name"] + ["description", "date_upload", "date_taken", "geo", "owner_name", "tags", "license"] ), "min_upload_date": self._timespan.start.timestamp(), "max_upload_date": self._timespan.end.timestamp(), From 130064e00cb391556f2a756105b3ffdaffba2d33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatu=20Lepp=C3=A4m=C3=A4ki?= <43028964+Tadusko@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:02:09 +0100 Subject: [PATCH 04/29] Use psycopg2-binary to not build from source --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 956ce80..caf241d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,7 +23,7 @@ install_requires = blessed GeoAlchemy2 PyYAML - psycopg2 + psycopg2-binary requests SQLAlchemy >= 1.4.0b1 urllib3 From 31f4ab9cbfc2bf38ff48a6cb672b4e1619d8804f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatu=20Lepp=C3=A4m=C3=A4ki?= <43028964+Tadusko@users.noreply.github.com> Date: Sun, 3 Nov 2024 12:39:28 +0100 Subject: [PATCH 05/29] Catch KeyError when 'photos' is missing from results dict --- flickrhistory/photodownloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flickrhistory/photodownloader.py b/flickrhistory/photodownloader.py index 2e8632f..928eb5a 100644 --- a/flickrhistory/photodownloader.py +++ b/flickrhistory/photodownloader.py @@ -83,7 +83,7 @@ def photos(self): try: num_photos = int(results["photos"]["total"]) - except TypeError: + except (TypeError, KeyError): num_photos = 0 if num_photos > 3000 and self._timespan.duration > datetime.timedelta( From 1802b9dc9b1d03b768517272e7a37bba1baf547d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatu=20Lepp=C3=A4m=C3=A4ki?= <43028964+Tadusko@users.noreply.github.com> Date: Mon, 4 Nov 2024 21:48:04 +0100 Subject: [PATCH 06/29] Check that response isn't empty. --- flickrhistory/photodownloader.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flickrhistory/photodownloader.py b/flickrhistory/photodownloader.py index 928eb5a..de42e38 100644 --- a/flickrhistory/photodownloader.py +++ b/flickrhistory/photodownloader.py @@ -81,9 +81,13 @@ def photos(self): # unsuccessful and start over raise ApiResponseError() from exception + # Check for 'photos' in results to avoid KeyError + if "photos" not in results or "photo" not in results["photos"]: + break + try: num_photos = int(results["photos"]["total"]) - except (TypeError, KeyError): + except TypeError: num_photos = 0 if num_photos > 3000 and self._timespan.duration > datetime.timedelta( From e7e055e46732c0f6645eac2d7252ca5fea4bc4a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatu=20Lepp=C3=A4m=C3=A4ki?= Date: Mon, 31 Mar 2025 16:22:23 +0300 Subject: [PATCH 07/29] Rest of the additions. --- flickrhistory/basicflickrhistorydownloader.py | 2 +- flickrhistory/databaseobjects.py | 19 +++++++++++++++---- flickrhistory/photodownloader.py | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/flickrhistory/basicflickrhistorydownloader.py b/flickrhistory/basicflickrhistorydownloader.py index 886fd78..b0ba219 100644 --- a/flickrhistory/basicflickrhistorydownloader.py +++ b/flickrhistory/basicflickrhistorydownloader.py @@ -45,7 +45,7 @@ class BasicFlickrHistoryDownloader: """Download (all) georeferenced flickr posts.""" - NUM_WORKERS = multiprocessing.cpu_count() + 1 # 1 == user_profile_updater + NUM_WORKERS = multiprocessing.cpu_count() + 4 # 1 == user_profile_updater NUM_MANAGERS = 2 # main thread + cache_updater # if output into pipe (e.g. logger, systemd), then diff --git a/flickrhistory/databaseobjects.py b/flickrhistory/databaseobjects.py index bbe3d6e..e562c66 100644 --- a/flickrhistory/databaseobjects.py +++ b/flickrhistory/databaseobjects.py @@ -173,6 +173,7 @@ class FlickrPhoto(Base): # New fields for tags and license tags = sqlalchemy.Column(sqlalchemy.Text) license = sqlalchemy.Column(sqlalchemy.Integer) + geo_accuracy = sqlalchemy.Column(sqlalchemy.Integer) user = sqlalchemy.orm.relationship("FlickrUser", back_populates="photos") @@ -185,7 +186,11 @@ class FlickrPhoto(Base): @classmethod def from_raw_api_data_flickrphotossearch(cls, data): """Initialise a new FlickrPhoto with a flickr.photos.search data dict.""" - # the API does not always return all fields + # Helper function to clean NUL characters + def clean_string(input_string): + return input_string.replace('\x00', '') if isinstance(input_string, str) else input_string + + # the API does not always return all fields # we need to figure out which ones we can use # and do quite a lot of clean-up because the flickr API @@ -213,12 +218,12 @@ def from_raw_api_data_flickrphotossearch(cls, data): pass try: - photo_data["title"] = data["title"] + photo_data["title"] = clean_string(data["title"]) except KeyError: pass try: - photo_data["description"] = data["description"]["_content"] + photo_data["description"] = clean_string(data["description"]["_content"]) except KeyError: pass @@ -259,7 +264,7 @@ def from_raw_api_data_flickrphotossearch(cls, data): pass try: - photo_data["tags"] = data["tags"] + photo_data["tags"] = clean_string(data["tags"]) except KeyError: pass @@ -268,6 +273,12 @@ def from_raw_api_data_flickrphotossearch(cls, data): except (ValueError, KeyError): pass + try: + photo_data["geo_accuracy"] = int(data["accuracy"]) + except (ValueError, KeyError): + pass + + # finally, the user # (let’s just delegate that to the FlickrUser constructor) photo_data["user"] = FlickrUser.from_raw_api_data_flickrphotossearch(data) diff --git a/flickrhistory/photodownloader.py b/flickrhistory/photodownloader.py index de42e38..2f14af8 100644 --- a/flickrhistory/photodownloader.py +++ b/flickrhistory/photodownloader.py @@ -84,7 +84,7 @@ def photos(self): # Check for 'photos' in results to avoid KeyError if "photos" not in results or "photo" not in results["photos"]: break - + try: num_photos = int(results["photos"]["total"]) except TypeError: From 0b322159e355556104b8bfc103c23efde12cae7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatu=20Lepp=C3=A4m=C3=A4ki?= <43028964+Tadusko@users.noreply.github.com> Date: Mon, 31 Mar 2025 16:25:13 +0300 Subject: [PATCH 08/29] Return the number of workers to cpu_count + 1 --- flickrhistory/basicflickrhistorydownloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flickrhistory/basicflickrhistorydownloader.py b/flickrhistory/basicflickrhistorydownloader.py index b0ba219..886fd78 100644 --- a/flickrhistory/basicflickrhistorydownloader.py +++ b/flickrhistory/basicflickrhistorydownloader.py @@ -45,7 +45,7 @@ class BasicFlickrHistoryDownloader: """Download (all) georeferenced flickr posts.""" - NUM_WORKERS = multiprocessing.cpu_count() + 4 # 1 == user_profile_updater + NUM_WORKERS = multiprocessing.cpu_count() + 1 # 1 == user_profile_updater NUM_MANAGERS = 2 # main thread + cache_updater # if output into pipe (e.g. logger, systemd), then From 73b19e9bbee9570772bd7edbdba58530dbdd6cfd Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Mon, 31 Mar 2025 16:23:35 +0200 Subject: [PATCH 09/29] linted --- flickrhistory/databaseobjects.py | 10 +++++++--- flickrhistory/photodownloader.py | 10 +++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/flickrhistory/databaseobjects.py b/flickrhistory/databaseobjects.py index e562c66..bbc4431 100644 --- a/flickrhistory/databaseobjects.py +++ b/flickrhistory/databaseobjects.py @@ -186,11 +186,16 @@ class FlickrPhoto(Base): @classmethod def from_raw_api_data_flickrphotossearch(cls, data): """Initialise a new FlickrPhoto with a flickr.photos.search data dict.""" + # Helper function to clean NUL characters def clean_string(input_string): - return input_string.replace('\x00', '') if isinstance(input_string, str) else input_string + return ( + input_string.replace("\x00", "") + if isinstance(input_string, str) + else input_string + ) - # the API does not always return all fields + # the API does not always return all fields # we need to figure out which ones we can use # and do quite a lot of clean-up because the flickr API @@ -278,7 +283,6 @@ def clean_string(input_string): except (ValueError, KeyError): pass - # finally, the user # (let’s just delegate that to the FlickrUser constructor) photo_data["user"] = FlickrUser.from_raw_api_data_flickrphotossearch(data) diff --git a/flickrhistory/photodownloader.py b/flickrhistory/photodownloader.py index 2f14af8..68b775b 100644 --- a/flickrhistory/photodownloader.py +++ b/flickrhistory/photodownloader.py @@ -52,7 +52,15 @@ def photos(self): "per_page": 500, "has_geo": 1, "extras": ", ".join( - ["description", "date_upload", "date_taken", "geo", "owner_name", "tags", "license"] + [ + "description", + "date_upload", + "date_taken", + "geo", + "owner_name", + "tags", + "license", + ] ), "min_upload_date": self._timespan.start.timestamp(), "max_upload_date": self._timespan.end.timestamp(), From f332d7ef43de44cf99cb20af9e8bd007af483da0 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Mon, 14 Apr 2025 13:05:29 +0200 Subject: [PATCH 10/29] pydocstyle, changed local def --- src/flickrhistory/databaseobjects.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/flickrhistory/databaseobjects.py b/src/flickrhistory/databaseobjects.py index bbc4431..43fd67c 100644 --- a/src/flickrhistory/databaseobjects.py +++ b/src/flickrhistory/databaseobjects.py @@ -186,15 +186,6 @@ class FlickrPhoto(Base): @classmethod def from_raw_api_data_flickrphotossearch(cls, data): """Initialise a new FlickrPhoto with a flickr.photos.search data dict.""" - - # Helper function to clean NUL characters - def clean_string(input_string): - return ( - input_string.replace("\x00", "") - if isinstance(input_string, str) - else input_string - ) - # the API does not always return all fields # we need to figure out which ones we can use @@ -223,12 +214,12 @@ def clean_string(input_string): pass try: - photo_data["title"] = clean_string(data["title"]) + photo_data["title"] = data["title"].replace("\x00", "") except KeyError: pass try: - photo_data["description"] = clean_string(data["description"]["_content"]) + photo_data["description"] = data["description"]["_content"].replace("\x00", "") except KeyError: pass @@ -269,7 +260,7 @@ def clean_string(input_string): pass try: - photo_data["tags"] = clean_string(data["tags"]) + photo_data["tags"] = data["tags"].replace("\x00", "") except KeyError: pass From 05f097de1f9e55826df3136ce5ccf1b0a485930b Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Mon, 14 Apr 2025 13:13:48 +0200 Subject: [PATCH 11/29] move nul-cleaner to sqlalchemy validator --- src/flickrhistory/databaseobjects.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/flickrhistory/databaseobjects.py b/src/flickrhistory/databaseobjects.py index 43fd67c..ed880df 100644 --- a/src/flickrhistory/databaseobjects.py +++ b/src/flickrhistory/databaseobjects.py @@ -214,12 +214,12 @@ def from_raw_api_data_flickrphotossearch(cls, data): pass try: - photo_data["title"] = data["title"].replace("\x00", "") + photo_data["title"] = data["title"] except KeyError: pass try: - photo_data["description"] = data["description"]["_content"].replace("\x00", "") + photo_data["description"] = data["description"]["_content"] except KeyError: pass @@ -260,7 +260,7 @@ def from_raw_api_data_flickrphotossearch(cls, data): pass try: - photo_data["tags"] = data["tags"].replace("\x00", "") + photo_data["tags"] = data["tags"] except KeyError: pass @@ -280,6 +280,10 @@ def from_raw_api_data_flickrphotossearch(cls, data): return cls(**photo_data) + @sqlalchemy.orm.validates("title", "description", "tags") + def _drop_nul_from_strings(self, key, address): + return address.replace("\x00", "") + def __str__(self): """Return a str representation.""" return "".format(self.id) From 366051bceaf5458138481020fbf95d9d7265a661 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Mon, 14 Apr 2025 15:41:50 +0200 Subject: [PATCH 12/29] major refactoring, WIP --- .../basicflickrhistorydownloader.py | 3 + src/flickrhistory/database/__init__.py | 15 + .../database/databaseschemaupdater.py | 101 ++++++ src/flickrhistory/database/engine.py | 30 ++ src/flickrhistory/database/models/__init__.py | 33 ++ src/flickrhistory/database/models/base.py | 77 +++++ src/flickrhistory/database/models/license.py | 23 ++ src/flickrhistory/database/models/photo.py | 85 +++++ src/flickrhistory/database/models/tag.py | 40 +++ src/flickrhistory/database/models/user.py | 111 +++++++ src/flickrhistory/database/photo_saver.py | 119 +++++++ src/flickrhistory/database/user_saver.py | 66 ++++ src/flickrhistory/databaseobjects.py | 299 ------------------ src/flickrhistory/licensedownloader.py | 61 ++++ src/flickrhistory/photodownloaderthread.py | 50 +-- src/flickrhistory/userprofileupdaterthread.py | 20 +- 16 files changed, 776 insertions(+), 357 deletions(-) create mode 100644 src/flickrhistory/database/__init__.py create mode 100644 src/flickrhistory/database/databaseschemaupdater.py create mode 100644 src/flickrhistory/database/engine.py create mode 100644 src/flickrhistory/database/models/__init__.py create mode 100644 src/flickrhistory/database/models/base.py create mode 100644 src/flickrhistory/database/models/license.py create mode 100644 src/flickrhistory/database/models/photo.py create mode 100644 src/flickrhistory/database/models/tag.py create mode 100644 src/flickrhistory/database/models/user.py create mode 100644 src/flickrhistory/database/photo_saver.py create mode 100644 src/flickrhistory/database/user_saver.py delete mode 100644 src/flickrhistory/databaseobjects.py create mode 100644 src/flickrhistory/licensedownloader.py diff --git a/src/flickrhistory/basicflickrhistorydownloader.py b/src/flickrhistory/basicflickrhistorydownloader.py index 886fd78..46978de 100644 --- a/src/flickrhistory/basicflickrhistorydownloader.py +++ b/src/flickrhistory/basicflickrhistorydownloader.py @@ -36,6 +36,7 @@ from .cache import Cache from .cacheupdaterthread import CacheUpdaterThread from .config import Config +from .licensedownloader import LicenseDownloader from .photodownloaderthread import PhotoDownloaderThread from .sigtermreceivedexception import SigTermReceivedException from .timespan import TimeSpan @@ -69,6 +70,8 @@ def __init__(self): def download(self): """Download all georeferenced flickr posts.""" + LicenseDownloader(self._api_key_manager).update_licenses() + for gap in self.gaps_in_download_history: self._todo_deque.append(gap) diff --git a/src/flickrhistory/database/__init__.py b/src/flickrhistory/database/__init__.py new file mode 100644 index 0000000..bab5938 --- /dev/null +++ b/src/flickrhistory/database/__init__.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + + +"""Database-related classes.""" + + +__all__ = [ + "License", + "Photo", + "PhotoSaver", + "User", +] + +from .models import License, Photo, User +from .photo_saver import PhotoSaver diff --git a/src/flickrhistory/database/databaseschemaupdater.py b/src/flickrhistory/database/databaseschemaupdater.py new file mode 100644 index 0000000..5fc70d2 --- /dev/null +++ b/src/flickrhistory/database/databaseschemaupdater.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +"""Update the database schema if necessary.""" + + +__all__ = ["DatabaseSchemaUpdater"] + + +import sys + +import sqlalchemy + +from .engine import engine + + +# for now, schema updates are SQL only and work on PostgreSQL, only. +# GeoAlchemy2 doesn’t really support SQLite, anyway +SCHEMA_UPDATES = { + # 0 -> 1 + 1: """ + CREATE TABLE bliblably; + """, +} + + +class DatabaseSchemaUpdater: + """Update the database schema if necessary.""" + + LATEST = "LATEST" # ‘magic’, see def set_schema_version + + def __init__(self): + """Update the database schema if necessary.""" + # Try to create database table for schema version + with engine.begin() as connection: + connection.execute( + """ + CREATE TABLE IF NOT EXISTS + schema_versions + ( + update TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + version INTEGER PRIMARY KEY + ); + """ + ) + + @property + def installed_version(self): + """Return current version.""" + with engine.connect() as connection: + installed_version = connection.execute( + sqlalchemy.text( + """ + SELECT + COALESCE( + MAX(version), + 0 + ) AS version + FROM + schema_versions; + """ + ) + ).scalar_one_or_none() + return installed_version + + def update_to_latest(self): + """Update to the latest schema version.""" + installed_version = self.installed_version + while installed_version < max(SCHEMA_UPDATES.keys()): + print( + "Updating database schema (db version {:d}->{:d})".format( + installed_version, installed_version + 1 + ), + file=sys.stderr, + flush=True, # so that we don’t seem without work + ) + with engine.begin() as connection: + next_version = self.installed_version + 1 + connection.execute(sqlalchemy.text(SCHEMA_UPDATES[next_version])) + self.set_schema_version(next_version) + installed_version = self.installed_version + + @classmethod + def set_schema_version(cls, version): + """Set the schema version (without running update scripts).""" + if version == cls.LATEST: + version = max(SCHEMA_UPDATES.keys()) + with engine.begin() as connection: + connection.execute( + sqlalchemy.text( + """ + INSERT INTO + schema_versions (version) + VALUES ( + :version + ); + """ + ), + {"version": version}, + ) diff --git a/src/flickrhistory/database/engine.py b/src/flickrhistory/database/engine.py new file mode 100644 index 0000000..1dd04e1 --- /dev/null +++ b/src/flickrhistory/database/engine.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +"""An SQLAlchemy engine and sessionmaker.""" + + +__all__ = ["engine", "Session"] + + +import sqlalchemy +import sqlalchemy.orm + +from ..config import Config + + +with Config() as config: + engine = sqlalchemy.create_engine(config["database_connection_string"]) + + +if engine.dialect.name == "postgresql": + engine.execute( + """ + CREATE EXTENSION IF NOT EXISTS + postgis; + """ + ) + + +Session = sqlalchemy.orm.sessionmaker(engine, autoflush=False) diff --git a/src/flickrhistory/database/models/__init__.py b/src/flickrhistory/database/models/__init__.py new file mode 100644 index 0000000..d5ac638 --- /dev/null +++ b/src/flickrhistory/database/models/__init__.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +"""ORM models for flickr entities.""" + + +__all__ = [ + "FlickrPhoto", + "FlickrUser", + "License", + "Photo", + "User", + "Tag", +] + + +import sqlalchemy + +from ..engine import engine +from .base import Base +from .database_schema_updater import DatabaseSchemaUpdater +from .license import License +from .photo import FlickrPhoto, Photo +from .tag import Tag +from .user import FlickrUser, User + + +if sqlalchemy.inspect(engine).has_table(Photo.__table__): # data exists + DatabaseSchemaUpdater().update_to_latest() +else: + Base.metadata.create_all(engine) + DatabaseSchemaUpdater().set_schema_version(DatabaseSchemaUpdater.LATEST) diff --git a/src/flickrhistory/database/models/base.py b/src/flickrhistory/database/models/base.py new file mode 100644 index 0000000..003370e --- /dev/null +++ b/src/flickrhistory/database/models/base.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright (C) 2019 Christoph Fink, University of Helsinki +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 3 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . + +"""A common sqlalchemy declarative_base() to share between models.""" + + +__all__ = ["Base"] + + +import json +import re + +import sqlalchemy.ext.declarative +import sqlalchemy.orm + +from ...config import Config + + +CAMEL_CASE_TO_SNAKE_CASE_RE = re.compile( + "((?<=[a-z0-9])[A-Z]|(?!^)(? bogus + KeyError, # not contained in API dict + TypeError, # weird data returned + ): + pass + + photo_data["geographical_accuracy"] = int(data["accuracy"]) + + license = int(data["license"]) + + tags = data["tags"].split() + + with Session() as session, session.begin(): + + photo = Photo(**photo_data) + + photo.tags = [] + for tag in tags: + tag = session.get(Tag, tag) or Tag(tag=tag) + if tag not in photo.tags: + photo.tags.append(tag) + + photo.license = session.get(License, license) or License(name=license) + + user = UserSaver().save(data) + photo.user = user + + session.merge(photo) + return photo diff --git a/src/flickrhistory/database/user_saver.py b/src/flickrhistory/database/user_saver.py new file mode 100644 index 0000000..9936122 --- /dev/null +++ b/src/flickrhistory/database/user_saver.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +"""Save a flickr user to the database.""" + + +import datetime + +from .models import User +from .engine import Session + + +__all__ = ["UserSaver"] + + +class UserSaver: + """Save a flickr user to the database.""" + + def save(data): + """Save a flickr user to the database.""" + # the API does not always return all fields + + # "id" is the only field garantueed to be in the data + user_id, farm = data["id"].split("@N0") + user_data = {} + + # "joindate" and "ownername" need special attentation + try: + data["join_date"] = datetime.datetime.fromtimestamp( + int(data["join_date"]), tz=datetime.timezone.utc + ) + except KeyError: + pass + try: + data["name"] = data["ownername"] + except KeyError: + pass + + # all the other fields can be added as they are (if they exist) + for field in [ + "first_name", + "last_name", + "name", + "join_date", + "city", + "country", + "hometown", + "occupation", + "description", + "website", + "facebook", + "twitter", + "tumblr", + "instagram", + "pinterest", + ]: + try: + user_data[field] = data[field] + except KeyError: + pass + + with Session() as session, session.begin(): + user = User(**user_data) + session.merge(user) + return user diff --git a/src/flickrhistory/databaseobjects.py b/src/flickrhistory/databaseobjects.py deleted file mode 100644 index ed880df..0000000 --- a/src/flickrhistory/databaseobjects.py +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright (C) 2019 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - -"""Base classes to represent flickr posts and users.""" - - -__all__ = ["FlickrPhoto", "FlickrUser"] - - -import datetime - -import geoalchemy2 -import sqlalchemy -import sqlalchemy.ext.declarative -import sqlalchemy.ext.hybrid -import sqlalchemy.orm - -from .config import Config - - -Base = sqlalchemy.ext.declarative.declarative_base() -config = Config() - - -class FlickrUser(Base): - """ORM class to represent a flickr user.""" - - __tablename__ = "users" - - id = sqlalchemy.Column(sqlalchemy.BigInteger) - farm = sqlalchemy.Column(sqlalchemy.SmallInteger) - nsid = sqlalchemy.Column( - sqlalchemy.Text, sqlalchemy.Computed("id::TEXT || '@N0' || farm::TEXT") - ) - - name = sqlalchemy.Column(sqlalchemy.Text) - first_name = sqlalchemy.Column(sqlalchemy.Text) - last_name = sqlalchemy.Column(sqlalchemy.Text) - real_name = sqlalchemy.Column( - sqlalchemy.Text, sqlalchemy.Computed("first_name || ' ' || last_name") - ) - - city = sqlalchemy.Column(sqlalchemy.Text) - country = sqlalchemy.Column(sqlalchemy.Text) - hometown = sqlalchemy.Column(sqlalchemy.Text) - - occupation = sqlalchemy.Column(sqlalchemy.Text) - description = sqlalchemy.Column(sqlalchemy.Text) - - join_date = sqlalchemy.Column(sqlalchemy.DateTime(timezone=True)) - - website = sqlalchemy.Column(sqlalchemy.Text) - facebook = sqlalchemy.Column(sqlalchemy.Text) - twitter = sqlalchemy.Column(sqlalchemy.Text) - tumblr = sqlalchemy.Column(sqlalchemy.Text) - instagram = sqlalchemy.Column(sqlalchemy.Text) - pinterest = sqlalchemy.Column(sqlalchemy.Text) - - photos = sqlalchemy.orm.relationship("FlickrPhoto", back_populates="user") - - __table_args__ = (sqlalchemy.PrimaryKeyConstraint("id", "farm"),) - - @classmethod - def from_raw_api_data_flickrphotossearch(cls, data): - """Initialise a new FlickrUser with a flickr.photos.search data dict.""" - user_id, farm = data["owner"].split("@N0") - user_data = {"id": user_id, "farm": farm, "name": data["ownername"]} - return cls(**user_data) - - @classmethod - def from_raw_api_data_flickrprofilegetprofile(cls, data): - """Initialise a new FlickrUser with a flickr.profile.getProfile data dict.""" - # the API does not always return all fields - - # "id" is the only field garantueed to be in the data - # (because we add it ourselves in databaseobjects.py in case parsing fails) - user_id, farm = data["id"].split("@N0") - - # "joindate" needs special attentation - try: - join_date = datetime.datetime.fromtimestamp( - int(data["join_date"]), tz=datetime.timezone.utc - ) - except KeyError: - join_date = None - - user_data = {"id": user_id, "farm": farm, "join_date": join_date} - - # all the other fields can be added as they are (if they exist) - for field in [ - "first_name", - "last_name", - "city", - "country", - "hometown", - "occupation", - "description", - "website", - "facebook", - "twitter", - "tumblr", - "instagram", - "pinterest", - ]: - try: - user_data[field] = data[field] - except KeyError: - pass - - return cls(**user_data) - - def __str__(self): - """Return a str representation.""" - return "".format(self.id, self.farm) - - def __repr(self): - """Return a str representation.""" - return str(self) - - -class FlickrPhoto(Base): - """ORM class to represent a flickr photo (posts).""" - - __tablename__ = "photos" - - id = sqlalchemy.Column(sqlalchemy.BigInteger, primary_key=True) - - server = sqlalchemy.Column(sqlalchemy.Integer) - secret = sqlalchemy.Column(sqlalchemy.LargeBinary) - - title = sqlalchemy.Column(sqlalchemy.Text) - description = sqlalchemy.Column(sqlalchemy.Text) - - date_taken = sqlalchemy.Column(sqlalchemy.DateTime(timezone=True)) - date_posted = sqlalchemy.Column(sqlalchemy.DateTime(timezone=True)) - - photo_url = sqlalchemy.Column( - sqlalchemy.Text, - sqlalchemy.Computed( - "'https://live.staticflickr.com/' || server::TEXT || '/' || " - + "id::TEXT || '_' || encode(secret, 'hex') || '_z.jpg'" - ), - ) - page_url = sqlalchemy.Column( - sqlalchemy.Text, - sqlalchemy.Computed( - "'https://www.flickr.com/photos/' || " - + "user_id::TEXT || '@N0' || user_farm::TEXT || '/' || " - + "id::TEXT || '/'" - ), - ) - - geom = sqlalchemy.Column(geoalchemy2.Geometry("POINT", 4326)) - - user_id = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=False) - user_farm = sqlalchemy.Column(sqlalchemy.SmallInteger, nullable=False) - - # New fields for tags and license - tags = sqlalchemy.Column(sqlalchemy.Text) - license = sqlalchemy.Column(sqlalchemy.Integer) - geo_accuracy = sqlalchemy.Column(sqlalchemy.Integer) - - user = sqlalchemy.orm.relationship("FlickrUser", back_populates="photos") - - __table_args__ = ( - sqlalchemy.ForeignKeyConstraint( - ["user_id", "user_farm"], ["users.id", "users.farm"], "FlickrUser" - ), - ) - - @classmethod - def from_raw_api_data_flickrphotossearch(cls, data): - """Initialise a new FlickrPhoto with a flickr.photos.search data dict.""" - # the API does not always return all fields - # we need to figure out which ones we can use - - # and do quite a lot of clean-up because the flickr API - # also returns fairly weird data, sometimes - - # another side effect is that we can initialise - # with incomplete data (only id needed), - # which helps with bad API responses - - photo_data = {} - - # "id" is the only field garantueed to be in the data - # (because we add it ourselves in databaseobjects.py in case parsing fails) - photo_data["id"] = data["id"] - - # server and secret are kinda straight-forward - try: - photo_data["server"] = data["server"] - except KeyError: - pass - - try: - photo_data["secret"] = bytes.fromhex(data["secret"]) - except (ValueError, KeyError): # some non-hex character - pass - - try: - photo_data["title"] = data["title"] - except KeyError: - pass - - try: - photo_data["description"] = data["description"]["_content"] - except KeyError: - pass - - # the dates need special attention - try: - photo_data["date_taken"] = datetime.datetime.fromisoformat( - data["datetaken"] - ).astimezone(datetime.timezone.utc) - except ValueError: - # there is weirdly quite a lot of photos with - # date_taken "0000-01-01 00:00:00" - # Year 0 does not exist, there’s 1BCE, then 1CE, nothing in between - photo_data["date_taken"] = None - except KeyError: - # field does not exist in the dict we got - pass - - try: - photo_data["date_posted"] = datetime.datetime.fromtimestamp( - int(data["dateupload"]), tz=datetime.timezone.utc - ) - except KeyError: - pass - - # geometry - try: - longitude = float(data["longitude"]) - latitude = float(data["latitude"]) - assert longitude != 0 and latitude != 0 - photo_data["geom"] = "SRID=4326;POINT({longitude:f} {latitude:f})".format( - longitude=longitude, latitude=latitude - ) - except ( - AssertionError, # lon/lat is at exactly 0°N/S, 0°W/E -> bogus - KeyError, # not contained in API dict - TypeError, # weird data returned - ): - pass - - try: - photo_data["tags"] = data["tags"] - except KeyError: - pass - - try: - photo_data["license"] = int(data["license"]) - except (ValueError, KeyError): - pass - - try: - photo_data["geo_accuracy"] = int(data["accuracy"]) - except (ValueError, KeyError): - pass - - # finally, the user - # (let’s just delegate that to the FlickrUser constructor) - photo_data["user"] = FlickrUser.from_raw_api_data_flickrphotossearch(data) - - return cls(**photo_data) - - @sqlalchemy.orm.validates("title", "description", "tags") - def _drop_nul_from_strings(self, key, address): - return address.replace("\x00", "") - - def __str__(self): - """Return a str representation.""" - return "".format(self.id) - - def __repr(self): - """Return a str representation.""" - return str(self) - - -# Create tables in case we know where -if "database_connection_string" in config: - engine = sqlalchemy.create_engine(config["database_connection_string"]) - Base.metadata.create_all(engine) diff --git a/src/flickrhistory/licensedownloader.py b/src/flickrhistory/licensedownloader.py new file mode 100644 index 0000000..ffb48db --- /dev/null +++ b/src/flickrhistory/licensedownloader.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +"""Update the list of licenses.""" + + +__all__ = ["LicenseDownloader"] + + +import json + +import requests +import urllib3 + +from .database import License, Session +from .exceptions import ApiResponseError + + +class LicenseDownloader: + """Update the list of licenses.""" + + API_ENDPOINT_URL = "https://api.flickr.com/services/rest/" + + def __init__(self, api_key_manager): + """Update the list of licenses.""" + self._api_key_manager = api_key_manager + + def update_license(self): + """Update the list of licenses.""" + query = { + "method": "flickr.photos.licenses.getInfo", + "format": "json", + } + + with self._api_key_manager.get_api_key() as api_key: + params = {"api_key": api_key} + params.update(query) + + try: + with requests.get(self.API_ENDPOINT_URL, params=params) as response: + results = response.json() + except ( + ConnectionError, + json.decoder.JSONDecodeError, + requests.exceptions.RequestException, + urllib3.exceptions.HTTPError, + ) as exception: + raise ApiResponseError() from exception + + with Session() as session, session.begin(): + for license in results["licenses"]: + license_id = license["id"] + license_name = license["name"] + license_url = license["url"] + license = session.get(License, license_id) or License( + id=license_id, + name=license_name, + url=license_url, + ) + session.merge(license) diff --git a/src/flickrhistory/photodownloaderthread.py b/src/flickrhistory/photodownloaderthread.py index 3eb5cf5..b997fd9 100644 --- a/src/flickrhistory/photodownloaderthread.py +++ b/src/flickrhistory/photodownloaderthread.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Worker threads wrapping an APIDownloader.""" @@ -24,12 +9,8 @@ import threading -import time - -import sqlalchemy -from .config import Config -from .databaseobjects import FlickrPhoto +from .database import PhotoSaver from .exceptions import ApiResponseError, DownloadBatchIsTooLargeError from .photodownloader import PhotoDownloader @@ -59,11 +40,6 @@ def __init__(self, api_key_manager, todo_deque, done_queue): self.shutdown = threading.Event() - with Config() as config: - self._engine = sqlalchemy.create_engine( - config["database_connection_string"] - ) - def run(self): """Get TimeSpans off todo_deque and download photos.""" while not self.shutdown.is_set(): @@ -76,34 +52,14 @@ def run(self): try: for photo in photo_downloader.photos: - with sqlalchemy.orm.Session(self._engine) as session: - try: - with session.begin(): - flickr_photo = ( - FlickrPhoto.from_raw_api_data_flickrphotossearch( - photo - ) - ) - session.merge(flickr_photo) - except sqlalchemy.exc.IntegrityError: - # remedy race conditions - # TODO: find out how to avoid them - time.sleep(1.0) - with session.begin(): - session.flush() - flickr_photo = ( - FlickrPhoto.from_raw_api_data_flickrphotossearch( - photo - ) - ) - session.merge(flickr_photo) + PhotoSaver().save(photo) self.count += 1 if self.shutdown.is_set(): # let’s only report back on how much we # in fact downloaded, not what our quota was - timespan.end = flickr_photo.date_posted + timespan.end = photo.date_posted break except ApiResponseError: diff --git a/src/flickrhistory/userprofileupdaterthread.py b/src/flickrhistory/userprofileupdaterthread.py index 6b1f9bc..5250525 100644 --- a/src/flickrhistory/userprofileupdaterthread.py +++ b/src/flickrhistory/userprofileupdaterthread.py @@ -29,7 +29,7 @@ import sqlalchemy from .config import Config -from .databaseobjects import FlickrUser +from .database.models import User from .exceptions import ApiResponseError from .userprofiledownloader import UserProfileDownloader @@ -83,26 +83,26 @@ def nsids_of_users_without_detailed_information(self): with sqlalchemy.orm.Session(self._engine) as session: if self._bounds is None: nsids_of_users_without_detailed_information = session.query( - FlickrUser.nsid + User.nsid ).filter_by(join_date=None) else: bounds = ( sqlalchemy.select( sqlalchemy.sql.functions.percentile_disc(self._bounds[0]) - .within_group(FlickrUser.id) + .within_group(User.id) .label("lower"), sqlalchemy.sql.functions.percentile_disc(self._bounds[1]) - .within_group(FlickrUser.id) + .within_group(User.id) .label("upper"), ) - .select_from(FlickrUser) + .select_from(User) .filter_by(join_date=None) .cte() ) nsids_of_users_without_detailed_information = ( - session.query(FlickrUser.nsid) + session.query(User.nsid) .filter_by(join_date=None) - .where(FlickrUser.id.between(bounds.c.lower, bounds.c.upper)) + .where(User.id.between(bounds.c.lower, bounds.c.upper)) .yield_per(1000) ) @@ -122,10 +122,8 @@ def run(self): sqlalchemy.orm.Session(self._engine) as session, session.begin(), ): - flickr_user = ( - FlickrUser.from_raw_api_data_flickrprofilegetprofile( - user_profile_downloader.get_profile_for_nsid(nsid) - ) + flickr_user = User.from_raw_api_data_flickrprofilegetprofile( + user_profile_downloader.get_profile_for_nsid(nsid) ) session.merge(flickr_user) From d9034cf012506fa7d056b2fdb86db0473a4f1d1d Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Mon, 14 Apr 2025 15:51:19 +0200 Subject: [PATCH 13/29] cleaning up a bit --- src/flickrhistory/database/models/photo.py | 11 ++++++++--- src/flickrhistory/database/photo_saver.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/flickrhistory/database/models/photo.py b/src/flickrhistory/database/models/photo.py index 92b2c29..da92bca 100644 --- a/src/flickrhistory/database/models/photo.py +++ b/src/flickrhistory/database/models/photo.py @@ -51,7 +51,7 @@ class Photo(Base): ) geom = sqlalchemy.Column(geoalchemy2.Geometry("POINT", 4326)) - geographic_accuracy = sqlalchemy.Column(sqlalchemy.SmallInteger) + geo_accuracy = sqlalchemy.Column(sqlalchemy.SmallInteger) user_id = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=False) user_farm = sqlalchemy.Column(sqlalchemy.SmallInteger, nullable=False) @@ -61,8 +61,13 @@ class Photo(Base): secondary="tag_photo_associations", back_populates="photos", ) - license = sqlalchemy.Column(sqlalchemy.Integer) - geo_accuracy = sqlalchemy.Column(sqlalchemy.Integer) + + license_id = sqlalchemy.Column( + sqlalchemy.Integer, + sqlalchemy.ForeignKey("licenses.id"), + index=True, + ) + license = sqlalchemy.orm.relationship("License", back_populates="photos") user = sqlalchemy.orm.relationship("User", back_populates="photos") diff --git a/src/flickrhistory/database/photo_saver.py b/src/flickrhistory/database/photo_saver.py index c55aaf6..a781012 100644 --- a/src/flickrhistory/database/photo_saver.py +++ b/src/flickrhistory/database/photo_saver.py @@ -94,7 +94,7 @@ def save(data): ): pass - photo_data["geographical_accuracy"] = int(data["accuracy"]) + photo_data["geo_accuracy"] = int(data["accuracy"]) license = int(data["license"]) From b87321a252feb4276602cad689e7eadf1a960f79 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Mon, 14 Apr 2025 15:55:54 +0200 Subject: [PATCH 14/29] database schema updates (maybe incomplete?) --- .../database/databaseschemaupdater.py | 17 ++++++++++++++++- src/flickrhistory/database/models/license.py | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/flickrhistory/database/databaseschemaupdater.py b/src/flickrhistory/database/databaseschemaupdater.py index 5fc70d2..e4f2f87 100644 --- a/src/flickrhistory/database/databaseschemaupdater.py +++ b/src/flickrhistory/database/databaseschemaupdater.py @@ -20,7 +20,22 @@ SCHEMA_UPDATES = { # 0 -> 1 1: """ - CREATE TABLE bliblably; + ALTER TABLE + photos + ADD COLUMN + geo_accuracy SMALLINT; + + CREATE TABLE + licenses ( + id INTEGER, + name TEXT, + url TEXT + ); + + ALTER TABLE + photos + ADD COLUMN + license INTEGER REFERENCES licenses(id); """, } diff --git a/src/flickrhistory/database/models/license.py b/src/flickrhistory/database/models/license.py index 1110ce8..02ddb33 100644 --- a/src/flickrhistory/database/models/license.py +++ b/src/flickrhistory/database/models/license.py @@ -17,7 +17,7 @@ class License(Base): """ORM class to represent a flickr license.""" - id = sqlalchemy.Column(sqlalchemy.Text, primary_key=True) + id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True) name = sqlalchemy.Column(sqlalchemy.Text) url = sqlalchemy.Column(sqlalchemy.Text) photos = sqlalchemy.orm.relationship("Photo") From c7f0195fd928ba5aa84dffc044c292209094752f Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Mon, 14 Apr 2025 16:04:31 +0200 Subject: [PATCH 15/29] clean redundant licenses, lint throughout --- src/flickrhistory/__init__.py | 15 ----------- src/flickrhistory/__main__.py | 15 ----------- src/flickrhistory/apikeymanager.py | 15 ----------- .../basicflickrhistorydownloader.py | 15 ----------- src/flickrhistory/cache.py | 14 ---------- src/flickrhistory/cacheupdaterthread.py | 15 ----------- src/flickrhistory/config.py | 14 ---------- src/flickrhistory/database/__init__.py | 2 ++ .../database/databaseschemaupdater.py | 2 +- src/flickrhistory/database/models/base.py | 14 ---------- src/flickrhistory/database/models/photo.py | 11 -------- src/flickrhistory/database/models/user.py | 11 +------- src/flickrhistory/exceptions.py | 15 ----------- .../fancyflickrhistorydownloader.py | 15 ----------- src/flickrhistory/flickrhistorydownloader.py | 15 ----------- src/flickrhistory/photodownloader.py | 15 ----------- src/flickrhistory/sigtermreceivedexception.py | 15 ----------- src/flickrhistory/timeoutlock.py | 15 ----------- src/flickrhistory/timespan.py | 15 ----------- src/flickrhistory/userprofiledownloader.py | 15 ----------- src/flickrhistory/userprofileupdaterthread.py | 27 ++----------------- 21 files changed, 6 insertions(+), 284 deletions(-) diff --git a/src/flickrhistory/__init__.py b/src/flickrhistory/__init__.py index 2d8464b..b0dd233 100644 --- a/src/flickrhistory/__init__.py +++ b/src/flickrhistory/__init__.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2019 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Download a complete history of georeferenced flickr posts.""" diff --git a/src/flickrhistory/__main__.py b/src/flickrhistory/__main__.py index 6f5c213..cb0eccb 100644 --- a/src/flickrhistory/__main__.py +++ b/src/flickrhistory/__main__.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2019 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Download a complete history of georeferenced flickr posts.""" diff --git a/src/flickrhistory/apikeymanager.py b/src/flickrhistory/apikeymanager.py index 7111453..b6a8e66 100644 --- a/src/flickrhistory/apikeymanager.py +++ b/src/flickrhistory/apikeymanager.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2019 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Manages API keys (and their rate limit).""" diff --git a/src/flickrhistory/basicflickrhistorydownloader.py b/src/flickrhistory/basicflickrhistorydownloader.py index 46978de..3a865fb 100644 --- a/src/flickrhistory/basicflickrhistorydownloader.py +++ b/src/flickrhistory/basicflickrhistorydownloader.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Download (all) georeferenced flickr posts.""" diff --git a/src/flickrhistory/cache.py b/src/flickrhistory/cache.py index 52f5150..212a5e7 100644 --- a/src/flickrhistory/cache.py +++ b/src/flickrhistory/cache.py @@ -1,20 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2019 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . """Manages globally cached variables.""" diff --git a/src/flickrhistory/cacheupdaterthread.py b/src/flickrhistory/cacheupdaterthread.py index e2c2da7..650af39 100644 --- a/src/flickrhistory/cacheupdaterthread.py +++ b/src/flickrhistory/cacheupdaterthread.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Worker threads wrapping an APIDownloader.""" diff --git a/src/flickrhistory/config.py b/src/flickrhistory/config.py index 649e4a8..57ad9dd 100644 --- a/src/flickrhistory/config.py +++ b/src/flickrhistory/config.py @@ -1,20 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2019 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . """Import global configuration from a default location.""" diff --git a/src/flickrhistory/database/__init__.py b/src/flickrhistory/database/__init__.py index bab5938..44409b0 100644 --- a/src/flickrhistory/database/__init__.py +++ b/src/flickrhistory/database/__init__.py @@ -9,7 +9,9 @@ "Photo", "PhotoSaver", "User", + "UserSaver", ] from .models import License, Photo, User from .photo_saver import PhotoSaver +from .user_saver import UserSaver diff --git a/src/flickrhistory/database/databaseschemaupdater.py b/src/flickrhistory/database/databaseschemaupdater.py index e4f2f87..a9c94c9 100644 --- a/src/flickrhistory/database/databaseschemaupdater.py +++ b/src/flickrhistory/database/databaseschemaupdater.py @@ -34,7 +34,7 @@ ALTER TABLE photos - ADD COLUMN + ADD COLUMN license INTEGER REFERENCES licenses(id); """, } diff --git a/src/flickrhistory/database/models/base.py b/src/flickrhistory/database/models/base.py index 003370e..f38736d 100644 --- a/src/flickrhistory/database/models/base.py +++ b/src/flickrhistory/database/models/base.py @@ -1,20 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2019 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . """A common sqlalchemy declarative_base() to share between models.""" diff --git a/src/flickrhistory/database/models/photo.py b/src/flickrhistory/database/models/photo.py index da92bca..490d15a 100644 --- a/src/flickrhistory/database/models/photo.py +++ b/src/flickrhistory/database/models/photo.py @@ -6,13 +6,10 @@ __all__ = [ - "FlickrPhoto", "Photo", ] -import warnings - import geoalchemy2 import sqlalchemy import sqlalchemy.orm @@ -80,11 +77,3 @@ class Photo(Base): @sqlalchemy.orm.validates("title", "description", "tags") def _drop_nul_from_strings(self, key, address): return address.replace("\x00", "") - - -# @warnings.deprecated( -# "FlickrPhoto has been deprecated, use flickrhistory.database.models.Photo instead." -# ) -# class FlickrPhoto: -# def __new__(cls, *args, **kwargs): -# return Photo(*args, **kwargs) diff --git a/src/flickrhistory/database/models/user.py b/src/flickrhistory/database/models/user.py index e40cd2d..2f6470e 100644 --- a/src/flickrhistory/database/models/user.py +++ b/src/flickrhistory/database/models/user.py @@ -5,11 +5,10 @@ """ORM class to represent a flickr user.""" -__all__ = ["FlickrUser", "User"] +__all__ = ["User"] import datetime -import warnings import sqlalchemy import sqlalchemy.orm @@ -101,11 +100,3 @@ def from_raw_api_data_flickrprofilegetprofile(cls, data): pass return cls(**user_data) - - -# @warnings.deprecated( -# "FlickrUser has been deprecated, use flickrhistory.database.models.User instead." -# ) -# class FlickrUser: -# def __new__(cls, *args, **kwargs): -# return User(*args, **kwargs) diff --git a/src/flickrhistory/exceptions.py b/src/flickrhistory/exceptions.py index 6402939..289994c 100644 --- a/src/flickrhistory/exceptions.py +++ b/src/flickrhistory/exceptions.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Custom exceptions.""" diff --git a/src/flickrhistory/fancyflickrhistorydownloader.py b/src/flickrhistory/fancyflickrhistorydownloader.py index e40d965..e364aa3 100644 --- a/src/flickrhistory/fancyflickrhistorydownloader.py +++ b/src/flickrhistory/fancyflickrhistorydownloader.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """ Download (all) georeferenced flickr posts. diff --git a/src/flickrhistory/flickrhistorydownloader.py b/src/flickrhistory/flickrhistorydownloader.py index 40c1b39..2b83bf7 100644 --- a/src/flickrhistory/flickrhistorydownloader.py +++ b/src/flickrhistory/flickrhistorydownloader.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Download (all) georeferenced flickr posts.""" diff --git a/src/flickrhistory/photodownloader.py b/src/flickrhistory/photodownloader.py index 68b775b..16a7147 100644 --- a/src/flickrhistory/photodownloader.py +++ b/src/flickrhistory/photodownloader.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Download all data covering a time span from the flickr API.""" diff --git a/src/flickrhistory/sigtermreceivedexception.py b/src/flickrhistory/sigtermreceivedexception.py index eabacae..4571750 100644 --- a/src/flickrhistory/sigtermreceivedexception.py +++ b/src/flickrhistory/sigtermreceivedexception.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2018 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """An exception to handle receiving SIGTERM signals.""" diff --git a/src/flickrhistory/timeoutlock.py b/src/flickrhistory/timeoutlock.py index 78d2204..ba2134a 100644 --- a/src/flickrhistory/timeoutlock.py +++ b/src/flickrhistory/timeoutlock.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2019 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """A timer-released threading.Lock-like lock.""" diff --git a/src/flickrhistory/timespan.py b/src/flickrhistory/timespan.py index 0d4b548..519babe 100644 --- a/src/flickrhistory/timespan.py +++ b/src/flickrhistory/timespan.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """A period in time starting at a datetime and ending at another datetime.""" diff --git a/src/flickrhistory/userprofiledownloader.py b/src/flickrhistory/userprofiledownloader.py index 7ccbf35..4fd38da 100644 --- a/src/flickrhistory/userprofiledownloader.py +++ b/src/flickrhistory/userprofiledownloader.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Download all data covering a time span from the flickr API.""" diff --git a/src/flickrhistory/userprofileupdaterthread.py b/src/flickrhistory/userprofileupdaterthread.py index 5250525..0137d19 100644 --- a/src/flickrhistory/userprofileupdaterthread.py +++ b/src/flickrhistory/userprofileupdaterthread.py @@ -1,21 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2020 Christoph Fink, University of Helsinki -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 3 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - """Thread to complete missing data on user profiles.""" @@ -29,7 +14,7 @@ import sqlalchemy from .config import Config -from .database.models import User +from .database import User, UserSaver from .exceptions import ApiResponseError from .userprofiledownloader import UserProfileDownloader @@ -118,15 +103,7 @@ def run(self): while not (self.shutdown.is_set() or retries >= self.MAX_RETRIES): for nsid in self.nsids_of_users_without_detailed_information: try: - with ( - sqlalchemy.orm.Session(self._engine) as session, - session.begin(), - ): - flickr_user = User.from_raw_api_data_flickrprofilegetprofile( - user_profile_downloader.get_profile_for_nsid(nsid) - ) - session.merge(flickr_user) - + UserSaver().save(user_profile_downloader.get_profile_for_nsid(nsid)) self.count += 1 except ApiResponseError: From 1400c3283eb5c0bc80216b138eedcea968b7f8e3 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Mon, 14 Apr 2025 16:30:12 +0200 Subject: [PATCH 16/29] ... --- pyproject.toml | 3 ++- src/flickrhistory/database/engine.py | 15 +++++++++------ src/flickrhistory/database/models/__init__.py | 8 +++----- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index da96a0b..8ec0bf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,8 @@ dependencies = [ "blessed", "GeoAlchemy2", "PyYaml", - "psycopg", + "psycopg2", + "requests", "SQLAlchemy", "urllib3", ] diff --git a/src/flickrhistory/database/engine.py b/src/flickrhistory/database/engine.py index 1dd04e1..cfd3067 100644 --- a/src/flickrhistory/database/engine.py +++ b/src/flickrhistory/database/engine.py @@ -19,12 +19,15 @@ if engine.dialect.name == "postgresql": - engine.execute( - """ - CREATE EXTENSION IF NOT EXISTS - postgis; - """ - ) + with engine.connect() as connection: + connection.execute( + sqlalchemy.text( + """ + CREATE EXTENSION IF NOT EXISTS + postgis; + """ + ) + ) Session = sqlalchemy.orm.sessionmaker(engine, autoflush=False) diff --git a/src/flickrhistory/database/models/__init__.py b/src/flickrhistory/database/models/__init__.py index d5ac638..e856b30 100644 --- a/src/flickrhistory/database/models/__init__.py +++ b/src/flickrhistory/database/models/__init__.py @@ -6,8 +6,6 @@ __all__ = [ - "FlickrPhoto", - "FlickrUser", "License", "Photo", "User", @@ -19,11 +17,11 @@ from ..engine import engine from .base import Base -from .database_schema_updater import DatabaseSchemaUpdater +from ..databaseschemaupdater import DatabaseSchemaUpdater from .license import License -from .photo import FlickrPhoto, Photo +from .photo import Photo from .tag import Tag -from .user import FlickrUser, User +from .user import User if sqlalchemy.inspect(engine).has_table(Photo.__table__): # data exists From 4ac47bd82cb3c55eb15bd64e4a3ea7828165bce1 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 14:36:12 +0200 Subject: [PATCH 17/29] progress, remaining issues: - photo.date_posted not available in photodownloaderthread.py:62 - race conditions with tags/duplicate keys --- src/flickrhistory/cacheupdaterthread.py | 5 +- src/flickrhistory/database/__init__.py | 2 + .../database/databaseschemaupdater.py | 24 +++--- src/flickrhistory/database/models/__init__.py | 2 +- src/flickrhistory/database/models/base.py | 21 ++---- src/flickrhistory/database/models/photo.py | 2 +- src/flickrhistory/database/photo_saver.py | 25 ++++--- src/flickrhistory/database/user_saver.py | 75 +++++++++---------- src/flickrhistory/licensedownloader.py | 5 +- src/flickrhistory/photodownloaderthread.py | 2 +- 10 files changed, 85 insertions(+), 78 deletions(-) diff --git a/src/flickrhistory/cacheupdaterthread.py b/src/flickrhistory/cacheupdaterthread.py index 650af39..2025954 100644 --- a/src/flickrhistory/cacheupdaterthread.py +++ b/src/flickrhistory/cacheupdaterthread.py @@ -35,7 +35,10 @@ def run(self): try: newly_downloaded = self._done_queue.get(timeout=0.1) with Cache() as cache: - cache["already downloaded"] += newly_downloaded + try: + cache["already downloaded"] += newly_downloaded + except KeyError: + cache["already downloaded"] = newly_downloaded self.status = "added {}".format(newly_downloaded) except queue.Empty: if self.shutdown.is_set(): diff --git a/src/flickrhistory/database/__init__.py b/src/flickrhistory/database/__init__.py index 44409b0..900ebaf 100644 --- a/src/flickrhistory/database/__init__.py +++ b/src/flickrhistory/database/__init__.py @@ -8,10 +8,12 @@ "License", "Photo", "PhotoSaver", + "Session", "User", "UserSaver", ] +from .engine import Session from .models import License, Photo, User from .photo_saver import PhotoSaver from .user_saver import UserSaver diff --git a/src/flickrhistory/database/databaseschemaupdater.py b/src/flickrhistory/database/databaseschemaupdater.py index a9c94c9..ccc4792 100644 --- a/src/flickrhistory/database/databaseschemaupdater.py +++ b/src/flickrhistory/database/databaseschemaupdater.py @@ -22,10 +22,10 @@ 1: """ ALTER TABLE photos - ADD COLUMN + ADD COLUMN IF NOT EXISTS geo_accuracy SMALLINT; - CREATE TABLE + CREATE TABLE IF NOT EXISTS licenses ( id INTEGER, name TEXT, @@ -34,7 +34,7 @@ ALTER TABLE photos - ADD COLUMN + ADD COLUMN IF NOT EXISTS license INTEGER REFERENCES licenses(id); """, } @@ -50,14 +50,16 @@ def __init__(self): # Try to create database table for schema version with engine.begin() as connection: connection.execute( - """ - CREATE TABLE IF NOT EXISTS - schema_versions - ( - update TIMESTAMP WITH TIME ZONE DEFAULT NOW(), - version INTEGER PRIMARY KEY - ); - """ + sqlalchemy.text( + """ + CREATE TABLE IF NOT EXISTS + schema_versions + ( + update TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + version INTEGER PRIMARY KEY + ); + """ + ) ) @property diff --git a/src/flickrhistory/database/models/__init__.py b/src/flickrhistory/database/models/__init__.py index e856b30..26b7d90 100644 --- a/src/flickrhistory/database/models/__init__.py +++ b/src/flickrhistory/database/models/__init__.py @@ -24,7 +24,7 @@ from .user import User -if sqlalchemy.inspect(engine).has_table(Photo.__table__): # data exists +if sqlalchemy.inspect(engine).has_table(Photo.__table__.name): # data exists DatabaseSchemaUpdater().update_to_latest() else: Base.metadata.create_all(engine) diff --git a/src/flickrhistory/database/models/base.py b/src/flickrhistory/database/models/base.py index f38736d..a1f998b 100644 --- a/src/flickrhistory/database/models/base.py +++ b/src/flickrhistory/database/models/base.py @@ -14,8 +14,6 @@ import sqlalchemy.ext.declarative import sqlalchemy.orm -from ...config import Config - CAMEL_CASE_TO_SNAKE_CASE_RE = re.compile( "((?<=[a-z0-9])[A-Z]|(?!^)(? from photos.search + user_id, farm = data["owner"].split("@N0") + + user_data["name"] = data["ownername"] + else: + # from profile.getprofile + user_id, farm = data["id"].split("@N0") - # "joindate" and "ownername" need special attentation - try: data["join_date"] = datetime.datetime.fromtimestamp( int(data["join_date"]), tz=datetime.timezone.utc ) - except KeyError: - pass - try: - data["name"] = data["ownername"] - except KeyError: - pass - # all the other fields can be added as they are (if they exist) - for field in [ - "first_name", - "last_name", - "name", - "join_date", - "city", - "country", - "hometown", - "occupation", - "description", - "website", - "facebook", - "twitter", - "tumblr", - "instagram", - "pinterest", - ]: - try: - user_data[field] = data[field] - except KeyError: - pass + for field in [ + "first_name", + "last_name", + "name", + "join_date", + "city", + "country", + "hometown", + "occupation", + "description", + "website", + "facebook", + "twitter", + "tumblr", + "instagram", + "pinterest", + ]: + try: + user_data[field] = data[field] + except KeyError: + pass with Session() as session, session.begin(): - user = User(**user_data) - session.merge(user) - return user + user = session.get(User, (user_id, farm)) or User(id=user_id, farm=farm) + user = session.merge(user) + user.update(**user_data) + + return user diff --git a/src/flickrhistory/licensedownloader.py b/src/flickrhistory/licensedownloader.py index ffb48db..6960784 100644 --- a/src/flickrhistory/licensedownloader.py +++ b/src/flickrhistory/licensedownloader.py @@ -26,11 +26,12 @@ def __init__(self, api_key_manager): """Update the list of licenses.""" self._api_key_manager = api_key_manager - def update_license(self): + def update_licenses(self): """Update the list of licenses.""" query = { "method": "flickr.photos.licenses.getInfo", "format": "json", + "nojsoncallback": "?", } with self._api_key_manager.get_api_key() as api_key: @@ -49,7 +50,7 @@ def update_license(self): raise ApiResponseError() from exception with Session() as session, session.begin(): - for license in results["licenses"]: + for license in results["licenses"]["license"]: license_id = license["id"] license_name = license["name"] license_url = license["url"] diff --git a/src/flickrhistory/photodownloaderthread.py b/src/flickrhistory/photodownloaderthread.py index b997fd9..8dee522 100644 --- a/src/flickrhistory/photodownloaderthread.py +++ b/src/flickrhistory/photodownloaderthread.py @@ -52,7 +52,7 @@ def run(self): try: for photo in photo_downloader.photos: - PhotoSaver().save(photo) + photo = PhotoSaver().save(photo) self.count += 1 From 0ac70a01c8c533b76e9042da59d9b46ba462c0e7 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 14:39:29 +0200 Subject: [PATCH 18/29] linted --- src/flickrhistory/database/models/base.py | 4 +--- src/flickrhistory/database/photo_saver.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/flickrhistory/database/models/base.py b/src/flickrhistory/database/models/base.py index a1f998b..85f4863 100644 --- a/src/flickrhistory/database/models/base.py +++ b/src/flickrhistory/database/models/base.py @@ -46,9 +46,7 @@ def __tablename__(cls): return "{:s}s".format(snake_case) def update(self, **kwargs): - """ - Update the values of this ORM object from keyword arguments - """ + """Update the values of this ORM object from keyword arguments.""" for key, value in kwargs.items(): setattr(self, key, value) diff --git a/src/flickrhistory/database/photo_saver.py b/src/flickrhistory/database/photo_saver.py index c372f90..a61bbb8 100644 --- a/src/flickrhistory/database/photo_saver.py +++ b/src/flickrhistory/database/photo_saver.py @@ -111,9 +111,7 @@ def save(self, data): photo.tags = [] for tag in tags: - tag = session.merge( - session.get(Tag, tag) or Tag(tag=tag) - ) + tag = session.merge(session.get(Tag, tag) or Tag(tag=tag)) if tag not in photo.tags: photo.tags.append(tag) From f46b5ff14836a27ecafa828e8f82d08740cbd4e5 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 14:52:46 +0200 Subject: [PATCH 19/29] typos --- src/flickrhistory/database/models/__init__.py | 2 +- src/flickrhistory/licensedownloader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/flickrhistory/database/models/__init__.py b/src/flickrhistory/database/models/__init__.py index 26b7d90..7098090 100644 --- a/src/flickrhistory/database/models/__init__.py +++ b/src/flickrhistory/database/models/__init__.py @@ -24,7 +24,7 @@ from .user import User -if sqlalchemy.inspect(engine).has_table(Photo.__table__.name): # data exists +if sqlalchemy.inspect(engine).has_table(Photo.__table__.name): # data exist DatabaseSchemaUpdater().update_to_latest() else: Base.metadata.create_all(engine) diff --git a/src/flickrhistory/licensedownloader.py b/src/flickrhistory/licensedownloader.py index 6960784..2b5107b 100644 --- a/src/flickrhistory/licensedownloader.py +++ b/src/flickrhistory/licensedownloader.py @@ -31,7 +31,7 @@ def update_licenses(self): query = { "method": "flickr.photos.licenses.getInfo", "format": "json", - "nojsoncallback": "?", + "nojsoncallback": "", } with self._api_key_manager.get_api_key() as api_key: From 475261f1aabf34bac88335cb46435d64396c7e66 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 14:56:24 +0200 Subject: [PATCH 20/29] do not break out if key missing, but move on to next page --- src/flickrhistory/photodownloader.py | 56 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/flickrhistory/photodownloader.py b/src/flickrhistory/photodownloader.py index 16a7147..1d84ee4 100644 --- a/src/flickrhistory/photodownloader.py +++ b/src/flickrhistory/photodownloader.py @@ -74,37 +74,37 @@ def photos(self): # unsuccessful and start over raise ApiResponseError() from exception - # Check for 'photos' in results to avoid KeyError - if "photos" not in results or "photo" not in results["photos"]: - break - try: - num_photos = int(results["photos"]["total"]) - except TypeError: - num_photos = 0 - - if num_photos > 3000 and self._timespan.duration > datetime.timedelta( - seconds=1 - ): - raise DownloadBatchIsTooLargeError( - ( - "More than 3000 rows returned ({:d}), " - + "please specify a shorter time span." - ).format(num_photos) - ) - - for photo in results["photos"]["photo"]: - # the flickr API is matching date_posted very fuzzily, - # let’s not waste time with duplicates - if ( - datetime.datetime.fromtimestamp( - int(photo["dateupload"]), tz=datetime.timezone.utc - ) - > self._timespan.end + try: + num_photos = int(results["photos"]["total"]) + except TypeError: + num_photos = 0 + + if num_photos > 3000 and self._timespan.duration > datetime.timedelta( + seconds=1 ): - break + raise DownloadBatchIsTooLargeError( + ( + "More than 3000 rows returned ({:d}), " + + "please specify a shorter time span." + ).format(num_photos) + ) - yield photo + for photo in results["photos"]["photo"]: + # the flickr API is matching date_posted very fuzzily, + # let’s not waste time with duplicates + if ( + datetime.datetime.fromtimestamp( + int(photo["dateupload"]), tz=datetime.timezone.utc + ) + > self._timespan.end + ): + break + + yield photo + + except KeyError: + pass # moving on to next page, if exists page += 1 if page > int(results["photos"]["pages"]): From 08ff0eabe024d4875100fa695cb4bbb8d20326a5 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 14:58:47 +0200 Subject: [PATCH 21/29] avoid MAGIC NUMBERS --- src/flickrhistory/photodownloader.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/flickrhistory/photodownloader.py b/src/flickrhistory/photodownloader.py index 1d84ee4..59fcf0a 100644 --- a/src/flickrhistory/photodownloader.py +++ b/src/flickrhistory/photodownloader.py @@ -17,6 +17,9 @@ from .exceptions import ApiResponseError, DownloadBatchIsTooLargeError +MAX_PHOTOS_PER_BATCH = 3000 + + class PhotoDownloader: """Download all data covering a time span from the flickr API.""" @@ -80,14 +83,12 @@ def photos(self): except TypeError: num_photos = 0 - if num_photos > 3000 and self._timespan.duration > datetime.timedelta( + if num_photos > MAX_PHOTOS_PER_BATCH and self._timespan.duration > datetime.timedelta( seconds=1 ): raise DownloadBatchIsTooLargeError( - ( - "More than 3000 rows returned ({:d}), " - + "please specify a shorter time span." - ).format(num_photos) + f"More than {MAX_PHOTOS_PER_BATCH} rows returned ({num_photos}), " + "please specify a shorter time span." ) for photo in results["photos"]["photo"]: From 3b588150535e7ddbd80a75343acb0af4241a13c2 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 15:17:20 +0200 Subject: [PATCH 22/29] cleaner string formatting --- src/flickrhistory/database/models/base.py | 4 ++-- src/flickrhistory/database/photo_saver.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/flickrhistory/database/models/base.py b/src/flickrhistory/database/models/base.py index 85f4863..76e03d5 100644 --- a/src/flickrhistory/database/models/base.py +++ b/src/flickrhistory/database/models/base.py @@ -37,13 +37,13 @@ def __str__(self): primary_keys[pk.name] = getattr(self, pk.name) except AttributeError: # (not yet set) pass - return "{}({})".format(self.__class__.__name__, json.dumps(primary_keys)) + return f"<{self.__class__.__name__}({json.dumps(primary_keys)})>" @sqlalchemy.orm.declared_attr def __tablename__(cls): """Return a table name derived from the class name.""" snake_case = camel_case_to_snake_case(cls.__name__) - return "{:s}s".format(snake_case) + return f"{snake_case}s" def update(self, **kwargs): """Update the values of this ORM object from keyword arguments.""" diff --git a/src/flickrhistory/database/photo_saver.py b/src/flickrhistory/database/photo_saver.py index a61bbb8..31fd047 100644 --- a/src/flickrhistory/database/photo_saver.py +++ b/src/flickrhistory/database/photo_saver.py @@ -84,9 +84,7 @@ def save(self, data): longitude = float(data["longitude"]) latitude = float(data["latitude"]) assert longitude != 0 and latitude != 0 - photo_data["geom"] = "SRID=4326;POINT({longitude:f} {latitude:f})".format( - longitude=longitude, latitude=latitude - ) + photo_data["geom"] = f"SRID=4326;POINT({longitude:f} {latitude:f})" except ( AssertionError, # lon/lat is at exactly 0°N/S, 0°W/E -> bogus KeyError, # not contained in API dict From 7d3a5947752cfca762ca86853e55ca6bec0509bb Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 15:25:10 +0200 Subject: [PATCH 23/29] refactoring --- src/flickrhistory/photodownloader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/flickrhistory/photodownloader.py b/src/flickrhistory/photodownloader.py index 59fcf0a..16e0033 100644 --- a/src/flickrhistory/photodownloader.py +++ b/src/flickrhistory/photodownloader.py @@ -18,6 +18,7 @@ MAX_PHOTOS_PER_BATCH = 3000 +ONE_SECOND = datetime.timedelta(seconds=1) class PhotoDownloader: @@ -83,8 +84,9 @@ def photos(self): except TypeError: num_photos = 0 - if num_photos > MAX_PHOTOS_PER_BATCH and self._timespan.duration > datetime.timedelta( - seconds=1 + if ( + num_photos > MAX_PHOTOS_PER_BATCH + and self._timespan.duration > ONE_SECOND ): raise DownloadBatchIsTooLargeError( f"More than {MAX_PHOTOS_PER_BATCH} rows returned ({num_photos}), " @@ -100,7 +102,7 @@ def photos(self): ) > self._timespan.end ): - break + continue yield photo From 4f5cae73d4f10d077894ea3b224133788b6d4b92 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 15:39:54 +0200 Subject: [PATCH 24/29] api parameter --- src/flickrhistory/licensedownloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/flickrhistory/licensedownloader.py b/src/flickrhistory/licensedownloader.py index 2b5107b..d0fb120 100644 --- a/src/flickrhistory/licensedownloader.py +++ b/src/flickrhistory/licensedownloader.py @@ -31,7 +31,7 @@ def update_licenses(self): query = { "method": "flickr.photos.licenses.getInfo", "format": "json", - "nojsoncallback": "", + "nojsoncallback": True, } with self._api_key_manager.get_api_key() as api_key: From 8d1a4b3886f08ca895ab409784a603e7326ff547 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 15:40:05 +0200 Subject: [PATCH 25/29] database concurrency issues --- src/flickrhistory/database/engine.py | 11 ++++++++++- src/flickrhistory/database/photo_saver.py | 3 ++- src/flickrhistory/database/user_saver.py | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/flickrhistory/database/engine.py b/src/flickrhistory/database/engine.py index cfd3067..4d4d218 100644 --- a/src/flickrhistory/database/engine.py +++ b/src/flickrhistory/database/engine.py @@ -8,14 +8,23 @@ __all__ = ["engine", "Session"] +import multiprocessing + import sqlalchemy import sqlalchemy.orm from ..config import Config +POOL_SIZE = multiprocessing.cpu_count() + + with Config() as config: - engine = sqlalchemy.create_engine(config["database_connection_string"]) + engine = sqlalchemy.create_engine( + config["database_connection_string"], + pool_size=POOL_SIZE, + max_overflow=POOL_SIZE, + ) if engine.dialect.name == "postgresql": diff --git a/src/flickrhistory/database/photo_saver.py b/src/flickrhistory/database/photo_saver.py index 31fd047..c30132c 100644 --- a/src/flickrhistory/database/photo_saver.py +++ b/src/flickrhistory/database/photo_saver.py @@ -116,7 +116,8 @@ def save(self, data): license = session.merge( session.get(License, license) or License(id=license) ) - session.flush() photo.license = license + session.flush() + session.expunge(photo) return photo diff --git a/src/flickrhistory/database/user_saver.py b/src/flickrhistory/database/user_saver.py index 406027e..fe8d1ca 100644 --- a/src/flickrhistory/database/user_saver.py +++ b/src/flickrhistory/database/user_saver.py @@ -62,4 +62,6 @@ def save(self, data): user = session.merge(user) user.update(**user_data) + session.flush() + session.expunge(user) return user From 3c9acbe036d1ebececc0d60f012cd2d618470470 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 17:21:59 +0200 Subject: [PATCH 26/29] update photos that lack tags, accuracy, or license --- .../basicflickrhistorydownloader.py | 11 +- src/flickrhistory/database/engine.py | 2 +- src/flickrhistory/photoupdater.py | 76 ++++++++++++ src/flickrhistory/photoupdaterthread.py | 115 ++++++++++++++++++ src/flickrhistory/userprofileupdaterthread.py | 10 +- 5 files changed, 203 insertions(+), 11 deletions(-) create mode 100644 src/flickrhistory/photoupdater.py create mode 100644 src/flickrhistory/photoupdaterthread.py diff --git a/src/flickrhistory/basicflickrhistorydownloader.py b/src/flickrhistory/basicflickrhistorydownloader.py index c278e1e..227a7a7 100644 --- a/src/flickrhistory/basicflickrhistorydownloader.py +++ b/src/flickrhistory/basicflickrhistorydownloader.py @@ -23,6 +23,7 @@ from .config import Config from .licensedownloader import LicenseDownloader from .photodownloaderthread import PhotoDownloaderThread +from .photoupdaterthread import PhotoUpdaterThread from .sigtermreceivedexception import SigTermReceivedException from .timespan import TimeSpan from .userprofileupdaterthread import UserProfileUpdaterThread @@ -31,7 +32,7 @@ class BasicFlickrHistoryDownloader: """Download (all) georeferenced flickr posts.""" - NUM_WORKERS = multiprocessing.cpu_count() + 1 # 1 == user_profile_updater + NUM_WORKERS = multiprocessing.cpu_count() NUM_MANAGERS = 2 # main thread + cache_updater # if output into pipe (e.g. logger, systemd), then @@ -77,6 +78,14 @@ def download(self): worker.start() self._worker_threads.append(worker) + # start photo record updaters + for i in range(self.NUM_WORKERS): + worker = PhotoUpdaterThread( + self._api_key_manager, (i + 1, self.NUM_WORKERS) + ) + worker.start() + self._worker_threads.append(worker) + # start cache updater self._cache_updater_thread = CacheUpdaterThread(self._done_queue) self._cache_updater_thread.start() diff --git a/src/flickrhistory/database/engine.py b/src/flickrhistory/database/engine.py index 4d4d218..a36cd75 100644 --- a/src/flickrhistory/database/engine.py +++ b/src/flickrhistory/database/engine.py @@ -16,7 +16,7 @@ from ..config import Config -POOL_SIZE = multiprocessing.cpu_count() +POOL_SIZE = multiprocessing.cpu_count() * 10 with Config() as config: diff --git a/src/flickrhistory/photoupdater.py b/src/flickrhistory/photoupdater.py new file mode 100644 index 0000000..4597991 --- /dev/null +++ b/src/flickrhistory/photoupdater.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +"""Download all data covering a time span from the flickr API.""" + + +__all__ = ["PhotoUpdater"] + + +import json + +import requests +import urllib3 + +from .exceptions import ApiResponseError + + +class PhotoUpdater: + """ + Download photo data from the flickr API. + + Photo data downloaded with flickrhistory<0.3.0 do not contain information on + geo accuracy, license, tags. This re-fetches that information. + """ + + API_ENDPOINT_URL = "https://api.flickr.com/services/rest/" + + def __init__(self, api_key_manager): + """Intialize an PhotoUpdater.""" + self._api_key_manager = api_key_manager + + def get_info_for_photo_id(self, photo_id): + """Get profile data by photo_id.""" + query = { + "method": "flickr.photos.getInfo", + "format": "json", + "nojsoncallback": True, + "photo_id": photo_id, + } + + params = {} + with self._api_key_manager.get_api_key() as api_key: + params["api_key"] = api_key + params.update(query) + + try: + with requests.get(self.API_ENDPOINT_URL, params=params) as response: + results = response.json() + assert "photo" in results + + data = { + "id": photo_id, + "tags": " ".join([tag["_content"] for tag in results["photo"]["tags"]["tag"]]), + "license": int(results["photo"]["license"]), + "accuracy": int(results["photo"]["location"]["accuracy"]), + + "owner": results["photo"]["owner"]["nsid"], + "ownername": results["photo"]["owner"]["realname"], + } + + except ( + ConnectionError, + json.decoder.JSONDecodeError, + requests.exceptions.RequestException, + urllib3.exceptions.HTTPError, + ) as exception: + # API hicups, let’s consider this batch + # unsuccessful and start over + raise ApiResponseError() from exception + + except AssertionError: + # if API hicups, return a stub data dict + data = {"id": photo_id} + + return data diff --git a/src/flickrhistory/photoupdaterthread.py b/src/flickrhistory/photoupdaterthread.py new file mode 100644 index 0000000..c47b2be --- /dev/null +++ b/src/flickrhistory/photoupdaterthread.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +"""Thread to complete missing data on photos.""" + + +__all__ = ["PhotoUpdaterThread"] + + +import threading +import time + +import sqlalchemy + +from .config import Config +from .database import Photo, PhotoSaver, Session +from .exceptions import ApiResponseError +from .photoupdater import PhotoUpdater + + +class PhotoUpdaterThread(threading.Thread): + """Finds incomplete photos and downloads missing data from the flickr API.""" + def __init__(self, api_key_manager, partition=None): + """ + Intialize a PhotoUpdaterThread. + + Args: + api_key_manager: instance of an ApiKeyManager + partition (tuple of int): download the n-th of m parts of incomplete photos + + """ + super().__init__() + + self.count = 0 + + self._api_key_manager = api_key_manager + try: + part, number_of_partitions = partition + assert part > 0 + assert part <= number_of_partitions + self._bounds = ( + (part - 1) * 1.0 / number_of_partitions, + part * 1.0 / number_of_partitions, + ) + except (AssertionError, TypeError): + self._bounds = None + + self.shutdown = threading.Event() + + with Config() as config: + self._engine = sqlalchemy.create_engine( + config["database_connection_string"] + ) + + @property + def ids_of_photos_without_detailed_information(self): + """Find ids of incomplete photo profiles.""" + # Find id of incomplete photo records + # We use geo_accuracy IS NULL + with Session() as session: + if self._bounds is None: + ids_of_photos_without_detailed_information = session.query( + Photo.id + ).filter_by(geo_accuracy=None) + else: + bounds = ( + sqlalchemy.select( + sqlalchemy.sql.functions.percentile_disc(self._bounds[0]) + .within_group(Photo.id) + .label("lower"), + sqlalchemy.sql.functions.percentile_disc(self._bounds[1]) + .within_group(Photo.id) + .label("upper"), + ) + .select_from(Photo) + .filter_by(geo_accuracy=None) + .cte() + ) + ids_of_photos_without_detailed_information = ( + session.query(Photo.id) + .filter_by(geo_accuracy=None) + .where(Photo.id.between(bounds.c.lower, bounds.c.upper)) + .yield_per(1000) + ) + + for (id,) in ids_of_photos_without_detailed_information: + yield id + + def run(self): + """Get TimeSpans off todo_queue and download photos.""" + photo_updater = PhotoUpdater(self._api_key_manager) + + while not self.shutdown.is_set(): + for photo_id in self.ids_of_photos_without_detailed_information: + try: + PhotoSaver().save(photo_updater.get_info_for_photo_id(photo_id)) + self.count += 1 + + except ApiResponseError: + # API returned some bogus/none-JSON data, + # let’s try again later + continue + + if self.shutdown.is_set(): + break + + # once no incomplete photo profiles remain, + # wait for ten minutes before trying again; + # wake up every 1/10 sec to check whether we + # should shut down + for _ in range(10 * 60 * 10): + if self.shutdown.is_set(): + break + time.sleep(0.1) diff --git a/src/flickrhistory/userprofileupdaterthread.py b/src/flickrhistory/userprofileupdaterthread.py index 0137d19..1ea9fe6 100644 --- a/src/flickrhistory/userprofileupdaterthread.py +++ b/src/flickrhistory/userprofileupdaterthread.py @@ -21,11 +21,6 @@ class UserProfileUpdaterThread(threading.Thread): """Finds incomplete user profiles and downloads missing data from the flickr API.""" - - MAX_RETRIES = ( - 5 # once all users have been updated, retry this times (with 10 min breaks) - ) - def __init__(self, api_key_manager, partition=None): """ Intialize a UserProfileUpdateThread. @@ -98,9 +93,7 @@ def run(self): """Get TimeSpans off todo_queue and download photos.""" user_profile_downloader = UserProfileDownloader(self._api_key_manager) - retries = 0 - - while not (self.shutdown.is_set() or retries >= self.MAX_RETRIES): + while not self.shutdown.is_set(): for nsid in self.nsids_of_users_without_detailed_information: try: UserSaver().save(user_profile_downloader.get_profile_for_nsid(nsid)) @@ -122,4 +115,3 @@ def run(self): if self.shutdown.is_set(): break time.sleep(0.1) - retries += 1 From 20e57122d0058642afe2204d71431e98f7f6c839 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 17:27:50 +0200 Subject: [PATCH 27/29] prepare for v0.3.0 --- CHANGELOG.md | 7 +++++++ src/flickrhistory/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76f393a..6a6d1e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +- **0.3.0** (2025-04-15): + - now collecting more data: tags, license, geo-accuracy + - re-downloading these data for existing records + - database schema versioning + - moved to PyPi TrustedPublisher auth + - migrated to pyproject.toml-only style + - **0.2.1** (2024-01-08): - migrate to Digital Geography Lab’s GitHub diff --git a/src/flickrhistory/__init__.py b/src/flickrhistory/__init__.py index b0dd233..ac0c124 100644 --- a/src/flickrhistory/__init__.py +++ b/src/flickrhistory/__init__.py @@ -15,4 +15,4 @@ except ImportError: pass -__version__ = "0.2.1" +__version__ = "0.3.0" From f26ddfc0a555ba1da0f4c56ad6f441d7db66b0b7 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 17:29:11 +0200 Subject: [PATCH 28/29] linted --- src/flickrhistory/photoupdater.py | 5 +++-- src/flickrhistory/photoupdaterthread.py | 1 + src/flickrhistory/userprofileupdaterthread.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/flickrhistory/photoupdater.py b/src/flickrhistory/photoupdater.py index 4597991..7e2fb18 100644 --- a/src/flickrhistory/photoupdater.py +++ b/src/flickrhistory/photoupdater.py @@ -51,10 +51,11 @@ def get_info_for_photo_id(self, photo_id): data = { "id": photo_id, - "tags": " ".join([tag["_content"] for tag in results["photo"]["tags"]["tag"]]), + "tags": " ".join( + [tag["_content"] for tag in results["photo"]["tags"]["tag"]] + ), "license": int(results["photo"]["license"]), "accuracy": int(results["photo"]["location"]["accuracy"]), - "owner": results["photo"]["owner"]["nsid"], "ownername": results["photo"]["owner"]["realname"], } diff --git a/src/flickrhistory/photoupdaterthread.py b/src/flickrhistory/photoupdaterthread.py index c47b2be..3849fdc 100644 --- a/src/flickrhistory/photoupdaterthread.py +++ b/src/flickrhistory/photoupdaterthread.py @@ -21,6 +21,7 @@ class PhotoUpdaterThread(threading.Thread): """Finds incomplete photos and downloads missing data from the flickr API.""" + def __init__(self, api_key_manager, partition=None): """ Intialize a PhotoUpdaterThread. diff --git a/src/flickrhistory/userprofileupdaterthread.py b/src/flickrhistory/userprofileupdaterthread.py index 1ea9fe6..ab89bcf 100644 --- a/src/flickrhistory/userprofileupdaterthread.py +++ b/src/flickrhistory/userprofileupdaterthread.py @@ -21,6 +21,7 @@ class UserProfileUpdaterThread(threading.Thread): """Finds incomplete user profiles and downloads missing data from the flickr API.""" + def __init__(self, api_key_manager, partition=None): """ Intialize a UserProfileUpdateThread. From 43415920d36a46a0eaf966ac46ef9d62a64a5bc0 Mon Sep 17 00:00:00 2001 From: Christoph Fink Date: Tue, 15 Apr 2025 17:42:31 +0200 Subject: [PATCH 29/29] v0.3.0.dev0 --- src/flickrhistory/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/flickrhistory/__init__.py b/src/flickrhistory/__init__.py index ac0c124..fdbe103 100644 --- a/src/flickrhistory/__init__.py +++ b/src/flickrhistory/__init__.py @@ -15,4 +15,4 @@ except ImportError: pass -__version__ = "0.3.0" +__version__ = "0.3.0.dev0"