From 0cd61e92c1fd062a0333d34b5d83ad14bdd54891 Mon Sep 17 00:00:00 2001 From: Colin Maudry Date: Tue, 16 Dec 2025 01:05:17 +0100 Subject: [PATCH 1/5] Meilleure gestion des erreurs dans get_etablissements --- src/tasks/get.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/tasks/get.py b/src/tasks/get.py index c472a7c..59e057c 100644 --- a/src/tasks/get.py +++ b/src/tasks/get.py @@ -9,7 +9,7 @@ import ijson import orjson import polars as pl -from httpx import Client, HTTPStatusError, TimeoutException, get +from httpx import Client, get from lxml import etree, html from prefect.transactions import transaction from tenacity import ( @@ -397,22 +397,19 @@ def get_etablissements() -> pl.LazyFrame: hrefs.append(base_url + href) # Fonction de traitement pour un fichier + @retry( + stop=stop_after_attempt(4), wait=wait_exponential(multiplier=1, min=1, max=20) + ) def get_process_file(_href: str): - print(_href.split("/")[-1]) - try: - response = http_client.get( - _href, headers=HTTP_HEADERS, timeout=20 - ).raise_for_status() - except (HTTPStatusError, TimeoutException) as err: - print(err) - print("Nouvel essai...") - response = http_client.get( - _href, headers=HTTP_HEADERS, timeout=20 - ).raise_for_status() + response = http_client.get( + _href, headers=HTTP_HEADERS, timeout=30 + ).raise_for_status() content = response.content lff = pl.scan_csv(content, schema_overrides=schema) lff = lff.select(columns) + print(_href.split("/")[-1], "OK") + return lff # Traitement en parrallèle avec 8 threads From 069229b7a81b3dfffec72e59cca54b718ad6da0b Mon Sep 17 00:00:00 2001 From: Colin Maudry Date: Tue, 16 Dec 2025 01:14:47 +0100 Subject: [PATCH 2/5] Retry en cas de timeout pour stream_get --- src/tasks/get.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/tasks/get.py b/src/tasks/get.py index 59e057c..b2784f0 100644 --- a/src/tasks/get.py +++ b/src/tasks/get.py @@ -14,7 +14,6 @@ from prefect.transactions import transaction from tenacity import ( retry, - retry_if_exception_type, stop_after_attempt, wait_exponential, ) @@ -44,16 +43,12 @@ ) -@retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=retry_if_exception_type(httpx.HTTPError), # On ne retry que sur erreur http -) +@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=20)) def stream_get(url: str, chunk_size=1024**2): # chunk_size en octets (1 Mo par défaut) if url.startswith("http"): try: with HTTP_CLIENT.stream( - "GET", url, headers=HTTP_HEADERS, follow_redirects=True + "GET", url, headers=HTTP_HEADERS, follow_redirects=True, timeout=20 ) as response: yield from response.iter_bytes(chunk_size) except httpx.TooManyRedirects: From c8d8b22dd36700842b404788b845db2d178703dc Mon Sep 17 00:00:00 2001 From: Colin Maudry Date: Tue, 16 Dec 2025 15:14:48 +0100 Subject: [PATCH 3/5] deploy.py => deployments.py --- src/{deploy.py => deployments.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{deploy.py => deployments.py} (100%) diff --git a/src/deploy.py b/src/deployments.py similarity index 100% rename from src/deploy.py rename to src/deployments.py From 71f48f580c46cb758fce90869b284e820236f69b Mon Sep 17 00:00:00 2001 From: Colin Maudry Date: Tue, 16 Dec 2025 15:15:07 +0100 Subject: [PATCH 4/5] Changelog 2.6.3 --- README.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9153400..acf4ed0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # DECP processing -> version 2.6.2 ([notes de version](https://github.com/ColinMaudry/decp-processing/blob/main/CHANGELOG.md)) +> version 2.6.3 ([notes de version](https://github.com/ColinMaudry/decp-processing/blob/main/CHANGELOG.md)) Projet de traitement et de publication de meilleures données sur les marchés publics attribués en France. Vous pouvez consulter, filtrer et télécharger ces données sur le site [decp.info](https://decp.info). Enfin la section [À propos](https://decp.info/a-propos) décrit les objectifs du projet et regroupe toutes les informations clés. diff --git a/pyproject.toml b/pyproject.toml index ca90f06..cb39d1f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "decp-processing" description = "Traitement des données des marchés publics français." -version = "2.6.2" +version = "2.6.3" requires-python = ">= 3.9" authors = [ { name = "Colin Maudry", email = "colin+decp@maudry.com" } From f2fcab9e9bf9ab2de492c7039b17f419e8894f7b Mon Sep 17 00:00:00 2001 From: Colin Maudry Date: Tue, 16 Dec 2025 15:48:47 +0100 Subject: [PATCH 5/5] Changelog 2.6.3 --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 473ec5f..bc69746 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +### 2.6.3 2025-12-16 + +- Téléchargement des ressources plus résilient aux erreurs ([tenacity](https://tenacity.readthedocs.io/en/latest/)) +- Téléchargement des données établissements plus résilient aux erreurs ([tenacity](https://tenacity.readthedocs.io/en/latest/)) + ### 2.6.2 2025-12-15 - Réduction du nombre de tâches prefect pour réduire la charge sur la BDD et la latence