2 changes: 1 addition & 1 deletion .config/mise.toml
@@ -1,5 +1,5 @@
[tools]
python = "3.13.3"
python = "3.14.1"
node = "latest"
lefthook = "latest"
yamllint = "latest"
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ __pycache__/
**/.xlsx#
*.parquet
**/.parquet
output/
30 changes: 0 additions & 30 deletions enrichers/__init__.py
@@ -5,21 +5,13 @@
"""

import copy
import requests
from schemas import enrich_resp_schema
import time
from utils import (
default_headers,
session,
)


class Enrichment(object):
_required_keys = [
"facility_name",
]
# in seconds
_wait_time: float = 1

def __init__(self, **kwargs):
self.resp_info = copy.deepcopy(enrich_resp_schema)
@@ -32,28 +24,6 @@ def search(self) -> dict:
"""Child objects should implement this"""
return {}

def _req(self, url: str, **kwargs) -> requests.Response:
"""requests response wrapper to ensure we honor waits"""
headers = kwargs.get("headers", {})
# ensure we get all headers configured correctly
# but manually applied headers win the argument
for k, v in default_headers.items():
if k in headers.keys():
continue
headers[k] = v

response = session.get(
url,
allow_redirects=True,
timeout=kwargs.get("timeout", 10),
params=kwargs.get("params", {}),
stream=kwargs.get("stream", False),
headers=headers,
)
response.raise_for_status()
time.sleep(self._wait_time)
return response

def _minimal_clean_facility_name(self, name: str) -> str:
"""Minimal cleaning that preserves important context like 'County Jail'"""
cleaned = name
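Note: the `req_get` helper that replaces `Enrichment._req` lives in `utils.py`, which is not among the files shown here. Judging from the removed wrapper above and the new call sites (`params=...`, `timeout=...`, `wait_time=...`), it presumably looks roughly like the sketch below; the `wait_time` default and the header/session handling are assumptions carried over from `_req`.

```python
# Illustrative sketch only: the real req_get lives in utils.py and is not shown
# in this diff. Signature and defaults are inferred from the removed
# Enrichment._req and the new call sites.
import time

import requests

session = requests.Session()                            # assumed: utils keeps a shared session
default_headers = {"User-Agent": "facilities-scraper"}  # placeholder; real defaults live in utils


def req_get(url: str, wait_time: float = 1.0, **kwargs) -> requests.Response:
    """GET wrapper mirroring the removed Enrichment._req: apply default headers,
    raise on HTTP errors, then sleep briefly so callers honor rate limits."""
    headers = kwargs.get("headers", {})
    for k, v in default_headers.items():
        headers.setdefault(k, v)  # explicitly passed headers win
    response = session.get(
        url,
        allow_redirects=True,
        timeout=kwargs.get("timeout", 10),
        params=kwargs.get("params", {}),
        stream=kwargs.get("stream", False),
        headers=headers,
    )
    response.raise_for_status()
    time.sleep(wait_time)
    return response
```

Promoting the wrapper to a module-level function is what lets the `ice_scrapers` modules below share the same rate-limited request path instead of each calling `session.get` and `raise_for_status` by hand.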
12 changes: 5 additions & 7 deletions enrichers/openstreetmap.py
@@ -1,5 +1,5 @@
from enrichers import Enrichment
from utils import logger
from utils import logger, req_get


class OpenStreetMap(Enrichment):
@@ -40,13 +40,13 @@ def search(self) -> dict:
"dedupe": 1,
},
"street_address": {
"q": f"{full_address}",
"q": full_address,
"format": "json",
"limit": 5,
"dedupe": 1,
},
"locality": {
"q": f"{locality}",
"q": locality,
"format": "json",
"limit": 5,
"dedupe": 1,
@@ -56,7 +56,7 @@
logger.debug("Searching OSM for %s", params["q"])
self.resp_info["search_query_steps"].append(params["q"]) # type: ignore [attr-defined]
try:
response = self._req(search_url, params=params, timeout=15)
response = req_get(search_url, params=params, timeout=15)
data.extend(response.json())
except Exception as e:
logger.debug(" OSM search error for '%s': %s", facility_name, e)
@@ -73,10 +73,8 @@
lon = first_result.get("lon", self.default_coords["longitude"])
osm_type = first_result.get("osm_type", "")
osm_id = first_result.get("osm_id", "")
self.resp_info["details"]["latitude"] = lat # type: ignore [index]
self.resp_info["details"]["longitude"] = lon # type: ignore [index]
self.resp_info["title"] = first_result.get("display_name", "")
self.resp_info["details"]["class"] = first_result.get("class", "") # type: ignore [index]
self.resp_info["details"] = {"latitude": lat, "logitude": lon, "class": first_result.get("class", "")}
if osm_type == "way":
self.resp_info["url"] = f"https://www.openstreetmap.org/way/{osm_id}"
else:
50 changes: 27 additions & 23 deletions enrichers/wikidata.py
@@ -1,5 +1,5 @@
from enrichers import Enrichment
from utils import logger
from utils import logger, req_get


class Wikidata(Enrichment):
@@ -11,29 +11,32 @@ def search(self) -> dict:
# Fetches 3 results based on _clean_facility_name (not exact name). todo: needs adjustment.
# Falls back to first result (usually truncated, eg. county)
search_name_fallback = self._clean_facility_name(facility_name)
self.resp_info["enrichment_type"] = "wikidata"
logger.debug("Searching wikidata for %s and %s", facility_name, search_name_fallback)
search_url = "https://www.wikidata.org/w/api.php"
params = {
"action": "wbsearchentities",
"search": facility_name,
"language": "en",
"format": "json",
"limit": 3,
"facility_name": {
"action": "wbsearchentities",
"search": facility_name,
"language": "en",
"format": "json",
"limit": 3,
},
"fallback": {
"action": "wbsearchentities",
"search": search_name_fallback,
"language": "en",
"format": "json",
"limit": 3,
},
}
self.resp_info["enrichment_type"] = "wikidata"
data = {}
try:
response = self._req(search_url, params=params)
data = response.json()
except Exception as e:
logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
if not data.get("search"):
params["search"] = search_name_fallback
self.resp_info["search_query_steps"].append(search_name_fallback) # type: ignore [attr-defined]
for search, params in params.items():
self.resp_info["search_query_steps"].append(params["search"]) # type: ignore [attr-defined]
try:
response = self._req(search_url, params=params)
response = req_get(search_url, params=params, wait_time=self._wait_time)
data = response.json()
break
except Exception as e:
logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
@@ -45,10 +48,11 @@ def search(self) -> dict:
if any(term in description for term in match_terms):
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
self.resp_info["title"] = result.get("label", "")
return self.resp_info
# fallback to first result
first = data["search"][0]
logger.debug(" Closer matching failed, falling back to first result %s", first)
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
self.resp_info["title"] = result.get("label", "")
break
else:
# fall back to first result
first = data["search"][0]
logger.debug(" Closer matching failed, falling back to first result %s", first)
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{first['id']}"
self.resp_info["title"] = first.get("label", "")
return self.resp_info
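The rewritten fallback above relies on Python's for/else: the `else` branch runs only when the loop finishes without a `break`, so the "fall back to first result" path fires only when no candidate description matched. A minimal standalone illustration of the pattern, using placeholder data and match terms:

```python
# Placeholder data; the real results come from the wbsearchentities response.
results = [
    {"id": "Q1", "label": "Example Office", "description": "government agency"},
    {"id": "Q2", "label": "Example Jail", "description": "detention facility"},
]
match_terms = ("prison", "jail", "detention")  # assumed terms, not from the diff

for result in results:
    if any(term in result["description"] for term in match_terms):
        print("matched:", result["id"], result["label"])
        break
else:
    # no break -> nothing matched, so fall back to the first result
    first = results[0]
    print("fallback:", first["id"], first["label"])
```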
10 changes: 5 additions & 5 deletions enrichers/wikipedia.py
@@ -1,6 +1,6 @@
from enrichers import Enrichment
from urllib.parse import quote
from utils import logger
from utils import logger, req_get


class Wikipedia(Enrichment):
@@ -32,15 +32,15 @@ def search(self) -> dict:
self.resp_info["search_query_steps"].append(wiki_url) # type: ignore [attr-defined]
initial_response = False
try:
response = self._req(wiki_url)
response = req_get(wiki_url, wait_time=self._wait_time)
initial_response = True
except Exception as e:
logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
wiki_url = f"{self.static_search}{quote(facility_name.replace(' ', '_').replace('|', '_'))}"
self.resp_info["search_query_steps"].append(wiki_url) # type: ignore [attr-defined]
try:
response = self._req(wiki_url)
response = req_get(wiki_url, wait_time=self._wait_time)
initial_response = True
except Exception as e:
logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
@@ -101,7 +101,7 @@ def search(self) -> dict:
}

try:
response = self._req(self.api_search, params=params)
response = req_get(self.api_search, params=params, wait_time=self._wait_time)
data = response.json()
except Exception as e:
logger.debug(" Wikipedia search for %s failed: %s", self.api_search, e)
@@ -161,7 +161,7 @@ def search(self) -> dict:

# Verify the page exists and isn't a redirect to something unrelated
try:
verify_response = self._req(final_url)
verify_response = req_get(final_url, wait_time=self._wait_time)
except Exception as e:
logger.debug(" Wikipedia query for %s failed: %s", final_url, e)
self.resp_info["search_query_steps"].append(final_url) # type: ignore [attr-defined]
4 changes: 1 addition & 3 deletions file_utils.py
@@ -18,9 +18,7 @@ def export_to_file(
if not facilities_data or not facilities_data.get("facilities", []):
logger.warning("No data to export!")
return ""
# make sure the folder we're dropping files into exists
os.makedirs(output_folder, exist_ok=True)
full_name = f"{output_folder}/{filename}.{file_type}"
full_name = f"{output_folder}{os.sep}{filename}.{file_type}"
if file_type in ["csv", "xlsx", "parquet"]:
writer = convert_to_dataframe(facilities_data["facilities"])
match file_type:
9 changes: 4 additions & 5 deletions ice_scrapers/agencies.py
@@ -11,19 +11,18 @@
import time
from utils import (
logger,
session,
output_folder,
req_get,
)
from .utils import download_file

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
base_xlsx_url = "https://www.ice.gov/identify-and-arrest/287g"


def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
"""Collect data on participating agencies"""
start_time = time.time()
resp = session.get(base_xlsx_url, timeout=120)
resp.raise_for_status()
resp = req_get(base_xlsx_url, timeout=120)
soup = BeautifulSoup(resp.content, "html.parser")
links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
if not links:
@@ -45,7 +44,7 @@ def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dic
"""
# remove the date so we can easily overwrite the local (cached) file
filename = date_re.sub("", link.split("/")[-1])
path = f"{SCRIPT_DIR}{os.sep}{filename}"
path = f"{output_folder}{os.sep}{filename}"
if force_download or not os.path.exists(path):
logger.info("Downloading agency info sheet from %s", link)
download_file(link, path)
30 changes: 18 additions & 12 deletions ice_scrapers/facilities_scraper.py
@@ -1,21 +1,24 @@
from bs4 import BeautifulSoup
import copy
import datetime
import re
from schemas import facility_schema
import time

from bs4 import BeautifulSoup

from schemas import facility_schema
from utils import (
default_timestamp,
logger,
session,
req_get,
timestamp_format,
)

from .utils import (
get_ice_scrape_pages,
repair_locality,
repair_name,
repair_street,
repair_zip,
repair_name,
special_facilities,
update_facility,
)
@@ -33,6 +36,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
scraped_count = 0
for page_num, url in enumerate(urls):
logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
facilities = []
try:
facilities = _scrape_page(url)
except Exception as e:
@@ -43,19 +47,23 @@
for facility in facilities:
facility = special_facilities(facility)
addr = facility["address"]
street, cleaned = repair_street(addr["street"], addr["locality"])
street, cleaned, other_st = repair_street(addr["street"], addr["locality"])
addr["other_streets"].extend(other_st)
if cleaned:
addr["street"] = street
facility["_repaired_record"] = True
zcode, cleaned = repair_zip(addr["postal_code"], addr["locality"])
zcode, cleaned, other_zip = repair_zip(addr["postal_code"], addr["locality"])
addr["other_postal_codes"].extend(other_zip)
if cleaned:
addr["postal_code"] = zcode
facility["_repaired_record"] = True
locality, cleaned = repair_locality(addr["locality"], addr["administrative_area"])
locality, cleaned, other_city = repair_locality(addr["locality"], addr["administrative_area"])
addr["other_localities"].extend(other_city)
if cleaned:
addr["locality"] = locality
facility["_repaired_record"] = True
name, cleaned = repair_name(facility["name"], addr["locality"])
name, cleaned, other_name = repair_name(facility["name"], addr["locality"])
facility["other_names"].extend(other_name)
if cleaned:
facility["name"] = name
facility["_repaired_record"] = True
@@ -95,8 +103,7 @@ def _scrape_updated(url: str) -> datetime.datetime:
return datetime.datetime.strptime(default_timestamp, timestamp_format)
logger.debug(" Fetching: %s", url)
try:
response = session.get(url, timeout=30)
response.raise_for_status()
response = req_get(url, timeout=30, wait_time=0.1)
except Exception as e:
logger.error(" Error parsing %s: %s", url, e)
return datetime.datetime.strptime(default_timestamp, timestamp_format)
@@ -118,8 +125,7 @@ def _scrape_page(page_url: str) -> list:
"""Scrape a single page of facilities using BeautifulSoup"""
logger.debug(" Fetching: %s", page_url)
try:
response = session.get(page_url, timeout=30)
response.raise_for_status()
response = req_get(page_url, timeout=30, wait_time=0.1)
except Exception as e:
logger.error(" Error parsing %s: %s", page_url, e)
return []
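The `repair_street`, `repair_zip`, `repair_locality`, and `repair_name` helpers now return a third element, a list of alternate values, which the loop above appends to the facility's `other_*` fields. Their implementations live in `ice_scrapers/utils.py` and are not part of this diff; a rough sketch of the new return shape, with purely illustrative cleanup logic, could be:

```python
# Hypothetical sketch of the new (value, was_cleaned, alternates) return shape.
# The real repair_* helpers are in ice_scrapers/utils.py and are not shown here.
def repair_street(street: str, locality: str) -> tuple[str, bool, list[str]]:
    others: list[str] = []          # alternate streets discovered while cleaning
    repaired = street.strip()
    if ";" in repaired:
        # keep the first street, remember the rest as alternates (illustrative rule only)
        first, *rest = [part.strip() for part in repaired.split(";")]
        repaired, others = first, rest
    cleaned = repaired != street    # locality is unused in this toy rule
    return repaired, cleaned, others


street, cleaned, other_st = repair_street("123 Main St; PO Box 9", "Springfield")
# -> ("123 Main St", True, ["PO Box 9"])
```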
5 changes: 2 additions & 3 deletions ice_scrapers/field_offices.py
@@ -14,7 +14,7 @@
import time
from utils import (
logger,
session,
req_get,
)
from .utils import get_ice_scrape_pages

@@ -45,8 +45,7 @@ def _scrape_page(page_url: str) -> list[dict]:
"""Scrape a single page of facilities using BeautifulSoup"""
logger.debug(" Fetching: %s", page_url)
try:
response = session.get(page_url, timeout=30)
response.raise_for_status()
response = req_get(page_url, timeout=30)
except Exception as e:
logger.error(" Error parsing %s: %s", page_url, e)
return []