From 3cb13a3d1c3413ee785da3f201474fb9d3976d75 Mon Sep 17 00:00:00 2001 From: JurekBauer Date: Tue, 9 Dec 2025 21:23:24 +0100 Subject: [PATCH] changes to understat; use api --- nbs/understat.ipynb | 97 ++++++++++++++++++++++++++++++++----- understatdb/understat.py | 101 +++++++++++++++++++++++++++++++++------ 2 files changed, 172 insertions(+), 26 deletions(-) diff --git a/nbs/understat.ipynb b/nbs/understat.ipynb index 5e644ed..7c96e05 100644 --- a/nbs/understat.ipynb +++ b/nbs/understat.ipynb @@ -57,21 +57,46 @@ "\n", "def fetch_html(url):\n", " \"\"\"\n", - " Fetch HTML and decode into a `bs4.BeautifulSoup` object\n", + " Fetch HTML and decode into a `bs4.BeautifulSoup` object.\n", + " Returns None if the page returns a 404 error (page not found).\n", " \"\"\"\n", - " r = requests.get(url)\n", - " r.raise_for_status()\n", - " return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')\n", + " try:\n", + " r = requests.get(url)\n", + " r.raise_for_status()\n", + " return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')\n", + " except requests.exceptions.HTTPError as e:\n", + " if e.response.status_code == 404:\n", + " # Return None for 404 errors (page not found)\n", + " return None\n", + " else:\n", + " # Re-raise other HTTP errors\n", + " raise\n", " \n", " \n", "def extract_json(soup, json_var):\n", " \"\"\" Extract a JSON variable from understat HTML. \"\"\"\n", - " node, *__ = [s for s in soup.select('script') if s.string and json_var in s.string]\n", + " if soup is None:\n", + " raise ValueError(f\"Cannot extract {json_var}: page not found (404)\")\n", + " \n", + " matching_scripts = [s for s in soup.select('script') if s.string and json_var in s.string]\n", + " \n", + " if not matching_scripts:\n", + " raise ValueError(\n", + " f\"Cannot find variable '{json_var}' in page HTML. \"\n", + " \"Understat may have changed their website structure to load data dynamically. 
\"\n", + " \"You may need to use a headless browser (e.g., Selenium) to wait for JavaScript to load.\"\n", + " )\n", + " \n", + " node = matching_scripts[0]\n", " \n", " # Clean string by removing and newlines (\\n) and tabs (\\t)\n", " node_string = ' '.join(node.string.split())\n", " \n", - " json_value = re.match(f\"var {json_var} = JSON\\.parse\\(\\'(?P.*?)\\'\\)\", node_string).group('json')\n", + " match = re.match(f\"var {json_var} = JSON\\.parse\\(\\'(?P.*?)\\'\\)\", node_string)\n", + " if not match:\n", + " raise ValueError(f\"Could not parse {json_var} from script tag. Pattern may have changed.\")\n", + " \n", + " json_value = match.group('json')\n", " return json.loads(json_value)" ] }, @@ -125,14 +150,62 @@ " \n", " def matches(self, league: League, season: int):\n", " \"\"\" Fetch match data for a given `league` and `season` (start year). \"\"\"\n", - " league_url = f'{self.base_url}/league/{league.value}/{season}'\n", - " soup = fetch_html(league_url)\n", - " return extract_json(soup, 'datesData')\n", + " # Use the API endpoint directly instead of parsing HTML\n", + " api_url = f'{self.base_url}/getLeagueData/{league.value}/{season}'\n", + " \n", + " # Headers required for the API to work\n", + " headers = {\n", + " 'Referer': f'{self.base_url}/league/{league.value}/{season}',\n", + " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',\n", + " 'Accept': 'application/json, text/javascript, */*; q=0.01',\n", + " 'X-Requested-With': 'XMLHttpRequest',\n", + " }\n", + " \n", + " try:\n", + " r = requests.get(api_url, headers=headers, timeout=30)\n", + " r.raise_for_status()\n", + " data = r.json()\n", + " # Return the dates data (matches)\n", + " return data.get('dates', [])\n", + " except requests.exceptions.HTTPError as e:\n", + " if e.response.status_code == 404:\n", + " # Fallback to old method for backwards compatibility\n", + " league_url = f'{self.base_url}/league/{league.value}/{season}'\n", + " soup = 
fetch_html(league_url)\n", + " return extract_json(soup, 'datesData')\n", + " else:\n", + " raise\n", " \n", " def shots(self, match_id: int):\n", - " match_url = f'{self.base_url}/match/{match_id}'\n", - " soup = fetch_html(match_url)\n", - " return extract_json(soup, 'shotsData')" + " \"\"\" Fetch shot data for a given `match_id`. \"\"\"\n", + " # Use the API endpoint directly instead of parsing HTML\n", + " api_url = f'{self.base_url}/getMatchData/{match_id}'\n", + " \n", + " # Headers required for the API to work\n", + " headers = {\n", + " 'Referer': f'{self.base_url}/match/{match_id}',\n", + " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',\n", + " 'Accept': 'application/json, text/javascript, */*; q=0.01',\n", + " 'X-Requested-With': 'XMLHttpRequest',\n", + " }\n", + " \n", + " try:\n", + " r = requests.get(api_url, headers=headers, timeout=30)\n", + " r.raise_for_status()\n", + " data = r.json()\n", + " # Return the shots data\n", + " return data.get('shots', {\"h\": [], \"a\": []})\n", + " except requests.exceptions.HTTPError as e:\n", + " if e.response.status_code == 404:\n", + " # Match doesn't exist - return empty shots structure\n", + " return {\"h\": [], \"a\": []}\n", + " else:\n", + " # Fallback to old method for backwards compatibility\n", + " match_url = f'{self.base_url}/match/{match_id}'\n", + " soup = fetch_html(match_url)\n", + " if soup is None:\n", + " return {\"h\": [], \"a\": []}\n", + " return extract_json(soup, 'shotsData')" ] }, { diff --git a/understatdb/understat.py b/understatdb/understat.py index 14af925..9eb5bd9 100644 --- a/understatdb/understat.py +++ b/understatdb/understat.py @@ -13,21 +13,46 @@ def fetch_html(url): """ - Fetch HTML and decode into a `bs4.BeautifulSoup` object + Fetch HTML and decode into a `bs4.BeautifulSoup` object. + Returns None if the page returns a 404 error (page not found). 
""" - r = requests.get(url) - r.raise_for_status() - return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser') + try: + r = requests.get(url) + r.raise_for_status() + return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser') + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + # Return None for 404 errors (page not found) + return None + else: + # Re-raise other HTTP errors + raise def extract_json(soup, json_var): """ Extract a JSON variable from understat HTML. """ - node, *__ = [s for s in soup.select('script') if s.string and json_var in s.string] - + if soup is None: + raise ValueError(f"Cannot extract {json_var}: page not found (404)") + + matching_scripts = [s for s in soup.select('script') if s.string and json_var in s.string] + + if not matching_scripts: + raise ValueError( + f"Cannot find variable '{json_var}' in page HTML. " + "Understat may have changed their website structure to load data dynamically. " + "You may need to use a headless browser (e.g., Selenium) to wait for JavaScript to load." + ) + + node = matching_scripts[0] + # Clean string by removing and newlines (\n) and tabs (\t) node_string = ' '.join(node.string.split()) - - json_value = re.match(f"var {json_var} = JSON\.parse\(\'(?P<json>.*?)\'\)", node_string).group('json') + + match = re.match(f"var {json_var} = JSON\.parse\(\'(?P<json>.*?)\'\)", node_string) + if not match: + raise ValueError(f"Could not parse {json_var} from script tag. Pattern may have changed.") + + json_value = match.group('json') return json.loads(json_value) # Cell @@ -57,11 +82,59 @@ def __init__(self, base_url: str='https://understat.com'): def matches(self, league: League, season: int): """ Fetch match data for a given `league` and `season` (start year). 
""" - league_url = f'{self.base_url}/league/{league.value}/{season}' - soup = fetch_html(league_url) - return extract_json(soup, 'datesData') + # Use the API endpoint directly instead of parsing HTML + api_url = f'{self.base_url}/getLeagueData/{league.value}/{season}' + + # Headers required for the API to work + headers = { + 'Referer': f'{self.base_url}/league/{league.value}/{season}', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'X-Requested-With': 'XMLHttpRequest', + } + + try: + r = requests.get(api_url, headers=headers, timeout=30) + r.raise_for_status() + data = r.json() + # Return the dates data (matches) + return data.get('dates', []) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + # Fallback to old method for backwards compatibility + league_url = f'{self.base_url}/league/{league.value}/{season}' + soup = fetch_html(league_url) + return extract_json(soup, 'datesData') + else: + raise def shots(self, match_id: int): - match_url = f'{self.base_url}/match/{match_id}' - soup = fetch_html(match_url) - return extract_json(soup, 'shotsData') \ No newline at end of file + """ Fetch shot data for a given `match_id`. 
""" + # Use the API endpoint directly instead of parsing HTML + api_url = f'{self.base_url}/getMatchData/{match_id}' + + # Headers required for the API to work + headers = { + 'Referer': f'{self.base_url}/match/{match_id}', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'X-Requested-With': 'XMLHttpRequest', + } + + try: + r = requests.get(api_url, headers=headers, timeout=30) + r.raise_for_status() + data = r.json() + # Return the shots data + return data.get('shots', {"h": [], "a": []}) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + # Match doesn't exist - return empty shots structure + return {"h": [], "a": []} + else: + # Fallback to old method for backwards compatibility + match_url = f'{self.base_url}/match/{match_id}' + soup = fetch_html(match_url) + if soup is None: + return {"h": [], "a": []} + return extract_json(soup, 'shotsData') \ No newline at end of file