From 3cb13a3d1c3413ee785da3f201474fb9d3976d75 Mon Sep 17 00:00:00 2001 From: JurekBauer Date: Tue, 9 Dec 2025 21:23:24 +0100 Subject: [PATCH] changes to understat; use api --- nbs/understat.ipynb | 97 ++++++++++++++++++++++++++++++++----- understatdb/understat.py | 101 +++++++++++++++++++++++++++++++++------ 2 files changed, 172 insertions(+), 26 deletions(-) diff --git a/nbs/understat.ipynb b/nbs/understat.ipynb index 5e644ed..7c96e05 100644 --- a/nbs/understat.ipynb +++ b/nbs/understat.ipynb @@ -57,21 +57,46 @@ "\n", "def fetch_html(url):\n", " \"\"\"\n", - " Fetch HTML and decode into a `bs4.BeautifulSoup` object\n", + " Fetch HTML and decode into a `bs4.BeautifulSoup` object.\n", + " Returns None if the page returns a 404 error (page not found).\n", " \"\"\"\n", - " r = requests.get(url)\n", - " r.raise_for_status()\n", - " return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')\n", + " try:\n", + " r = requests.get(url)\n", + " r.raise_for_status()\n", + " return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')\n", + " except requests.exceptions.HTTPError as e:\n", + " if e.response.status_code == 404:\n", + " # Return None for 404 errors (page not found)\n", + " return None\n", + " else:\n", + " # Re-raise other HTTP errors\n", + " raise\n", " \n", " \n", "def extract_json(soup, json_var):\n", " \"\"\" Extract a JSON variable from understat HTML. \"\"\"\n", - " node, *__ = [s for s in soup.select('script') if s.string and json_var in s.string]\n", + " if soup is None:\n", + " raise ValueError(f\"Cannot extract {json_var}: page not found (404)\")\n", + " \n", + " matching_scripts = [s for s in soup.select('script') if s.string and json_var in s.string]\n", + " \n", + " if not matching_scripts:\n", + " raise ValueError(\n", + " f\"Cannot find variable '{json_var}' in page HTML. \"\n", + " \"Understat may have changed their website structure to load data dynamically. 
\"\n", + " \"You may need to use a headless browser (e.g., Selenium) to wait for JavaScript to load.\"\n", + " )\n", + " \n", + " node = matching_scripts[0]\n", " \n", " # Clean string by removing and newlines (\\n) and tabs (\\t)\n", " node_string = ' '.join(node.string.split())\n", " \n", - " json_value = re.match(f\"var {json_var} = JSON\\.parse\\(\\'(?P.*?)\\'\\)\", node_string).group('json')\n", + " match = re.match(f\"var {json_var} = JSON\\.parse\\(\\'(?P.*?)\\'\\)\", node_string)\n", + " if not match:\n", + " raise ValueError(f\"Could not parse {json_var} from script tag. Pattern may have changed.\")\n", + " \n", + " json_value = match.group('json')\n", " return json.loads(json_value)" ] }, @@ -125,14 +150,62 @@ " \n", " def matches(self, league: League, season: int):\n", " \"\"\" Fetch match data for a given `league` and `season` (start year). \"\"\"\n", - " league_url = f'{self.base_url}/league/{league.value}/{season}'\n", - " soup = fetch_html(league_url)\n", - " return extract_json(soup, 'datesData')\n", + " # Use the API endpoint directly instead of parsing HTML\n", + " api_url = f'{self.base_url}/getLeagueData/{league.value}/{season}'\n", + " \n", + " # Headers required for the API to work\n", + " headers = {\n", + " 'Referer': f'{self.base_url}/league/{league.value}/{season}',\n", + " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',\n", + " 'Accept': 'application/json, text/javascript, */*; q=0.01',\n", + " 'X-Requested-With': 'XMLHttpRequest',\n", + " }\n", + " \n", + " try:\n", + " r = requests.get(api_url, headers=headers, timeout=30)\n", + " r.raise_for_status()\n", + " data = r.json()\n", + " # Return the dates data (matches)\n", + " return data.get('dates', [])\n", + " except requests.exceptions.HTTPError as e:\n", + " if e.response.status_code == 404:\n", + " # Fallback to old method for backwards compatibility\n", + " league_url = f'{self.base_url}/league/{league.value}/{season}'\n", + " soup = 
fetch_html(league_url)\n", + " return extract_json(soup, 'datesData')\n", + " else:\n", + " raise\n", " \n", " def shots(self, match_id: int):\n", - " match_url = f'{self.base_url}/match/{match_id}'\n", - " soup = fetch_html(match_url)\n", - " return extract_json(soup, 'shotsData')" + " \"\"\" Fetch shot data for a given `match_id`. \"\"\"\n", + " # Use the API endpoint directly instead of parsing HTML\n", + " api_url = f'{self.base_url}/getMatchData/{match_id}'\n", + " \n", + " # Headers required for the API to work\n", + " headers = {\n", + " 'Referer': f'{self.base_url}/match/{match_id}',\n", + " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',\n", + " 'Accept': 'application/json, text/javascript, */*; q=0.01',\n", + " 'X-Requested-With': 'XMLHttpRequest',\n", + " }\n", + " \n", + " try:\n", + " r = requests.get(api_url, headers=headers, timeout=30)\n", + " r.raise_for_status()\n", + " data = r.json()\n", + " # Return the shots data\n", + " return data.get('shots', {\"h\": [], \"a\": []})\n", + " except requests.exceptions.HTTPError as e:\n", + " if e.response.status_code == 404:\n", + " # Match doesn't exist - return empty shots structure\n", + " return {\"h\": [], \"a\": []}\n", + " else:\n", + " # Fallback to old method for backwards compatibility\n", + " match_url = f'{self.base_url}/match/{match_id}'\n", + " soup = fetch_html(match_url)\n", + " if soup is None:\n", + " return {\"h\": [], \"a\": []}\n", + " return extract_json(soup, 'shotsData')" ] }, { diff --git a/understatdb/understat.py b/understatdb/understat.py index 14af925..9eb5bd9 100644 --- a/understatdb/understat.py +++ b/understatdb/understat.py @@ -13,21 +13,46 @@ def fetch_html(url): """ - Fetch HTML and decode into a `bs4.BeautifulSoup` object + Fetch HTML and decode into a `bs4.BeautifulSoup` object. + Returns None if the page returns a 404 error (page not found). 
""" - r = requests.get(url) - r.raise_for_status() - return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser') + try: + r = requests.get(url) + r.raise_for_status() + return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser') + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + # Return None for 404 errors (page not found) + return None + else: + # Re-raise other HTTP errors + raise def extract_json(soup, json_var): """ Extract a JSON variable from understat HTML. """ - node, *__ = [s for s in soup.select('script') if s.string and json_var in s.string] - + if soup is None: + raise ValueError(f"Cannot extract {json_var}: page not found (404)") + + matching_scripts = [s for s in soup.select('script') if s.string and json_var in s.string] + + if not matching_scripts: + raise ValueError( + f"Cannot find variable '{json_var}' in page HTML. " + "Understat may have changed their website structure to load data dynamically. " + "You may need to use a headless browser (e.g., Selenium) to wait for JavaScript to load." + ) + + node = matching_scripts[0] + # Clean string by removing and newlines (\n) and tabs (\t) node_string = ' '.join(node.string.split()) - - json_value = re.match(f"var {json_var} = JSON\.parse\(\'(?P<json>.*?)\'\)", node_string).group('json') + + match = re.match(f"var {json_var} = JSON\.parse\(\'(?P<json>.*?)\'\)", node_string) + if not match: + raise ValueError(f"Could not parse {json_var} from script tag. Pattern may have changed.") + + json_value = match.group('json') return json.loads(json_value) # Cell @@ -57,11 +82,59 @@ def __init__(self, base_url: str='https://understat.com'): def matches(self, league: League, season: int): """ Fetch match data for a given `league` and `season` (start year). 
""" - league_url = f'{self.base_url}/league/{league.value}/{season}' - soup = fetch_html(league_url) - return extract_json(soup, 'datesData') + # Use the API endpoint directly instead of parsing HTML + api_url = f'{self.base_url}/getLeagueData/{league.value}/{season}' + + # Headers required for the API to work + headers = { + 'Referer': f'{self.base_url}/league/{league.value}/{season}', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'X-Requested-With': 'XMLHttpRequest', + } + + try: + r = requests.get(api_url, headers=headers, timeout=30) + r.raise_for_status() + data = r.json() + # Return the dates data (matches) + return data.get('dates', []) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + # Fallback to old method for backwards compatibility + league_url = f'{self.base_url}/league/{league.value}/{season}' + soup = fetch_html(league_url) + return extract_json(soup, 'datesData') + else: + raise def shots(self, match_id: int): - match_url = f'{self.base_url}/match/{match_id}' - soup = fetch_html(match_url) - return extract_json(soup, 'shotsData') \ No newline at end of file + """ Fetch shot data for a given `match_id`. 
""" + # Use the API endpoint directly instead of parsing HTML + api_url = f'{self.base_url}/getMatchData/{match_id}' + + # Headers required for the API to work + headers = { + 'Referer': f'{self.base_url}/match/{match_id}', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'X-Requested-With': 'XMLHttpRequest', + } + + try: + r = requests.get(api_url, headers=headers, timeout=30) + r.raise_for_status() + data = r.json() + # Return the shots data + return data.get('shots', {"h": [], "a": []}) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + # Match doesn't exist - return empty shots structure + return {"h": [], "a": []} + else: + # Fallback to old method for backwards compatibility + match_url = f'{self.base_url}/match/{match_id}' + soup = fetch_html(match_url) + if soup is None: + return {"h": [], "a": []} + return extract_json(soup, 'shotsData') \ No newline at end of file