Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 85 additions & 12 deletions nbs/understat.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,46 @@
"\n",
"def fetch_html(url):\n",
" \"\"\"\n",
" Fetch HTML and decode into a `bs4.BeautifulSoup` object\n",
" Fetch HTML and decode into a `bs4.BeautifulSoup` object.\n",
" Returns None if the page returns a 404 error (page not found).\n",
" \"\"\"\n",
" r = requests.get(url)\n",
" r.raise_for_status()\n",
" return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')\n",
" try:\n",
" r = requests.get(url)\n",
" r.raise_for_status()\n",
" return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')\n",
" except requests.exceptions.HTTPError as e:\n",
" if e.response.status_code == 404:\n",
" # Return None for 404 errors (page not found)\n",
" return None\n",
" else:\n",
" # Re-raise other HTTP errors\n",
" raise\n",
" \n",
" \n",
"def extract_json(soup, json_var):\n",
" \"\"\" Extract a JSON variable from understat HTML. \"\"\"\n",
" node, *__ = [s for s in soup.select('script') if s.string and json_var in s.string]\n",
" if soup is None:\n",
" raise ValueError(f\"Cannot extract {json_var}: page not found (404)\")\n",
" \n",
" matching_scripts = [s for s in soup.select('script') if s.string and json_var in s.string]\n",
" \n",
" if not matching_scripts:\n",
" raise ValueError(\n",
" f\"Cannot find variable '{json_var}' in page HTML. \"\n",
" \"Understat may have changed their website structure to load data dynamically. \"\n",
" \"You may need to use a headless browser (e.g., Selenium) to wait for JavaScript to load.\"\n",
" )\n",
" \n",
" node = matching_scripts[0]\n",
" \n",
" # Clean string by removing newlines (\\n) and tabs (\\t)\n",
" node_string = ' '.join(node.string.split())\n",
" \n",
" json_value = re.match(f\"var {json_var} = JSON\\.parse\\(\\'(?P<json>.*?)\\'\\)\", node_string).group('json')\n",
" match = re.match(f\"var {json_var} = JSON\\.parse\\(\\'(?P<json>.*?)\\'\\)\", node_string)\n",
" if not match:\n",
" raise ValueError(f\"Could not parse {json_var} from script tag. Pattern may have changed.\")\n",
" \n",
" json_value = match.group('json')\n",
" return json.loads(json_value)"
]
},
Expand Down Expand Up @@ -125,14 +150,62 @@
" \n",
" def matches(self, league: League, season: int):\n",
" \"\"\" Fetch match data for a given `league` and `season` (start year). \"\"\"\n",
" league_url = f'{self.base_url}/league/{league.value}/{season}'\n",
" soup = fetch_html(league_url)\n",
" return extract_json(soup, 'datesData')\n",
" # Use the API endpoint directly instead of parsing HTML\n",
" api_url = f'{self.base_url}/getLeagueData/{league.value}/{season}'\n",
" \n",
" # Headers required for the API to work\n",
" headers = {\n",
" 'Referer': f'{self.base_url}/league/{league.value}/{season}',\n",
" 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',\n",
" 'Accept': 'application/json, text/javascript, */*; q=0.01',\n",
" 'X-Requested-With': 'XMLHttpRequest',\n",
" }\n",
" \n",
" try:\n",
" r = requests.get(api_url, headers=headers, timeout=30)\n",
" r.raise_for_status()\n",
" data = r.json()\n",
" # Return the dates data (matches)\n",
" return data.get('dates', [])\n",
" except requests.exceptions.HTTPError as e:\n",
" if e.response.status_code == 404:\n",
" # Fallback to old method for backwards compatibility\n",
" league_url = f'{self.base_url}/league/{league.value}/{season}'\n",
" soup = fetch_html(league_url)\n",
" return extract_json(soup, 'datesData')\n",
" else:\n",
" raise\n",
" \n",
" def shots(self, match_id: int):\n",
" match_url = f'{self.base_url}/match/{match_id}'\n",
" soup = fetch_html(match_url)\n",
" return extract_json(soup, 'shotsData')"
" \"\"\" Fetch shot data for a given `match_id`. \"\"\"\n",
" # Use the API endpoint directly instead of parsing HTML\n",
" api_url = f'{self.base_url}/getMatchData/{match_id}'\n",
" \n",
" # Headers required for the API to work\n",
" headers = {\n",
" 'Referer': f'{self.base_url}/match/{match_id}',\n",
" 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',\n",
" 'Accept': 'application/json, text/javascript, */*; q=0.01',\n",
" 'X-Requested-With': 'XMLHttpRequest',\n",
" }\n",
" \n",
" try:\n",
" r = requests.get(api_url, headers=headers, timeout=30)\n",
" r.raise_for_status()\n",
" data = r.json()\n",
" # Return the shots data\n",
" return data.get('shots', {\"h\": [], \"a\": []})\n",
" except requests.exceptions.HTTPError as e:\n",
" if e.response.status_code == 404:\n",
" # Match doesn't exist - return empty shots structure\n",
" return {\"h\": [], \"a\": []}\n",
" else:\n",
" # Fallback to old method for backwards compatibility\n",
" match_url = f'{self.base_url}/match/{match_id}'\n",
" soup = fetch_html(match_url)\n",
" if soup is None:\n",
" return {\"h\": [], \"a\": []}\n",
" return extract_json(soup, 'shotsData')"
]
},
{
Expand Down
101 changes: 87 additions & 14 deletions understatdb/understat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,46 @@

def fetch_html(url):
    """
    Fetch HTML from `url` and decode it into a `bs4.BeautifulSoup` object.

    Returns None if the page returns a 404 error (page not found); any
    other HTTP error status is re-raised as `requests.exceptions.HTTPError`.
    """
    try:
        # Bounded timeout so a hung server cannot block the caller forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            # A missing page is an expected condition: signal it with None.
            return None
        # Re-raise every other HTTP error (403, 5xx, ...).
        raise
    # Understat serves pages with escaped unicode sequences; decode them
    # before parsing so the embedded JSON strings are readable.
    return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')


def extract_json(soup, json_var):
    """
    Extract a JSON variable from understat page HTML.

    Parameters
    ----------
    soup : bs4.BeautifulSoup or None
        Parsed page HTML. None indicates a 404 (see `fetch_html`).
    json_var : str
        Name of the javascript variable holding `JSON.parse(...)` data.

    Returns
    -------
    The decoded JSON value.

    Raises
    ------
    ValueError
        If the page was not found, the variable is missing from the page,
        or the embedded script no longer matches the expected pattern.
    """
    if soup is None:
        raise ValueError(f"Cannot extract {json_var}: page not found (404)")

    matching_scripts = [s for s in soup.select('script') if s.string and json_var in s.string]

    if not matching_scripts:
        raise ValueError(
            f"Cannot find variable '{json_var}' in page HTML. "
            "Understat may have changed their website structure to load data dynamically. "
            "You may need to use a headless browser (e.g., Selenium) to wait for JavaScript to load."
        )

    node = matching_scripts[0]

    # Collapse all whitespace (newlines, tabs, runs of spaces) into single
    # spaces so the regex below can match on a single line.
    node_string = ' '.join(node.string.split())

    # Raw f-string: `\.` and `\(` are regex escapes, not string escapes
    # (a plain f-string would emit invalid-escape-sequence warnings).
    match = re.match(rf"var {json_var} = JSON\.parse\('(?P<json>.*?)'\)", node_string)
    if not match:
        raise ValueError(f"Could not parse {json_var} from script tag. Pattern may have changed.")

    return json.loads(match.group('json'))

# Cell
Expand Down Expand Up @@ -57,11 +82,59 @@ def __init__(self, base_url: str='https://understat.com'):

def matches(self, league: League, season: int):
    """
    Fetch match data for a given `league` and `season` (start year).

    Queries understat's JSON API endpoint directly; if that endpoint
    returns 404, falls back to scraping the `datesData` variable out of
    the league page HTML for backwards compatibility.
    """
    # Use the API endpoint directly instead of parsing HTML.
    api_url = f'{self.base_url}/getLeagueData/{league.value}/{season}'

    # Understat's API rejects requests that do not look like an in-page
    # XHR call, so mimic one with these headers.
    headers = {
        'Referer': f'{self.base_url}/league/{league.value}/{season}',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
    }

    try:
        r = requests.get(api_url, headers=headers, timeout=30)
        r.raise_for_status()
        data = r.json()
        # 'dates' holds the per-match records; default to [] if absent.
        return data.get('dates', [])
    except requests.exceptions.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            # Endpoint not available: fall back to the old HTML-scraping
            # path for backwards compatibility.
            league_url = f'{self.base_url}/league/{league.value}/{season}'
            soup = fetch_html(league_url)
            return extract_json(soup, 'datesData')
        # Propagate every other HTTP error.
        raise

def shots(self, match_id: int):
    """
    Fetch shot data for a given `match_id`.

    Queries understat's JSON API endpoint directly. A 404 from the API
    means the match does not exist, so an empty home/away shot structure
    is returned; any other HTTP error falls back to scraping the
    `shotsData` variable out of the match page HTML.
    """
    # Use the API endpoint directly instead of parsing HTML.
    api_url = f'{self.base_url}/getMatchData/{match_id}'

    # Understat's API rejects requests that do not look like an in-page
    # XHR call, so mimic one with these headers.
    headers = {
        'Referer': f'{self.base_url}/match/{match_id}',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
    }

    try:
        r = requests.get(api_url, headers=headers, timeout=30)
        r.raise_for_status()
        data = r.json()
        # 'shots' maps 'h' (home) and 'a' (away) to lists of shot records.
        return data.get('shots', {"h": [], "a": []})
    except requests.exceptions.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            # Match doesn't exist - return the empty shots structure.
            return {"h": [], "a": []}
        # Fall back to the old HTML-scraping path for backwards
        # compatibility on other HTTP errors.
        match_url = f'{self.base_url}/match/{match_id}'
        soup = fetch_html(match_url)
        if soup is None:
            return {"h": [], "a": []}
        return extract_json(soup, 'shotsData')