From 5c3c6c67823f53f636df7a929807f67181d4aaed Mon Sep 17 00:00:00 2001 From: Mark Southern Date: Tue, 3 Jun 2025 17:24:53 -0700 Subject: [PATCH 1/2] updated regex and url parsing --- data_url/__init__.py | 34 +++++++++++++++++++++++----------- test/test_url.py | 4 ++++ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/data_url/__init__.py b/data_url/__init__.py index a8617e8..61e25ce 100644 --- a/data_url/__init__.py +++ b/data_url/__init__.py @@ -2,9 +2,17 @@ import base64 DATA_URL_RE = re.compile( - r"data:(?P([\w-]+\/[\w+\.-]+(;[\w-]+\=[\w-]+)?)?)(?P;base64)?,(?P[\w\d.~%\=\/\+-]+)" + r""" + data: # literal data: + (?P[\w\-\.+]+/[\w\-\.+]+)? # optional media type + (?P(?:;[\w\-\.+]+=[\w\-\.+%]+)*) # optional attribute=values, value can be url encoded + (?P;base64)?, # optional base64 flag + (?P[\w\d.~%\=\/\+-]+) # the data + """, + re.MULTILINE | re.VERBOSE ) + def construct_data_url(mime_type, base64_encoded, data): """ Helper method for just creating a data URL from some data. If this @@ -47,8 +55,9 @@ def from_url(cls, url): """ data_url = cls() data_url._url = url - data_url.__parse_url() - return data_url + if data_url.__parse_url(): + return data_url + return None @classmethod def from_data(cls, mime_type, base64_encoded, data): @@ -106,14 +115,17 @@ def from_byte_data(cls, mime_type, data): def __parse_url(self): """Parses a data URL to get each individual element and sets the respecting class attributes.""" - match = DATA_URL_RE.fullmatch(self._url) - self._is_base64_encoded = match.group('encoded') is not None - self._mime_type = match.group("MIME") - raw_data = match.group('data') - if self._is_base64_encoded: - self._data = base64.b64decode(raw_data) - else: - self._data = raw_data + match = DATA_URL_RE.search(self._url) + if match: + self._is_base64_encoded = match.group('encoded') is not None + self._mime_type = match.group("MIME") or "" + raw_data = match.group('data') + if self._is_base64_encoded: + self._data = base64.b64decode(raw_data) + else: + self._data = raw_data + return True + return False def __construct_url(self): """Constructs an actual data URL string from class attributes.""" diff --git a/test/test_url.py b/test/test_url.py index f9635d5..43110ed 100644 --- a/test/test_url.py +++ b/test/test_url.py @@ -60,6 +60,10 @@ def test_construct_data_url(self): self.assertEqual(raw_data, deconstructed_url.data) self.assertEqual(data, deconstructed_url.encoded_data) + def test_non_compliant_url(self): + url = DataURL.from_url("not a url") + assert url is None + class TestFromData(unittest.TestCase): def test_typing(self): with self.assertRaises(Exception) as context: From 2a93a001912020577ec8376c7f638d8f4ed4dd78 Mon Sep 17 00:00:00 2001 From: Mark Southern Date: Thu, 12 Jun 2025 21:27:57 -0700 Subject: [PATCH 2/2] more explicit mimetype type and subtype, revert re.search to re.fullmatch --- data_url/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_url/__init__.py b/data_url/__init__.py index 61e25ce..df2f304 100644 --- a/data_url/__init__.py +++ b/data_url/__init__.py @@ -4,7 +4,7 @@ DATA_URL_RE = re.compile( r""" data: # literal data: - (?P[\w\-\.+]+/[\w\-\.+]+)? # optional media type + (?P[a-z][a-z0-9\-]+/[a-z][\w\-\.\+]+)? # optional media type (?P(?:;[\w\-\.+]+=[\w\-\.+%]+)*) # optional attribute=values, value can be url encoded (?P;base64)?, # optional base64 flag (?P[\w\d.~%\=\/\+-]+) # the data @@ -115,7 +115,7 @@ def from_byte_data(cls, mime_type, data): def __parse_url(self): """Parses a data URL to get each individual element and sets the respecting class attributes.""" - match = DATA_URL_RE.search(self._url) + match = DATA_URL_RE.fullmatch(self._url) if match: self._is_base64_encoded = match.group('encoded') is not None self._mime_type = match.group("MIME") or ""