Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions data_url/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
import base64

DATA_URL_RE = re.compile(
r"data:(?P<MIME>([\w-]+\/[\w+\.-]+(;[\w-]+\=[\w-]+)?)?)(?P<encoded>;base64)?,(?P<data>[\w\d.~%\=\/\+-]+)"
r"""
data: # literal data:
(?P<MIME>[a-z][a-z0-9\-]+/[a-z][\w\-\.\+]+)? # optional media type
(?P<parameters>(?:;[\w\-\.+]+=[\w\-\.+%]+)*) # optional attribute=values, value can be url encoded
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Parameter values are also allowed to be the quoted-string token defined in RFC 822, so if we are going to add parameterization here, we should probably accept those values as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going off of https://www.rfc-editor.org/rfc/rfc2397.html and didn't see quoted strings there. Do you think it could still be a useful advance without them? It seems like a fringe case of a fringe case to me.

Personally, I'm using the parameters functionality to store a filename for the encoded data, and the charset may also be useful in the future.

If you can, please look at the follow-up PR, as this one doesn't exist in isolation.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with that for now; it's just something to keep in mind. That RFC imports the "value" token from RFC 2045, which defines a value as either a token or a quoted-string.

(?P<encoded>;base64)?, # optional base64 flag
(?P<data>[\w\d.~%\=\/\+-]+) # the data
""",
re.MULTILINE | re.VERBOSE
)


def construct_data_url(mime_type, base64_encoded, data):
"""
Helper method for just creating a data URL from some data. If this
Expand Down Expand Up @@ -47,8 +55,9 @@ def from_url(cls, url):
"""
data_url = cls()
data_url._url = url
data_url.__parse_url()
return data_url
if data_url.__parse_url():
return data_url
return None

@classmethod
def from_data(cls, mime_type, base64_encoded, data):
Expand Down Expand Up @@ -107,13 +116,16 @@ def __parse_url(self):
"""Parses a data URL to get each individual element and sets the
respecting class attributes."""
match = DATA_URL_RE.fullmatch(self._url)
self._is_base64_encoded = match.group('encoded') is not None
self._mime_type = match.group("MIME")
raw_data = match.group('data')
if self._is_base64_encoded:
self._data = base64.b64decode(raw_data)
else:
self._data = raw_data
if match:
self._is_base64_encoded = match.group('encoded') is not None
self._mime_type = match.group("MIME") or ""
raw_data = match.group('data')
if self._is_base64_encoded:
self._data = base64.b64decode(raw_data)
else:
self._data = raw_data
return True
return False

def __construct_url(self):
"""Constructs an actual data URL string from class attributes."""
Expand Down
4 changes: 4 additions & 0 deletions test/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ def test_construct_data_url(self):
self.assertEqual(raw_data, deconstructed_url.data)
self.assertEqual(data, deconstructed_url.encoded_data)

def test_non_compliant_url(self):
    """A string that is not a data URL makes ``DataURL.from_url`` return None."""
    url = DataURL.from_url("not a url")
    # Use the TestCase assertion (like the other tests in this file) rather
    # than a bare ``assert``, which is stripped when Python runs with -O.
    self.assertIsNone(url)

class TestFromData(unittest.TestCase):
def test_typing(self):
with self.assertRaises(Exception) as context:
Expand Down