diff --git a/.gitignore b/.gitignore index b9e02ae..1b692ae 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ pip-log.txt build dist MANIFEST - -__pycache__/ *.pyc +.pytest_cache/ +.tox/ +__pycache__/ diff --git a/setup.py b/setup.py index 5d0672d..3b9915a 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from distutils.core import setup # also update in urlnorm.py -version = '1.1.2.pinterest3' +version = '1.1.2.pinterest4' setup(name='urlnorm', version=version, @@ -9,8 +9,15 @@ description="Normalize a URL to a standard unicode encoding", py_modules=['urlnorm'], license='MIT License', + install_requires=['six'], author='Jehiah Czebotar', author_email='jehiah@gmail.com', url='http://github.com/jehiah/urlnorm', download_url="http://github.com/downloads/jehiah/urlnorm/urlnorm-%s.tar.gz" % version, + classifiers=[ + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + ], ) diff --git a/test_urlnorm.py b/test_urlnorm.py index 99c25a5..e4baff3 100644 --- a/test_urlnorm.py +++ b/test_urlnorm.py @@ -1,6 +1,8 @@ +# -*- coding: utf8 -*- """ this is a py.test test file """ +from __future__ import print_function import urlnorm from urlnorm import _unicode @@ -20,7 +22,7 @@ def pytest_generate_tests(metafunc): 'http://USER:pass@www.Example.COM/foo/bar': 'http://USER:pass@www.example.com/foo/bar', 'http://www.example.com./': 'http://www.example.com/', 'http://test.example/?a=%26&b=1': 'http://test.example/?a=%26&b=1', # should not un-encode the & that is part of a parameter value - 'http://test.example/?a=%e3%82%82%26': 'http://test.example/?a=\xe3\x82\x82%26'.decode('utf8'), # should return a unicode character + 'http://test.example/?a=%e3%82%82%26': u'http://test.example/?a=\u3082%26', # should return a unicode character # note: this breaks the internet for parameters that are positional (stupid nextel) and/or don't have an = sign # 'http://test.example/?a=1&b=2&a=3': 'http://test.example/?a=1&a=3&b=2', # should be in sorted/grouped order @@ -29,12 +31,12 @@ def pytest_generate_tests(metafunc): 'http://test.example?': 'http://test.example/', # with trailing / 'http://a.COM/path/?b&a' : 'http://a.com/path/?b&a', # test utf8 and unicode - u'http://XBLA\u306eXbox.com': 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'), - u'http://XBLA\u306eXbox.com'.encode('utf8'): 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'), - u'http://XBLA\u306eXbox.com': 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'), + u'http://XBLA\u306eXbox.com': u'http://xbla\u306exbox.com/', + u'http://XBLA\u306eXbox.com'.encode('utf8'): u'http://xbla\u306exbox.com/', + u'http://XBLA\u306eXbox.com': u'http://xbla\u306exbox.com/', # test idna + utf8 domain # u'http://xn--q-bga.XBLA\u306eXbox.com'.encode('utf8'): 'http://q\xc3\xa9.xbla\xe3\x81\xaexbox.com'.decode('utf8'), - 'http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3': 'http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3'.decode('utf8'), + 'http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3': u'http://ja.wikipedia.org/wiki/\u30ad\u30e3\u30bf\u30d4\u30e9\u30fc\u30b8\u30e3\u30d1\u30f3', 'http://test.example/\xe3\x82\xad': 'http://test.example/\xe3\x82\xad', # check that %23 (#) is not escaped where it shouldn't be @@ -42,6 +44,7 @@ def pytest_generate_tests(metafunc): # check that %20 or %25 is not unescaped to ' ' or % 'http://test.example/%25/?p=%20val%20%25' : 'http://test.example/%25/?p=%20val%20%25', "http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n" : "http://test.domain/I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xef\xbf\xbdliz\xc3\xa6ti\xc3\xb8n", + "http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n" : u"http://test.domain/Iñtërnâtiôn�lizætiøn", # check that spaces are collated to '+' "http://test.example/path/with a%20space+/" : "http://test.example/path/with%20a%20space+/", "http://[2001:db8:1f70::999:de8:7648:6e8]/test" : "http://[2001:db8:1f70::999:de8:7648:6e8]/test", #ipv6 address @@ -107,7 +110,7 @@ def pytest_generate_tests(metafunc): def test_invalid_urls(url): try: output = urlnorm.norm(url) - print '%r' % output + print('%r' % output) except urlnorm.InvalidUrl: return assert 1 == 0, "this should have raised an InvalidUrl exception" diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..f53598a --- /dev/null +++ b/tox.ini @@ -0,0 +1,6 @@ +[tox] +envlist=py27,py3 + +[testenv] +deps = pytest +commands = pytest test_urlnorm.py diff --git a/urlnorm.py b/urlnorm.py index e8ce93d..fe8b0bd 100644 --- a/urlnorm.py +++ b/urlnorm.py @@ -39,6 +39,10 @@ - more fine-grained authority parsing and normalisation """ +from __future__ import absolute_import +from six import unichr +import six +from six.moves import range __license__ = """ Copyright (c) 1999-2002 Mark Nottingham Copyright (c) 2010 Jehiah Czebotar @@ -63,10 +67,9 @@ """ # also update in setup.py -__version__ = "1.1.2.pinterest2" +__version__ = "1.1.2.pinterest4" -from urlparse import urlparse, urlunparse -from string import lower +from six.moves.urllib.parse import urlparse, urlunparse, unquote import re class InvalidUrl(Exception): @@ -105,8 +108,8 @@ class InvalidUrl(Exception): qs_unsafe_list = ' ?&=+%#' fragment_unsafe_list = ' +%#' path_unsafe_list = ' /?;%+#' -_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) -_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) +_hextochr = dict((b'%02x' % i, six.int2byte(i)) for i in range(256)) +_hextochr.update((b'%02X' % i, six.int2byte(i)) for i in range(256)) def unquote_path(s): return unquote_safe(s, path_unsafe_list) @@ -124,22 +127,23 @@ def unquote_safe(s, unsafe_list): """unquote percent escaped string except for percent escape sequences that are in unsafe_list""" # note: this build utf8 raw strings ,then does a .decode('utf8') at the end. # as a result it's doing .encode('utf8') on each block of the string as it's processed. - res = _utf8(s).split('%') - for i in xrange(1, len(res)): + unsafe_list = [_utf8(i) for i in unsafe_list] + res = _utf8(s).split(b'%') + for i in range(1, len(res)): item = res[i] try: raw_chr = _hextochr[item[:2]] if raw_chr in unsafe_list or ord(raw_chr) < 20: # leave it unescaped (but uppercase the percent escape) - res[i] = '%' + item[:2].upper() + item[2:] + res[i] = b'%' + item[:2].upper() + item[2:] else: res[i] = raw_chr + item[2:] except KeyError: - res[i] = '%' + item + res[i] = b'%' + item except UnicodeDecodeError: # note: i'm not sure what this does res[i] = unichr(int(item[:2], 16)) + item[2:] - o = "".join(res) + o = b"".join(res) return _unicode(o) def norm(url): @@ -151,7 +155,7 @@ def norm(url): def norm_tuple(scheme, authority, path, parameters, query, fragment): """given individual url components, return its normalized form""" - scheme = lower(scheme) + scheme = scheme.lower() if not scheme: raise InvalidUrl('missing URL scheme') authority = norm_netloc(scheme, authority) @@ -193,7 +197,7 @@ def norm_path(scheme, path): return '/' return path -MAX_IP=0xffffffffL +MAX_IP=0xffffffff def int2ip(ipnum): assert isinstance(ipnum, int) if MAX_IP < ipnum or ipnum < 0: @@ -221,7 +225,7 @@ def norm_netloc(scheme, netloc): if host[-1] == '.': host = host[:-1] - authority = lower(host) + authority = host.lower() if 'xn--' in authority: subdomains = [_idn(subdomain) for subdomain in authority.split('.')] authority = '.'.join(subdomains) @@ -243,14 +247,14 @@ def _idn(subdomain): def _utf8(value): - if isinstance(value, unicode): + if isinstance(value, six.text_type): return value.encode("utf-8") assert isinstance(value, str) return value def _unicode(value): - if isinstance(value, str): + if isinstance(value, six.binary_type): return value.decode("utf-8") - assert isinstance(value, unicode) + assert isinstance(value, six.text_type) return value