diff --git a/.gitignore b/.gitignore index f582195..b9e02ae 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ pip-log.txt build dist MANIFEST + +__pycache__/ +*.pyc diff --git a/test_urlnorm.py b/test_urlnorm.py index e1f7916..99c25a5 100644 --- a/test_urlnorm.py +++ b/test_urlnorm.py @@ -23,7 +23,7 @@ def pytest_generate_tests(metafunc): 'http://test.example/?a=%e3%82%82%26': 'http://test.example/?a=\xe3\x82\x82%26'.decode('utf8'), # should return a unicode character # note: this breaks the internet for parameters that are positional (stupid nextel) and/or don't have an = sign # 'http://test.example/?a=1&b=2&a=3': 'http://test.example/?a=1&a=3&b=2', # should be in sorted/grouped order - + # 'http://s.xn--q-bga.de/': 'http://s.q\xc3\xa9.de/'.decode('utf8'), # should be in idna format 'http://test.example/?': 'http://test.example/', # no trailing ? 'http://test.example?': 'http://test.example/', # with trailing / @@ -36,7 +36,7 @@ def pytest_generate_tests(metafunc): # u'http://xn--q-bga.XBLA\u306eXbox.com'.encode('utf8'): 'http://q\xc3\xa9.xbla\xe3\x81\xaexbox.com'.decode('utf8'), 'http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3': 'http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3'.decode('utf8'), 'http://test.example/\xe3\x82\xad': 'http://test.example/\xe3\x82\xad', - + # check that %23 (#) is not escaped where it shouldn't be 'http://test.example/?p=%23val#test-%23-val%25': 'http://test.example/?p=%23val#test-%23-val%25', # check that %20 or %25 is not unescaped to ' ' or % @@ -49,13 +49,16 @@ def pytest_generate_tests(metafunc): "http://[::ffff:192.168.1.1]:80/test" : "http://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation "htTps://[::fFff:192.168.1.1]:443/test" : "https://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation - # python 2.5 urlparse doesn't handle unknown protocols, so skipping this for now - #"itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw" : "itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw", #can handle itms:// + 'http://localhost/': 'http://localhost/', + 'http://localhost:8080/': 'http://localhost:8080/', + 'homefeedapps://pinterest/': 'homefeedapps://pinterest/', # can handle Android deep link + 'mailto:me@pinterest.com': 'mailto:me@pinterest.com', # can handle mailto: + "itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw" : "itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw", #can handle itms:// } for bad, good in tests.items(): metafunc.addcall(funcargs=dict(bad=bad, good=good)) - + elif metafunc.function == test_unquote: for bad, good, unsafe in ( ('%20', ' ', ''), @@ -63,12 +66,12 @@ def pytest_generate_tests(metafunc): ('%E3%82%AD', u'\u30ad', ''), ): metafunc.addcall(funcargs=dict(bad=bad, good=good, unsafe=unsafe)) - + elif metafunc.function in [test_invalid_urls]: for url in [ - 'http://http://www.exemple.com/', # invalid domain '-', 'asdf', + 'http://./', 'HTTP://4294967297/test', # one more than max ip > int 'http://[img]http://i790.photobucket.com/albums/yy185/zack-32009/jordan.jpg[/IMG]', ]: diff --git a/urlnorm.py b/urlnorm.py index 481a1d6..633c516 100644 --- a/urlnorm.py +++ b/urlnorm.py @@ -13,7 +13,7 @@ * unescaping any percent escape sequences (where possible) * upercase percent escape (ie: %3f => %3F) * converts spaces to %20 - * converts ip encoded as an integer to dotted quad notation + * converts ip encoded as an integer to dotted quad notation Available functions: norm - given a URL (string), returns a normalized URL @@ -36,7 +36,7 @@ 0.92 - unknown schemes now pass the port through silently 0.91 - general cleanup - changed dictionaries to lists where appropriate - - more fine-grained authority parsing and normalisation + - more fine-grained authority parsing and normalisation """ __license__ = """ @@ -156,12 +156,12 @@ def norm_tuple(scheme, authority, path, parameters, query, fragment): if not scheme: raise InvalidUrl('missing URL scheme') authority = norm_netloc(scheme, authority) - if not authority: + if not authority and scheme != 'mailto': raise InvalidUrl('missing netloc') path = norm_path(scheme, path) # TODO: put query in sorted order; or at least group parameters together # Note that some websites use positional parameters or the name part of a query so this would break the internet - # query = urlencode(parse_qs(query, keep_blank_values=1), doseq=1) + # query = urlencode(parse_qs(query, keep_blank_values=1), doseq=1) parameters = unquote_params(parameters) query = unquote_qs(query) fragment = unquote_fragment(fragment) @@ -190,14 +190,14 @@ def int2ip(ipnum): ip3 = ipnum >> 8 & 0xFF ip4 = ipnum & 0xFF return "%d.%d.%d.%d" % (ip1, ip2, ip3, ip4) - + def norm_netloc(scheme, netloc): if not netloc: return netloc match = _server_authority.match(netloc) if not match: raise InvalidUrl('no host in netloc %r' % netloc) - + userinfo, host, port = match.groups() # catch a few common errors: if host.isdigit(): @@ -207,16 +207,16 @@ def norm_netloc(scheme, netloc): raise InvalidUrl('host %r does not escape to a valid ip' % host) if host[-1] == '.': host = host[:-1] - + # bracket check is for ipv6 hosts - if '.' not in host and not (host[0] == '[' and host[-1] == ']'): + if not host or ('.' not in host and not (host[0] == '[' and host[-1] == ']')): raise InvalidUrl('host %r is not valid' % host) - + authority = lower(host) if 'xn--' in authority: subdomains = [_idn(subdomain) for subdomain in authority.split('.')] authority = '.'.join(subdomains) - + if userinfo: authority = "%s@%s" % (userinfo, authority) if port and port != _default_port.get(scheme, None):