Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ pip-log.txt
build
dist
MANIFEST

__pycache__/
*.pyc
17 changes: 10 additions & 7 deletions test_urlnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def pytest_generate_tests(metafunc):
'http://test.example/?a=%e3%82%82%26': 'http://test.example/?a=\xe3\x82\x82%26'.decode('utf8'), # should return a unicode character
# note: this breaks the internet for parameters that are positional (stupid nextel) and/or don't have an = sign
# 'http://test.example/?a=1&b=2&a=3': 'http://test.example/?a=1&a=3&b=2', # should be in sorted/grouped order

# 'http://s.xn--q-bga.de/': 'http://s.q\xc3\xa9.de/'.decode('utf8'), # should be in idna format
'http://test.example/?': 'http://test.example/', # no trailing ?
'http://test.example?': 'http://test.example/', # with trailing /
Expand All @@ -36,7 +36,7 @@ def pytest_generate_tests(metafunc):
# u'http://xn--q-bga.XBLA\u306eXbox.com'.encode('utf8'): 'http://q\xc3\xa9.xbla\xe3\x81\xaexbox.com'.decode('utf8'),
'http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3': 'http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3'.decode('utf8'),
'http://test.example/\xe3\x82\xad': 'http://test.example/\xe3\x82\xad',

# check that %23 (#) is not escaped where it shouldn't be
'http://test.example/?p=%23val#test-%23-val%25': 'http://test.example/?p=%23val#test-%23-val%25',
# check that %20 or %25 is not unescaped to ' ' or %
Expand All @@ -49,26 +49,29 @@ def pytest_generate_tests(metafunc):
"http://[::ffff:192.168.1.1]:80/test" : "http://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation
"htTps://[::fFff:192.168.1.1]:443/test" : "https://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation

# python 2.5 urlparse doesn't handle unknown protocols, so skipping this for now
#"itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw" : "itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw", #can handle itms://
'http://localhost/': 'http://localhost/',
'http://localhost:8080/': 'http://localhost:8080/',
'homefeedapps://pinterest/': 'homefeedapps://pinterest/', # can handle Android deep link
'mailto:me@pinterest.com': 'mailto:me@pinterest.com', # can handle mailto:
"itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw" : "itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw", #can handle itms://

}
for bad, good in tests.items():
metafunc.addcall(funcargs=dict(bad=bad, good=good))

elif metafunc.function == test_unquote:
for bad, good, unsafe in (
('%20', ' ', ''),
('%3f', '%3F', '?'), # don't unquote it, but uppercase it
('%E3%82%AD', u'\u30ad', ''),
):
metafunc.addcall(funcargs=dict(bad=bad, good=good, unsafe=unsafe))

elif metafunc.function in [test_invalid_urls]:
for url in [
'http://http://www.exemple.com/', # invalid domain
'-',
'asdf',
'http://./',
'HTTP://4294967297/test', # one more than max ip > int
'http://[img]http://i790.photobucket.com/albums/yy185/zack-32009/jordan.jpg[/IMG]',
]:
Expand Down
20 changes: 10 additions & 10 deletions urlnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* unescaping any percent escape sequences (where possible)
* uppercase percent escapes (i.e. %3f => %3F)
* converts spaces to %20
* converts ip encoded as an integer to dotted quad notation
* converts ip encoded as an integer to dotted quad notation

Available functions:
norm - given a URL (string), returns a normalized URL
Expand All @@ -36,7 +36,7 @@
0.92 - unknown schemes now pass the port through silently
0.91 - general cleanup
- changed dictionaries to lists where appropriate
- more fine-grained authority parsing and normalisation
- more fine-grained authority parsing and normalisation
"""

__license__ = """
Expand Down Expand Up @@ -156,12 +156,12 @@ def norm_tuple(scheme, authority, path, parameters, query, fragment):
if not scheme:
raise InvalidUrl('missing URL scheme')
authority = norm_netloc(scheme, authority)
if not authority:
if not authority and scheme != 'mailto':
raise InvalidUrl('missing netloc')
path = norm_path(scheme, path)
# TODO: put query in sorted order; or at least group parameters together
# Note that some websites use positional parameters or the name part of a query so this would break the internet
# query = urlencode(parse_qs(query, keep_blank_values=1), doseq=1)
# query = urlencode(parse_qs(query, keep_blank_values=1), doseq=1)
parameters = unquote_params(parameters)
query = unquote_qs(query)
fragment = unquote_fragment(fragment)
Expand Down Expand Up @@ -190,14 +190,14 @@ def int2ip(ipnum):
ip3 = ipnum >> 8 & 0xFF
ip4 = ipnum & 0xFF
return "%d.%d.%d.%d" % (ip1, ip2, ip3, ip4)

def norm_netloc(scheme, netloc):
if not netloc:
return netloc
match = _server_authority.match(netloc)
if not match:
raise InvalidUrl('no host in netloc %r' % netloc)

userinfo, host, port = match.groups()
# catch a few common errors:
if host.isdigit():
Expand All @@ -207,16 +207,16 @@ def norm_netloc(scheme, netloc):
raise InvalidUrl('host %r does not escape to a valid ip' % host)
if host[-1] == '.':
host = host[:-1]

# bracket check is for ipv6 hosts
if '.' not in host and not (host[0] == '[' and host[-1] == ']'):
if not host or ('.' not in host and not (host[0] == '[' and host[-1] == ']')):
raise InvalidUrl('host %r is not valid' % host)

authority = lower(host)
if 'xn--' in authority:
subdomains = [_idn(subdomain) for subdomain in authority.split('.')]
authority = '.'.join(subdomains)

if userinfo:
authority = "%s@%s" % (userinfo, authority)
if port and port != _default_port.get(scheme, None):
Expand Down