Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ pip-log.txt
build
dist
MANIFEST

__pycache__/
*.pyc
.pytest_cache/
.tox/
__pycache__/
9 changes: 8 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
from distutils.core import setup

# also update in urlnorm.py
version = '1.1.2.pinterest3'
version = '1.1.2.pinterest4'

setup(name='urlnorm',
version=version,
long_description=open("./README.txt", "r").read(),
description="Normalize a URL to a standard unicode encoding",
py_modules=['urlnorm'],
license='MIT License',
install_requires=['six'],
author='Jehiah Czebotar',
author_email='jehiah@gmail.com',
url='http://github.com/jehiah/urlnorm',
download_url="http://github.com/downloads/jehiah/urlnorm/urlnorm-%s.tar.gz" % version,
classifiers=[
'Operating System :: OS Independent',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
],
)
15 changes: 9 additions & 6 deletions test_urlnorm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# -*- coding: utf8 -*-
"""
this is a py.test test file
"""
from __future__ import print_function
import urlnorm
from urlnorm import _unicode

Expand All @@ -20,7 +22,7 @@ def pytest_generate_tests(metafunc):
'http://USER:pass@www.Example.COM/foo/bar': 'http://USER:pass@www.example.com/foo/bar',
'http://www.example.com./': 'http://www.example.com/',
'http://test.example/?a=%26&b=1': 'http://test.example/?a=%26&b=1', # should not un-encode the & that is part of a parameter value
'http://test.example/?a=%e3%82%82%26': 'http://test.example/?a=\xe3\x82\x82%26'.decode('utf8'), # should return a unicode character
'http://test.example/?a=%e3%82%82%26': u'http://test.example/?a=\u3082%26', # should return a unicode character
# note: this breaks the internet for parameters that are positional (stupid nextel) and/or don't have an = sign
# 'http://test.example/?a=1&b=2&a=3': 'http://test.example/?a=1&a=3&b=2', # should be in sorted/grouped order

Expand All @@ -29,19 +31,20 @@ def pytest_generate_tests(metafunc):
'http://test.example?': 'http://test.example/', # with trailing /
'http://a.COM/path/?b&a' : 'http://a.com/path/?b&a',
# test utf8 and unicode
u'http://XBLA\u306eXbox.com': 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
u'http://XBLA\u306eXbox.com'.encode('utf8'): 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
u'http://XBLA\u306eXbox.com': 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
u'http://XBLA\u306eXbox.com': u'http://xbla\u306exbox.com/',
u'http://XBLA\u306eXbox.com'.encode('utf8'): u'http://xbla\u306exbox.com/',
u'http://XBLA\u306eXbox.com': u'http://xbla\u306exbox.com/',
# test idna + utf8 domain
# u'http://xn--q-bga.XBLA\u306eXbox.com'.encode('utf8'): 'http://q\xc3\xa9.xbla\xe3\x81\xaexbox.com'.decode('utf8'),
'http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3': 'http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3'.decode('utf8'),
'http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3': u'http://ja.wikipedia.org/wiki/\u30ad\u30e3\u30bf\u30d4\u30e9\u30fc\u30b8\u30e3\u30d1\u30f3',
'http://test.example/\xe3\x82\xad': 'http://test.example/\xe3\x82\xad',

# check that %23 (#) is not escaped where it shouldn't be
'http://test.example/?p=%23val#test-%23-val%25': 'http://test.example/?p=%23val#test-%23-val%25',
# check that %20 or %25 is not unescaped to ' ' or %
'http://test.example/%25/?p=%20val%20%25' : 'http://test.example/%25/?p=%20val%20%25',
"http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n" : "http://test.domain/I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xef\xbf\xbdliz\xc3\xa6ti\xc3\xb8n",
"http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n" : u"http://test.domain/Iñtërnâtiôn�lizætiøn",
# check that spaces are collated to '+'
"http://test.example/path/with a%20space+/" : "http://test.example/path/with%20a%20space+/",
"http://[2001:db8:1f70::999:de8:7648:6e8]/test" : "http://[2001:db8:1f70::999:de8:7648:6e8]/test", #ipv6 address
Expand Down Expand Up @@ -107,7 +110,7 @@ def pytest_generate_tests(metafunc):
def test_invalid_urls(url):
try:
output = urlnorm.norm(url)
print '%r' % output
print('%r' % output)
except urlnorm.InvalidUrl:
return
assert 1 == 0, "this should have raised an InvalidUrl exception"
Expand Down
6 changes: 6 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[tox]
envlist=py27,py3

[testenv]
deps = pytest
commands = pytest test_urlnorm.py
36 changes: 20 additions & 16 deletions urlnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@
- more fine-grained authority parsing and normalisation
"""

from __future__ import absolute_import
from six import unichr
import six
from six.moves import range
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you consolidate these six imports?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure why it's done this way, but this is what python-modernize automatically did

__license__ = """
Copyright (c) 1999-2002 Mark Nottingham <mnot@pobox.com>
Copyright (c) 2010 Jehiah Czebotar <jehiah@gmail.com>
Expand All @@ -63,10 +67,9 @@
"""

# also update in setup.py
__version__ = "1.1.2.pinterest2"
__version__ = "1.1.2.pinterest4"

from urlparse import urlparse, urlunparse
from string import lower
from six.moves.urllib.parse import urlparse, urlunparse, unquote
import re

class InvalidUrl(Exception):
Expand Down Expand Up @@ -105,8 +108,8 @@ class InvalidUrl(Exception):
qs_unsafe_list = ' ?&=+%#'
fragment_unsafe_list = ' +%#'
path_unsafe_list = ' /?;%+#'
_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
_hextochr = dict((b'%02x' % i, six.int2byte(i)) for i in range(256))
_hextochr.update((b'%02X' % i, six.int2byte(i)) for i in range(256))

def unquote_path(s):
return unquote_safe(s, path_unsafe_list)
Expand All @@ -124,22 +127,23 @@ def unquote_safe(s, unsafe_list):
"""unquote percent escaped string except for percent escape sequences that are in unsafe_list"""
# note: this builds utf8 raw strings, then does a .decode('utf8') at the end.
# as a result it's doing .encode('utf8') on each block of the string as it's processed.
res = _utf8(s).split('%')
for i in xrange(1, len(res)):
unsafe_list = [_utf8(i) for i in unsafe_list]
res = _utf8(s).split(b'%')
for i in range(1, len(res)):
item = res[i]
try:
raw_chr = _hextochr[item[:2]]
if raw_chr in unsafe_list or ord(raw_chr) < 20:
# leave it unescaped (but uppercase the percent escape)
res[i] = '%' + item[:2].upper() + item[2:]
res[i] = b'%' + item[:2].upper() + item[2:]
else:
res[i] = raw_chr + item[2:]
except KeyError:
res[i] = '%' + item
res[i] = b'%' + item
except UnicodeDecodeError:
# note: i'm not sure what this does
res[i] = unichr(int(item[:2], 16)) + item[2:]
o = "".join(res)
o = b"".join(res)
return _unicode(o)

def norm(url):
Expand All @@ -151,7 +155,7 @@ def norm(url):

def norm_tuple(scheme, authority, path, parameters, query, fragment):
"""given individual url components, return its normalized form"""
scheme = lower(scheme)
scheme = scheme.lower()
if not scheme:
raise InvalidUrl('missing URL scheme')
authority = norm_netloc(scheme, authority)
Expand Down Expand Up @@ -193,7 +197,7 @@ def norm_path(scheme, path):
return '/'
return path

MAX_IP=0xffffffffL
MAX_IP=0xffffffff
def int2ip(ipnum):
assert isinstance(ipnum, int)
if MAX_IP < ipnum or ipnum < 0:
Expand Down Expand Up @@ -221,7 +225,7 @@ def norm_netloc(scheme, netloc):
if host[-1] == '.':
host = host[:-1]

authority = lower(host)
authority = host.lower()
if 'xn--' in authority:
subdomains = [_idn(subdomain) for subdomain in authority.split('.')]
authority = '.'.join(subdomains)
Expand All @@ -243,14 +247,14 @@ def _idn(subdomain):


def _utf8(value):
if isinstance(value, unicode):
if isinstance(value, six.text_type):
return value.encode("utf-8")
assert isinstance(value, str)
return value


def _unicode(value):
if isinstance(value, str):
if isinstance(value, six.binary_type):
return value.decode("utf-8")
assert isinstance(value, unicode)
assert isinstance(value, six.text_type)
return value