diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..84d28ed --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.tox \ No newline at end of file diff --git a/README.rst b/README.rst index db68eb0..447de02 100644 --- a/README.rst +++ b/README.rst @@ -20,7 +20,7 @@ In order to use this tool to render wikitext into HTML in a Python program, you source += line wiki_content = wiki2html(source, True) - print wiki_content + print(wiki_content) Doc about Syntax diff --git a/mediawiki/__init__.py b/mediawiki/__init__.py index 9f54a16..62961ad 100644 --- a/mediawiki/__init__.py +++ b/mediawiki/__init__.py @@ -18,7 +18,9 @@ """ -from wiki import * +from __future__ import unicode_literals +from __future__ import absolute_import +from .wiki import * __author__ = "Raimon Esteve

Table of Contents

-

Basic Wiki Editing

+

Basic Wiki Ëditing

You can italicize text by putting 2 apostrophes on each side. 3 apostrophes will embolden the text. @@ -38,6 +38,7 @@

5 apostrophes will embolden and italicize the text.

(4 apostrophes don't do anything special -- there's just 'one left over'.) +

unicodË

You can give link to the other Web page over the Internet easily Visit Google @@ -102,7 +103,7 @@

Table

|}

- +
element? indent_level = 0 # indent level of the table - for k, x in zip(range(len(t)), t): + for k, x in zip(list(range(len(t))), t): x = x.strip() fc = x[0:1] matches = _zomgPat.match(x) @@ -1663,96 +1668,96 @@ def doTableStuff(self, text): attributes = self.unstripForHTML(matches.group(2)) - t[k] = u'
'*indent_level + u'' + t[k] = '
'*indent_level + '' td.append(False) - ltd.append(u'') + ltd.append('') tr.append(False) - ltr.append(u'') + ltr.append('') has_opened_tr.append(False) elif len(td) == 0: pass - elif u'|}' == x[0:2]: - z = u"
Web site Link diff --git a/mediawiki/wiki.py b/mediawiki/wiki.py index f1db94f..d49d61c 100644 --- a/mediawiki/wiki.py +++ b/mediawiki/wiki.py @@ -18,15 +18,11 @@ """ +from __future__ import unicode_literals +from __future__ import absolute_import import re -import random -import locale -from base64 import b64encode -from base64 import b64decode -from StringIO import StringIO - -import wikimarkup +from . import wikimarkup _image = re.compile(r'img:(.*)\.(.*)', re.UNICODE) _attach = re.compile(r'attach:(.*)\.(.*)', re.UNICODE) diff --git a/mediawiki/wikimarkup/__init__.py b/mediawiki/wikimarkup/__init__.py index b7d86b6..8676635 100644 --- a/mediawiki/wikimarkup/__init__.py +++ b/mediawiki/wikimarkup/__init__.py @@ -17,9 +17,14 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . """ - +from __future__ import unicode_literals +from __future__ import absolute_import +from collections import OrderedDict import re, random, locale from base64 import b64encode, b64decode +import six +from six.moves import range +from six.moves import zip # a few patterns we use later @@ -32,394 +37,394 @@ MW_COLON_STATE_COMMENTDASH = 6 MW_COLON_STATE_COMMENTDASHDASH = 7 -_attributePat = re.compile(ur'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:"([^<"]*)"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@[\]^_`{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE) -_space = re.compile(ur'\s+', re.UNICODE) -_closePrePat = re.compile(u"]*?)(/?>)([^<]*)$', re.UNICODE) +_attributePat = re.compile(r'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:"([^<"]*)"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@[\]^_`{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE) +_space = re.compile(r'\s+', re.UNICODE) +_closePrePat = re.compile("]*?)(/?>)([^<]*)$', re.UNICODE) _htmlpairs = ( # Tags that must be closed - u'b', u'del', u'i', u'ins', u'u', u'font', u'big', u'small', u'sub', u'sup', u'h1', - u'h2', u'h3', u'h4', u'h5', u'h6', u'cite', u'code', u'em', u's', - u'strike', u'strong', u'tt', u'var', u'div', u'center', - u'blockquote', u'ol', u'ul', u'dl', u'table', u'caption', u'pre', - u'ruby', u'rt' , u'rb' , u'rp', u'p', u'span', u'u', + 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', + 'strike', 'strong', 'tt', 'var', 'div', 'center', + 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', ) _htmlsingle = ( - u'br', u'hr', u'li', u'dt', u'dd', u'img', + 'br', 'hr', 'li', 'dt', 'dd', 'img', ) _htmlsingleonly = ( # Elements that cannot have close tags - u'br', u'hr', u'img', + 'br', 'hr', 'img', ) _htmlnest = ( # Tags that can be nested--?? - u'table', u'tr', u'td', u'th', u'div', u'blockquote', u'ol', u'ul', - u'dl', u'font', u'big', u'small', u'sub', u'sup', u'span', u'img', + 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', + 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span', 'img', ) _tabletags = ( # Can only appear inside table - u'td', u'th', u'tr', + 'td', 'th', 'tr', ) _htmllist = ( # Tags used by list - u'ul', u'ol', + 'ul', 'ol', ) _listtags = ( # Tags that can appear in a list - u'li', + 'li', ) _htmlsingleallowed = _htmlsingle + _tabletags _htmlelements = _htmlsingle + _htmlpairs + _htmlnest _htmlEntities = { - u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180, - u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501, - u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197, - u'aring': 229, - u'asymp': 8776, - u'Atilde': 195, - u'atilde': 227, - u'Auml': 196, - u'auml': 228, - u'bdquo': 8222, - u'Beta': 914, - u'beta': 946, - u'brvbar': 166, - u'bull': 8226, - u'cap': 8745, - u'Ccedil': 199, - u'ccedil': 231, - u'cedil': 184, - u'cent': 162, - u'Chi': 935, - u'chi': 967, - u'circ': 710, - u'clubs': 9827, - u'cong': 8773, - u'copy': 169, - u'crarr': 8629, - u'cup': 8746, - u'curren': 164, - u'dagger': 8224, - u'Dagger': 8225, - u'darr': 8595, - u'dArr': 8659, - u'deg': 176, - u'Delta': 916, - u'delta': 948, - u'diams': 9830, - u'divide': 247, - u'Eacute': 201, - u'eacute': 233, - u'Ecirc': 202, - u'ecirc': 234, - u'Egrave': 200, - u'egrave': 232, - u'empty': 8709, - u'emsp': 8195, - u'ensp': 8194, - u'Epsilon': 917, - u'epsilon': 949, - u'equiv': 8801, - u'Eta': 919, - u'eta': 951, - u'ETH': 208, - u'eth': 240, - u'Euml': 203, - u'euml': 235, - u'euro': 8364, - u'exist': 8707, - u'fnof': 402, - u'forall': 8704, - u'frac12': 189, - u'frac14': 188, - u'frac34': 190, - u'frasl': 8260, - u'Gamma': 915, - u'gamma': 947, - u'ge': 8805, - u'gt': 62, - u'harr': 8596, - u'hArr': 8660, - u'hearts': 9829, - u'hellip': 8230, - u'Iacute': 205, - u'iacute': 237, - u'Icirc': 206, - u'icirc': 238, - u'iexcl': 161, - u'Igrave': 204, - u'igrave': 236, - u'image': 8465, - u'infin': 8734, - u'int': 8747, - u'Iota': 921, - u'iota': 953, - u'iquest': 191, - u'isin': 8712, - u'Iuml': 207, - u'iuml': 239, - u'Kappa': 922, - u'kappa': 954, - u'Lambda': 923, - u'lambda': 955, - u'lang': 9001, - u'laquo': 171, - u'larr': 8592, - u'lArr': 8656, - u'lceil': 8968, - u'ldquo': 8220, - u'le': 8804, - u'lfloor': 8970, - u'lowast': 8727, - u'loz': 9674, - u'lrm': 8206, - u'lsaquo': 8249, - u'lsquo': 8216, - u'lt': 60, - u'macr': 175, - u'mdash': 8212, - u'micro': 181, - u'middot': 183, - u'minus': 8722, - u'Mu': 924, - u'mu': 956, - u'nabla': 8711, - u'nbsp': 160, - u'ndash': 8211, - u'ne': 8800, - u'ni': 8715, - u'not': 172, - u'notin': 8713, - u'nsub': 8836, - u'Ntilde': 209, - u'ntilde': 241, - u'Nu': 925, - u'nu': 957, - u'Oacute': 211, - u'oacute': 243, - u'Ocirc': 212, - u'ocirc': 244, - u'OElig': 338, - u'oelig': 339, - u'Ograve': 210, - u'ograve': 242, - u'oline': 8254, - u'Omega': 937, - u'omega': 969, - u'Omicron': 927, - u'omicron': 959, - u'oplus': 8853, - u'or': 8744, - u'ordf': 170, - u'ordm': 186, - u'Oslash': 216, - u'oslash': 248, - u'Otilde': 213, - u'otilde': 245, - u'otimes': 8855, - u'Ouml': 214, - u'ouml': 246, - u'para': 182, - u'part': 8706, - u'permil': 8240, - u'perp': 8869, - u'Phi': 934, - u'phi': 966, - u'Pi': 928, - u'pi': 960, - u'piv': 982, - u'plusmn': 177, - u'pound': 163, - u'prime': 8242, - u'Prime': 8243, - u'prod': 8719, - u'prop': 8733, - u'Psi': 936, - u'psi': 968, - u'quot': 34, - u'radic': 8730, - u'rang': 9002, - u'raquo': 187, - u'rarr': 8594, - u'rArr': 8658, - u'rceil': 8969, - u'rdquo': 8221, - u'real': 8476, - u'reg': 174, - u'rfloor': 8971, - u'Rho': 929, - u'rho': 961, - u'rlm': 8207, - u'rsaquo': 8250, - u'rsquo': 8217, - u'sbquo': 8218, - u'Scaron': 352, - u'scaron': 353, - u'sdot': 8901, - u'sect': 167, - u'shy': 173, - u'Sigma': 931, - u'sigma': 963, - u'sigmaf': 962, - u'sim': 8764, - u'spades': 9824, - u'sub': 8834, - u'sube': 8838, - u'sum': 8721, - u'sup': 8835, - u'sup1': 185, - u'sup2': 178, - u'sup3': 179, - u'supe': 8839, - u'szlig': 223, - u'Tau': 932, - u'tau': 964, - u'there4': 8756, - u'Theta': 920, - u'theta': 952, - u'thetasym': 977, - u'thinsp': 8201, - u'THORN': 222, - u'thorn': 254, - u'tilde': 732, - u'times': 215, - u'trade': 8482, - u'Uacute': 218, - u'uacute': 250, - u'uarr': 8593, - u'uArr': 8657, - u'Ucirc': 219, - u'ucirc': 251, - u'Ugrave': 217, - u'ugrave': 249, - u'uml': 168, - u'upsih': 978, - u'Upsilon': 933, - u'upsilon': 965, - u'Uuml': 220, - u'uuml': 252, - u'weierp': 8472, - u'Xi': 926, - u'xi': 958, - u'Yacute': 221, - u'yacute': 253, - u'yen': 165, - u'Yuml': 376, - u'yuml': 255, - u'Zeta': 918, - u'zeta': 950, - u'zwj': 8205, - u'zwnj': 8204 + 'Aacute': 193, 'aacute': 225, 'Acirc': 194, 'acirc': 226, 'acute': 180, + 'AElig': 198, 'aelig': 230, 'Agrave': 192, 'agrave': 224, 'alefsym': 8501, + 'Alpha': 913, 'alpha': 945, 'amp': 38, 'and': 8743, 'ang': 8736, 'Aring': 197, + 'aring': 229, + 'asymp': 8776, + 'Atilde': 195, + 'atilde': 227, + 'Auml': 196, + 'auml': 228, + 'bdquo': 8222, + 'Beta': 914, + 'beta': 946, + 'brvbar': 166, + 'bull': 8226, + 'cap': 8745, + 'Ccedil': 199, + 'ccedil': 231, + 'cedil': 184, + 'cent': 162, + 'Chi': 935, + 'chi': 967, + 'circ': 710, + 'clubs': 9827, + 'cong': 8773, + 'copy': 169, + 'crarr': 8629, + 'cup': 8746, + 'curren': 164, + 'dagger': 8224, + 'Dagger': 8225, + 'darr': 8595, + 'dArr': 8659, + 'deg': 176, + 'Delta': 916, + 'delta': 948, + 'diams': 9830, + 'divide': 247, + 'Eacute': 201, + 'eacute': 233, + 'Ecirc': 202, + 'ecirc': 234, + 'Egrave': 200, + 'egrave': 232, + 'empty': 8709, + 'emsp': 8195, + 'ensp': 8194, + 'Epsilon': 917, + 'epsilon': 949, + 'equiv': 8801, + 'Eta': 919, + 'eta': 951, + 'ETH': 208, + 'eth': 240, + 'Euml': 203, + 'euml': 235, + 'euro': 8364, + 'exist': 8707, + 'fnof': 402, + 'forall': 8704, + 'frac12': 189, + 'frac14': 188, + 'frac34': 190, + 'frasl': 8260, + 'Gamma': 915, + 'gamma': 947, + 'ge': 8805, + 'gt': 62, + 'harr': 8596, + 'hArr': 8660, + 'hearts': 9829, + 'hellip': 8230, + 'Iacute': 205, + 'iacute': 237, + 'Icirc': 206, + 'icirc': 238, + 'iexcl': 161, + 'Igrave': 204, + 'igrave': 236, + 'image': 8465, + 'infin': 8734, + 'int': 8747, + 'Iota': 921, + 'iota': 953, + 'iquest': 191, + 'isin': 8712, + 'Iuml': 207, + 'iuml': 239, + 'Kappa': 922, + 'kappa': 954, + 'Lambda': 923, + 'lambda': 955, + 'lang': 9001, + 'laquo': 171, + 'larr': 8592, + 'lArr': 8656, + 'lceil': 8968, + 'ldquo': 8220, + 'le': 8804, + 'lfloor': 8970, + 'lowast': 8727, + 'loz': 9674, + 'lrm': 8206, + 'lsaquo': 8249, + 'lsquo': 8216, + 'lt': 60, + 'macr': 175, + 'mdash': 8212, + 'micro': 181, + 'middot': 183, + 'minus': 8722, + 'Mu': 924, + 'mu': 956, + 'nabla': 8711, + 'nbsp': 160, + 'ndash': 8211, + 'ne': 8800, + 'ni': 8715, + 'not': 172, + 'notin': 8713, + 'nsub': 8836, + 'Ntilde': 209, + 'ntilde': 241, + 'Nu': 925, + 'nu': 957, + 'Oacute': 211, + 'oacute': 243, + 'Ocirc': 212, + 'ocirc': 244, + 'OElig': 338, + 'oelig': 339, + 'Ograve': 210, + 'ograve': 242, + 'oline': 8254, + 'Omega': 937, + 'omega': 969, + 'Omicron': 927, + 'omicron': 959, + 'oplus': 8853, + 'or': 8744, + 'ordf': 170, + 'ordm': 186, + 'Oslash': 216, + 'oslash': 248, + 'Otilde': 213, + 'otilde': 245, + 'otimes': 8855, + 'Ouml': 214, + 'ouml': 246, + 'para': 182, + 'part': 8706, + 'permil': 8240, + 'perp': 8869, + 'Phi': 934, + 'phi': 966, + 'Pi': 928, + 'pi': 960, + 'piv': 982, + 'plusmn': 177, + 'pound': 163, + 'prime': 8242, + 'Prime': 8243, + 'prod': 8719, + 'prop': 8733, + 'Psi': 936, + 'psi': 968, + 'quot': 34, + 'radic': 8730, + 'rang': 9002, + 'raquo': 187, + 'rarr': 8594, + 'rArr': 8658, + 'rceil': 8969, + 'rdquo': 8221, + 'real': 8476, + 'reg': 174, + 'rfloor': 8971, + 'Rho': 929, + 'rho': 961, + 'rlm': 8207, + 'rsaquo': 8250, + 'rsquo': 8217, + 'sbquo': 8218, + 'Scaron': 352, + 'scaron': 353, + 'sdot': 8901, + 'sect': 167, + 'shy': 173, + 'Sigma': 931, + 'sigma': 963, + 'sigmaf': 962, + 'sim': 8764, + 'spades': 9824, + 'sub': 8834, + 'sube': 8838, + 'sum': 8721, + 'sup': 8835, + 'sup1': 185, + 'sup2': 178, + 'sup3': 179, + 'supe': 8839, + 'szlig': 223, + 'Tau': 932, + 'tau': 964, + 'there4': 8756, + 'Theta': 920, + 'theta': 952, + 'thetasym': 977, + 'thinsp': 8201, + 'THORN': 222, + 'thorn': 254, + 'tilde': 732, + 'times': 215, + 'trade': 8482, + 'Uacute': 218, + 'uacute': 250, + 'uarr': 8593, + 'uArr': 8657, + 'Ucirc': 219, + 'ucirc': 251, + 'Ugrave': 217, + 'ugrave': 249, + 'uml': 168, + 'upsih': 978, + 'Upsilon': 933, + 'upsilon': 965, + 'Uuml': 220, + 'uuml': 252, + 'weierp': 8472, + 'Xi': 926, + 'xi': 958, + 'Yacute': 221, + 'yacute': 253, + 'yen': 165, + 'Yuml': 376, + 'yuml': 255, + 'Zeta': 918, + 'zeta': 950, + 'zwj': 8205, + 'zwnj': 8204 } -_charRefsPat = re.compile(ur'''(&([A-Za-z0-9]+);|&#([0-9]+);|&#[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE) -_cssCommentPat = re.compile(ur'''\*.*?\*''', re.UNICODE) -_toUTFPat = re.compile(ur'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE) -_hackPat = re.compile(ur'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE) -_hrPat = re.compile(u'''^-----*''', re.UNICODE | re.MULTILINE) -_h1Pat = re.compile(u'^=(.+)=\s*$', re.UNICODE | re.MULTILINE) -_h2Pat = re.compile(u'^==(.+)==\s*$', re.UNICODE | re.MULTILINE) -_h3Pat = re.compile(u'^===(.+)===\s*$', re.UNICODE | re.MULTILINE) -_h4Pat = re.compile(u'^====(.+)====\s*$', re.UNICODE | re.MULTILINE) -_h5Pat = re.compile(u'^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE) -_h6Pat = re.compile(u'^======(.+)======\s*$', re.UNICODE | re.MULTILINE) -_quotePat = re.compile(u"""(''+)""", re.UNICODE) -_removePat = re.compile(ur'\b(' + ur'|'.join((u"a", u"an", u"as", u"at", u"before", u"but", u"by", u"for", u"from", - u"is", u"in", u"into", u"like", u"of", u"off", u"on", u"onto", u"per", - u"since", u"than", u"the", u"this", u"that", u"to", u"up", u"via", - u"with")) + ur')\b', re.UNICODE | re.IGNORECASE) -_nonWordSpaceDashPat = re.compile(ur'[^\w\s\-\./]', re.UNICODE) -_multiSpacePat = re.compile(ur'[\s\-_\./]+', re.UNICODE) -_spacePat = re.compile(ur' ', re.UNICODE) -_linkPat = re.compile(ur'^(?:([A-Za-z0-9]+):)?([^\|]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL) -_bracketedLinkPat = re.compile(ur'(?:\[((?:mailto:|irc://|https?://|ftp://|/)[^<>\]\[' + u"\x00-\x20\x7f" + ur']*)\s*(.*?)\])', re.UNICODE) -_protocolPat = re.compile(ur'(\b(?:mailto:|irc://|https?://|ftp://))', re.UNICODE) -_specialUrlPat = re.compile(ur'^([^<>\]\[' + u"\x00-\x20\x7f" + ur']+)(.*)$', re.UNICODE) -_protocolsPat = re.compile(ur'^(mailto:|irc://|https?://|ftp://)$', re.UNICODE) -_controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']]', re.UNICODE) -_hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE) -_stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE) -_zomgPat = re.compile(ur'^(:*)\{\|(.*)$', re.UNICODE) -_headerPat = re.compile(ur"<[Hh]([1-6])(.*?)>(.*?)", re.UNICODE) -_templateSectionPat = re.compile(ur"", re.UNICODE) -_tagPat = re.compile(ur"<.*?>", re.UNICODE) +_charRefsPat = re.compile(r'''(&([A-Za-z0-9]+);|&#([0-9]+);|&#[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE) +_cssCommentPat = re.compile(r'''\*.*?\*''', re.UNICODE) +_toUTFPat = re.compile(r'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE) +_hackPat = re.compile(r'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE) +_hrPat = re.compile('''^-----*''', re.UNICODE | re.MULTILINE) +_h1Pat = re.compile('^=(.+)=\s*$', re.UNICODE | re.MULTILINE) +_h2Pat = re.compile('^==(.+)==\s*$', re.UNICODE | re.MULTILINE) +_h3Pat = re.compile('^===(.+)===\s*$', re.UNICODE | re.MULTILINE) +_h4Pat = re.compile('^====(.+)====\s*$', re.UNICODE | re.MULTILINE) +_h5Pat = re.compile('^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE) +_h6Pat = re.compile('^======(.+)======\s*$', re.UNICODE | re.MULTILINE) +_quotePat = re.compile("""(''+)""", re.UNICODE) +_removePat = re.compile(r'\b(' + r'|'.join(("a", "an", "as", "at", "before", "but", "by", "for", "from", + "is", "in", "into", "like", "of", "off", "on", "onto", "per", + "since", "than", "the", "this", "that", "to", "up", "via", + "with")) + r')\b', re.UNICODE | re.IGNORECASE) +_nonWordSpaceDashPat = re.compile(r'[^\w\s\-\./]', re.UNICODE) +_multiSpacePat = re.compile(r'[\s\-_\./]+', re.UNICODE) +_spacePat = re.compile(r' ', re.UNICODE) +_linkPat = re.compile(r'^(?:([A-Za-z0-9]+):)?([^\|]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL) +_bracketedLinkPat = re.compile(r'(?:\[((?:mailto:|irc://|https?://|ftp://|/)[^<>\]\[' + "\x00-\x20\x7f" + r']*)\s*(.*?)\])', re.UNICODE) +_protocolPat = re.compile(r'(\b(?:mailto:|irc://|https?://|ftp://))', re.UNICODE) +_specialUrlPat = re.compile(r'^([^<>\]\[' + "\x00-\x20\x7f" + r']+)(.*)$', re.UNICODE) +_protocolsPat = re.compile(r'^(mailto:|irc://|https?://|ftp://)$', re.UNICODE) +_controlCharsPat = re.compile(r'[\]\[<>"' + "\\x00-\\x20\\x7F" + r']]', re.UNICODE) +_hostnamePat = re.compile(r'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE) +_stripPat = re.compile('\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE) +_zomgPat = re.compile(r'^(:*)\{\|(.*)$', re.UNICODE) +_headerPat = re.compile(r"<[Hh]([1-6])(.*?)>(.*?)", re.UNICODE) +_templateSectionPat = re.compile(r"", re.UNICODE) +_tagPat = re.compile(r"<.*?>", re.UNICODE) _startRegexHash = {} _endRegexHash = {} -_endCommentPat = re.compile(ur'(-->)', re.UNICODE) +_endCommentPat = re.compile(r'(-->)', re.UNICODE) _extractTagsAndParams_n = 1 -_guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\302\273)', re.UNICODE) -_guillemetRightPat = re.compile(ur'(\302\253) ', re.UNICODE) +_guillemetLeftPat = re.compile(r'(.) (\?|:|;|!|\302\273)', re.UNICODE) +_guillemetRightPat = re.compile(r'(\302\253) ', re.UNICODE) def setupAttributeWhitelist(): - common = ( u'id', u'class', u'lang', u'dir', u'title', u'style' ) - block = common + (u'align',) - tablealign = ( u'align', u'char', u'charoff', u'valign' ) - tablecell = ( u'abbr', - u'axis', - u'headers', - u'scope', - u'rowspan', - u'colspan', - u'nowrap', # deprecated - u'width', # deprecated - u'height', # deprecated - u'bgcolor' # deprecated + common = ( 'id', 'class', 'lang', 'dir', 'title', 'style' ) + block = common + ('align',) + tablealign = ( 'align', 'char', 'charoff', 'valign' ) + tablecell = ( 'abbr', + 'axis', + 'headers', + 'scope', + 'rowspan', + 'colspan', + 'nowrap', # deprecated + 'width', # deprecated + 'height', # deprecated + 'bgcolor' # deprecated ) return { - u'div': block, - u'center': common, # deprecated - u'span': block, # ?? - u'h1': block, - u'h2': block, - u'h3': block, - u'h4': block, - u'h5': block, - u'h6': block, - u'em': common, - u'strong': common, - u'cite': common, - u'code': common, - u'var': common, - u'img': common + (u'src', u'alt', u'width', u'height',), - u'blockquote': common + (u'cite',), - u'sub': common, - u'sup': common, - u'p': block, - u'br': (u'id', u'class', u'title', u'style', u'clear',), - u'pre': common + (u'width',), - u'ins': common + (u'cite', u'datetime'), - u'del': common + (u'cite', u'datetime'), - u'ul': common + (u'type',), - u'ol': common + (u'type', u'start'), - u'li': common + (u'type', u'value'), - u'dl': common, - u'dd': common, - u'dt': common, - u'table': common + ( u'summary', u'width', u'border', u'frame', - u'rules', u'cellspacing', u'cellpadding', - u'align', u'bgcolor', + 'div': block, + 'center': common, # deprecated + 'span': block, # ?? + 'h1': block, + 'h2': block, + 'h3': block, + 'h4': block, + 'h5': block, + 'h6': block, + 'em': common, + 'strong': common, + 'cite': common, + 'code': common, + 'var': common, + 'img': common + ('src', 'alt', 'width', 'height',), + 'blockquote': common + ('cite',), + 'sub': common, + 'sup': common, + 'p': block, + 'br': ('id', 'class', 'title', 'style', 'clear',), + 'pre': common + ('width',), + 'ins': common + ('cite', 'datetime'), + 'del': common + ('cite', 'datetime'), + 'ul': common + ('type',), + 'ol': common + ('type', 'start'), + 'li': common + ('type', 'value'), + 'dl': common, + 'dd': common, + 'dt': common, + 'table': common + ( 'summary', 'width', 'border', 'frame', + 'rules', 'cellspacing', 'cellpadding', + 'align', 'bgcolor', ), - u'caption': common + (u'align',), - u'thead': common + tablealign, - u'tfoot': common + tablealign, - u'tbody': common + tablealign, - u'colgroup': common + ( u'span', u'width' ) + tablealign, - u'col': common + ( u'span', u'width' ) + tablealign, - u'tr': common + ( u'bgcolor', ) + tablealign, - u'td': common + tablecell + tablealign, - u'th': common + tablecell + tablealign, - u'tt': common, - u'b': common, - u'i': common, - u'big': common, - u'small': common, - u'strike': common, - u's': common, - u'u': common, - u'font': common + ( u'size', u'color', u'face' ), - u'hr': common + ( u'noshade', u'size', u'width' ), - u'ruby': common, - u'rb': common, - u'rt': common, #array_merge( $common, array( 'rbspan' ) ), - u'rp': common, + 'caption': common + ('align',), + 'thead': common + tablealign, + 'tfoot': common + tablealign, + 'tbody': common + tablealign, + 'colgroup': common + ( 'span', 'width' ) + tablealign, + 'col': common + ( 'span', 'width' ) + tablealign, + 'tr': common + ( 'bgcolor', ) + tablealign, + 'td': common + tablecell + tablealign, + 'th': common + tablecell + tablealign, + 'tt': common, + 'b': common, + 'i': common, + 'big': common, + 'small': common, + 'strike': common, + 's': common, + 'u': common, + 'font': common + ( 'size', 'color', 'face' ), + 'hr': common + ( 'noshade', 'size', 'width' ), + 'ruby': common, + 'rb': common, + 'rt': common, #array_merge( $common, array( 'rbspan' ) ), + 'rp': common, } _whitelist = setupAttributeWhitelist() _page_cache = {} @@ -430,7 +435,7 @@ def registerTagHook(tag, function): class BaseParser(object): def __init__(self): - self.uniq_prefix = u"\x07UNIQ" + unicode(random.randint(1, 1000000000)) + self.uniq_prefix = "\x07UNIQ" + six.text_type(random.randint(1, 1000000000)) self.strip_state = {} self.arg_stack = [] self.env = env @@ -466,8 +471,8 @@ def retrieve_object(self, namespace, key, default=None): def parse(self, text): utf8 = isinstance(text, str) text = to_unicode(text) - if text[-1:] != u'\n': - text = text + u'\n' + if text[-1:] != '\n': + text = text + '\n' taggedNewline = True else: taggedNewline = False @@ -481,9 +486,9 @@ def parse(self, text): text = self.fixtags(text) text = self.doBlockLevels(text, True) text = self.unstripNoWiki(text) - text = text.split(u'\n') - text = u'\n'.join(text) - if taggedNewline and text[-1:] == u'\n': + text = text.split('\n') + text = '\n'.join(text) + if taggedNewline and text[-1:] == '\n': text = text[:-1] if utf8: return text.encode("utf-8") @@ -494,7 +499,7 @@ def strip(self, text, stripcomments=False, dontstrip=[]): commentState = {} - elements = ['nowiki',] + mTagHooks.keys() + elements = ['nowiki',] + list(mTagHooks.keys()) if True: #wgRawHtml elements.append('html') @@ -510,20 +515,20 @@ def strip(self, text, stripcomments=False, dontstrip=[]): element, content, params, tag = matches[marker] if render: tagName = element.lower() - if tagName == u'!--': + if tagName == '!--': # comment output = tag - if tag[-3:] != u'-->': + if tag[-3:] != '-->': output += "-->" - elif tagName == u'html': + elif tagName == 'html': output = content - elif tagName == u'nowiki': - output = content.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>') + elif tagName == 'nowiki': + output = content.replace('&', '&').replace('<', '<').replace('>', '>') else: if tagName in mTagHooks: output = mTagHooks[tagName](self, content, params) else: - output = content.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>') + output = content.replace('&', '&').replace('<', '<').replace('>', '>') else: # Just stripping tags; keep the source output = tag @@ -532,9 +537,9 @@ def strip(self, text, stripcomments=False, dontstrip=[]): # it won't do it itself output = self.unstrip(output) - if not stripcomments and element == u'!--': + if not stripcomments and element == '!--': commentState[marker] = output - elif element == u'html' or element == u'nowiki': + elif element == 'html' or element == 'nowiki': if 'nowiki' not in self.strip_state: self.strip_state['nowiki'] = {} self.strip_state['nowiki'][marker] = output @@ -559,7 +564,7 @@ def removeHtmlTags(self, text): """convert bad tags into HTML identities""" sb = [] text = self.removeHtmlComments(text) - bits = text.split(u'<') + bits = text.split('<') sb.append(bits.pop(0)) tagstack = [] tablestack = tagstack @@ -600,97 +605,97 @@ def removeHtmlTags(self, text): #
  • can be nested in
      or
        , skip those cases: if ot not in _htmllist and t in _listtags: badtag = True - elif t == u'table': + elif t == 'table': if len(tablestack) == 0: bagtag = True else: tagstack = tablestack.pop() - newparams = u'' + newparams = '' else: # Keep track for later - if t in _tabletags and u'table' not in tagstack: + if t in _tabletags and 'table' not in tagstack: badtag = True elif t in tagstack and t not in _htmlnest: badtag = True # Is it a self-closed htmlpair? (bug 5487) - elif brace == u'/>' and t in _htmlpairs: + elif brace == '/>' and t in _htmlpairs: badTag = True elif t in _htmlsingleonly: # Hack to force empty tag for uncloseable elements - brace = u'/>' + brace = '/>' elif t in _htmlsingle: # Hack to not close $htmlsingle tags brace = None else: - if t == u'table': + if t == 'table': tablestack.append(tagstack) tagstack = [] tagstack.append(t) newparams = self.fixTagAttributes(params, t) if not badtag: - rest = rest.replace(u'>', u'>') - if brace == u'/>': - close = u' /' + rest = rest.replace('>', '>') + if brace == '/>': + close = ' /' else: - close = u'' - sb.append(u'<') + close = '' + sb.append('<') sb.append(slash) sb.append(t) sb.append(newparams) sb.append(close) - sb.append(u'>') + sb.append('>') sb.append(rest) continue - sb.append(u'<') - sb.append(x.replace(u'>', u'>')) + sb.append('<') + sb.append(x.replace('>', '>')) # Close off any remaining tags while tagstack: t = tagstack.pop() - sb.append(u'\n') - if t == u'table': + sb.append('>\n') + if t == 'table': if not tablestack: break tagstack = tablestack.pop() - return u''.join(sb) + return ''.join(sb) def removeHtmlComments(self, text): """remove comments from given text""" sb = [] - start = text.find(u'', start) + end = text.find('-->', start) if end == -1: break end += 3 spaceStart = max(0, start-1) spaceEnd = end - while text[spaceStart] == u' ' and spaceStart > 0: + while text[spaceStart] == ' ' and spaceStart > 0: spaceStart -= 1 - while text[spaceEnd] == u' ': + while text[spaceEnd] == ' ': spaceEnd += 1 - if text[spaceStart] == u'\n' and text[spaceEnd] == u'\n': + if text[spaceStart] == '\n' and text[spaceEnd] == '\n': sb.append(text[last:spaceStart]) - sb.append(u'\n') + sb.append('\n') last = spaceEnd+1 else: sb.append(text[last:spaceStart+1]) last = spaceEnd - start = text.find(u'' + result += '' return result, mDTopen def nextItem(self, char, mDTopen): - if char == u'*' or char == '#': - return u'
      1. ', None - elif char == u':' or char == u';': - close = u'' + if char == '*' or char == '#': + return '
      2. ', None + elif char == ':' or char == ';': + close = '' if mDTopen: close = '' - if char == u';': - return close + u'
        ', True + if char == ';': + return close + '
        ', True else: - return close + u'
        ', False - return u'' + return close + '
        ', False + return '' def closeList(self, char, mDTopen): - if char == u'*': - return u'
    \n' - elif char == u'#': - return u'
  • \n' - elif char == u':': + if char == '*': + return '\n' + elif char == '#': + return '\n' + elif char == ':': if mDTopen: - return u'\n' + return '\n' else: - return u'\n' + return '\n' else: - return u'' + return '' def findColonNoLinks(self, text, before, after): try: @@ -1434,13 +1439,13 @@ def doBlockLevels(self, text, linestart): # Parsing through the text line by line. The main thing # happening here is handling of block-level elements p, pre, # and making lists from lines starting with * # : etc. - lastPrefix = u'' + lastPrefix = '' mDTopen = inBlockElem = False prefixLength = 0 paragraphStack = False - _closeMatchPat = re.compile(ur"(') + output.append(self.closeParagraph('') + '
    ')
     							mInPre = False
    -							mLastSection = u'pre'
    +							mLastSection = 'pre'
     						t = t[1:]
     					else:
     						# paragraph
    -						if t.strip() == u'':
    +						if t.strip() == '':
     							if paragraphStack:
    -								output.append(paragraphStack + u'
    ') + output.append(paragraphStack + '
    ') paragraphStack = False - mLastSection = u'p' + mLastSection = 'p' else: - if mLastSection != u'p': + if mLastSection != 'p': output.append(self.closeParagraph(mLastSection)) - mLastSection = u'' + mLastSection = '' mInPre = False - paragraphStack = u'

    ' + paragraphStack = '

    ' else: - paragraphStack = u'

    ' + paragraphStack = '

    ' else: if paragraphStack: output.append(paragraphStack) paragraphStack = False - mLastSection = u'p' - elif mLastSection != u'p': - output.append(self.closeParagraph(mLastSection) + u'

    ') - mLastSection = u'p' + mLastSection = 'p' + elif mLastSection != 'p': + output.append(self.closeParagraph(mLastSection) + '

    ') + mLastSection = 'p' mInPre = False # somewhere above we forget to get out of pre block (bug 785) @@ -1586,16 +1591,16 @@ def doBlockLevels(self, text, linestart): mInPre = False if paragraphStack == False: - output.append(t + u"\n") + output.append(t + "\n") while prefixLength: output.append(self.closeList(pref2[prefixLength-1], mDTopen)) mDTopen = False prefixLength -= 1 - if mLastSection != u'': - output.append(u'') - mLastSection = u'' + if mLastSection != '': + output.append('') + mLastSection = '' return ''.join(output) @@ -1605,10 +1610,10 @@ def __init__(self, show_toc=True): self.show_toc = show_toc def parse(self, text): - utf8 = isinstance(text, str) + utf8 = isinstance(text, six.binary_type) text = to_unicode(text) - if text[-1:] != u'\n': - text = text + u'\n' + if text[-1:] != '\n': + text = text + '\n' taggedNewline = True else: taggedNewline = False @@ -1621,32 +1626,32 @@ def parse(self, text): text = self.parseHeaders(text) text = self.parseAllQuotes(text) text = self.replaceExternalLinks(text) - if not self.show_toc and text.find(u"") == -1: + if not self.show_toc and text.find("") == -1: self.show_toc = False text = self.formatHeadings(text, True) text = self.unstrip(text) text = self.fixtags(text) text = self.doBlockLevels(text, True) text = self.unstripNoWiki(text) - text = text.split(u'\n') - text = u'\n'.join(text) - if taggedNewline and text[-1:] == u'\n': + text = text.split('\n') + text = '\n'.join(text) + if taggedNewline and text[-1:] == '\n': text = text[:-1] if utf8: return text.encode("utf-8") return text def checkTOC(self, text): - if text.find(u"__NOTOC__") != -1: - text = text.replace(u"__NOTOC__", u"") + if text.find("__NOTOC__") != -1: + text = text.replace("__NOTOC__", "") self.show_toc = False - if text.find(u"__TOC__") != -1: - text = text.replace(u"__TOC__", u"") + if text.find("__TOC__") != -1: + text = text.replace("__TOC__", "") self.show_toc = True return text def doTableStuff(self, text): - t = text.split(u"\n") + t = text.split("\n") td = [] # Is currently a td tag open? ltd = [] # Was it TD or TH? tr = [] # Is currently a tr tag open? @@ -1654,7 +1659,7 @@ def doTableStuff(self, text): has_opened_tr = [] # Did this table open a

    " + x[2:] + elif '|}' == x[0:2]: + z = "" + x[2:] l = ltd.pop() if not has_opened_tr.pop(): - z = u"" + z + z = "" + z if tr.pop(): - z = u"" + z + z = "" + z if td.pop(): - z = u'' + z + z = '' + z ltr.pop() - t[k] = z + u''*indent_level - elif u'|-' == x[0:2]: # Allows for |------------- + t[k] = z + ''*indent_level + elif '|-' == x[0:2]: # Allows for |------------- x = x[1:] - while x != u'' and x[0:1] == '-': + while x != '' and x[0:1] == '-': x = x[1:] z = '' l = ltd.pop() has_opened_tr.pop() has_opened_tr.append(True) if tr.pop(): - z = u'' + z + z = '' + z if td.pop(): - z = u'' + z + z = '' + z ltr.pop() t[k] = z tr.append(False) td.append(False) - ltd.append(u'') + ltd.append('') attributes = self.unstripForHTML(x) - ltr.append(self.fixTagAttributes(attributes, u'tr')) - elif u'|' == fc or u'!' == fc or u'|+' == x[0:2]: # Caption + ltr.append(self.fixTagAttributes(attributes, 'tr')) + elif '|' == fc or '!' == fc or '|+' == x[0:2]: # Caption # x is a table row - if u'|+' == x[0:2]: - fc = u'+' + if '|+' == x[0:2]: + fc = '+' x = x[1:] x = x[1:] - if fc == u'!': - x = x.replace(u'!!', u'||') + if fc == '!': + x = x.replace('!!', '||') # Split up multiple cells on the same line. # FIXME: This can result in improper nesting of tags processed # by earlier parser steps, but should avoid splitting up eg # attribute values containing literal "||". - x = x.split(u'||') + x = x.split('||') - t[k] = u'' + t[k] = '' # Loop through each table cell for theline in x: z = '' - if fc != u'+': + if fc != '+': tra = ltr.pop() if not tr.pop(): - z = u'\n' + z = '\n' tr.append(True) - ltr.append(u'') + ltr.append('') has_opened_tr.pop() has_opened_tr.append(True) l = ltd.pop() if td.pop(): - z = u'' + z - if fc == u'|': - l = u'td' - elif fc == u'!': - l = u'th' - elif fc == u'+': - l = u'caption' + z = '' + z + if fc == '|': + l = 'td' + elif fc == '!': + l = 'th' + elif fc == '+': + l = 'caption' else: - l = u'' + l = '' ltd.append(l) #Cell parameters - y = theline.split(u'|', 1) + y = theline.split('|', 1) # Note that a '|' inside an invalid link should not # be mistaken as delimiting cell parameters - if y[0].find(u'[[') != -1: + if y[0].find('[[') != -1: y = [theline] if len(y) == 1: - y = z + u"<" + l + u">" + y[0] + y = z + "<" + l + ">" + y[0] else: attributes = self.unstripForHTML(y[0]) - y = z + u"<" + l + self.fixTagAttributes(attributes, l) + u">" + y[1] + y = z + "<" + l + self.fixTagAttributes(attributes, l) + ">" + y[1] t[k] += y td.append(True) @@ -1760,17 +1765,17 @@ def doTableStuff(self, text): while len(td) > 0: l = ltd.pop() if td.pop(): - t.append(u'') + t.append('') if tr.pop(): - t.append(u'') + t.append('') if not has_opened_tr.pop(): - t.append(u'') - t.append(u'') + t.append('') + t.append('') - text = u'\n'.join(t) + text = '\n'.join(t) # special case: don't return empty table - if text == u"\n\n
    ": - text = u'' + if text == "\n\n
    ": + text = '' return text @@ -1788,9 +1793,9 @@ def formatHeadings(self, text, isMain): doNumberHeadings = False showEditLink = True # Can User Edit - if text.find(u"__NOEDITSECTION__") != -1: + if text.find("__NOEDITSECTION__") != -1: showEditLink = False - text = text.replace(u"__NOEDITSECTION__", u"") + text = text.replace("__NOEDITSECTION__", "") # Get all headlines for numbering them and adding funky stuff like [edit] # links - this is for later, but we need the number of headlines right now @@ -1799,20 +1804,20 @@ def formatHeadings(self, text, isMain): # if there are fewer than 4 headlines in the article, do not show TOC # unless it's been explicitly enabled. - enoughToc = self.show_toc and (numMatches >= 4 or text.find(u"") != -1) + enoughToc = self.show_toc and (numMatches >= 4 or text.find("") != -1) # Allow user to stipulate that a page should have a "new section" # link added via __NEWSECTIONLINK__ showNewSection = False - if text.find(u"__NEWSECTIONLINK__") != -1: + if text.find("__NEWSECTIONLINK__") != -1: showNewSection = True - text = text.replace(u"__NEWSECTIONLINK__", u"") + text = text.replace("__NEWSECTIONLINK__", "") # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML, # override above conditions and always show TOC above first header - if text.find(u"__FORCETOC__") != -1: + if text.find("__FORCETOC__") != -1: self.show_toc = True enoughToc = True - text = text.replace(u"__FORCETOC__", u"") + text = text.replace("__FORCETOC__", "") # Never ever show TOC if no headers if numMatches < 1: enoughToc = False @@ -1839,7 +1844,7 @@ def formatHeadings(self, text, isMain): for match in matches: headline = match[2] istemplate = False - templatetitle = u'' + templatetitle = '' templatesection = 0 numbering = [] @@ -1848,20 +1853,20 @@ def formatHeadings(self, text, isMain): istemplate = True templatetitle = b64decode(m[0]) templatesection = 1 + int(b64decode(m[1])) - headline = _templateSectionPat.sub(u'', headline) + headline = _templateSectionPat.sub('', headline) if toclevel: prevlevel = level prevtoclevel = toclevel - level = matches[headlineCount][0] + level = int(matches[headlineCount][0]) if doNumberHeadings or enoughToc: if level > prevlevel: toclevel += 1 sublevelCount[toclevel] = 0 if toclevel < wgMaxTocLevel: - toc.append(u'\n
      ') + toc.append('\n
        ') elif level < prevlevel and toclevel > 1: # Decrease TOC level, find level to jump to @@ -1877,11 +1882,11 @@ def formatHeadings(self, text, isMain): toclevel = i + 1 break if toclevel < wgMaxTocLevel: - toc.append(u"\n") - toc.append(u"
      \n\n" * max(prevtoclevel - toclevel, 0)) + toc.append("\n") + toc.append("
    \n\n" * max(prevtoclevel - toclevel, 0)) else: if toclevel < wgMaxTocLevel: - toc.append(u"\n") + toc.append("\n") levelCount[toclevel] = level @@ -1909,7 +1914,7 @@ def formatHeadings(self, text, isMain): # $canonized_headline ); # strip out HTML - canonized_headline = _tagPat.sub(u'', canonized_headline) + canonized_headline = _tagPat.sub('', canonized_headline) tocline = canonized_headline.strip() # Save headline for section edit hint before it's escaped headline_hint = tocline @@ -1928,23 +1933,23 @@ def formatHeadings(self, text, isMain): # Don't number the heading if it is the only one (looks silly) if doNumberHeadings and numMatches > 1: # the two are different if the line contains a link - headline = numbering + u' ' + headline + headline = numbering + ' ' + headline # Create the anchor for linking from the TOC to the section anchor = canonized_headline; if refcount[headlineCount] > 1: - anchor += u'_' + unicode(refcount[headlineCount]) + anchor += '_' + six.text_type(refcount[headlineCount]) if enoughToc: - toc.append(u'\n
  • ') + toc.append('">') toc.append(numbering) - toc.append(u' ') + toc.append(' ') toc.append(tocline) - toc.append(u'') + toc.append('') # if showEditLink and (not istemplate or templatetitle != u""): # if not head[headlineCount]: @@ -1959,16 +1964,16 @@ def formatHeadings(self, text, isMain): if headlineCount not in head: head[headlineCount] = [] h = head[headlineCount] - h.append(u'') h.append(matches[headlineCount][1].strip()) h.append(headline.strip()) - h.append(u'') + h.append('>') headlineCount += 1 @@ -1977,12 +1982,12 @@ def formatHeadings(self, text, isMain): if enoughToc: if toclevel < wgMaxTocLevel: - toc.append(u"
  • \n") - toc.append(u"\n\n" * max(0, toclevel - 1)) + toc.append("\n") + toc.append("\n\n" * max(0, toclevel - 1)) #TODO: use gettext #toc.insert(0, u'

    ' + _('Table of Contents') + '

    ') - toc.insert(0, u'

    Table of Contents

    ') - toc.append(u'\n
    ') + toc.insert(0, '

    Table of Contents

    ') + toc.append('\n
    ') # split up and insert constructed headlines @@ -1990,7 +1995,7 @@ def formatHeadings(self, text, isMain): i = 0 len_blocks = len(blocks) - forceTocPosition = text.find(u"") + forceTocPosition = text.find("") full = [] while i < len_blocks: j = i/4 @@ -2002,9 +2007,9 @@ def formatHeadings(self, text, isMain): full += head[j] head[j] = None i += 4 - full = u''.join(full) + full = ''.join(full) if forceTocPosition != -1: - return full.replace(u"", u''.join(toc), 1) + return full.replace("", ''.join(toc), 1) else: return full @@ -2061,18 +2066,18 @@ def to_unicode(text, charset=None): # two possibilities for storing unicode strings in exception data: try: # custom __str__ method on the exception (e.g. PermissionError) - return unicode(text) + return six.text_type(text) except UnicodeError: # unicode arguments given to the exception (e.g. parse_date) return ' '.join([to_unicode(arg) for arg in text.args]) - return unicode(text) + return six.text_type(text) if charset: - return unicode(text, charset, 'replace') + return six.ensure_text(text, charset, 'replace') else: try: - return unicode(text, 'utf-8') + return six.ensure_text(text, 'utf-8') except UnicodeError: - return unicode(text, locale.getpreferredencoding(), 'replace') + return six.ensure_text(text, locale.getpreferredencoding(), 'replace') # tag hooks mTagHooks = {} @@ -2090,12 +2095,12 @@ def to_unicode(text, charset=None): from cgi import escape def hook_quote(env, body, attributes={}): - text = [u'
    '] + text = ['
    '] if 'cite' in attributes: - text.append(u"%s wrote:\n" % escape(attributes['cite'])) + text.append("%s wrote:\n" % escape(attributes['cite'])) text.append(body.strip()) - text.append(u'
    ') - return u'\n'.join(text) + text.append('
    ') + return '\n'.join(text) registerTagHook('quote', hook_quote) def safe_name(name=None, remove_slashes=True): @@ -2115,10 +2120,6 @@ def str2url(str): Takes a UTF-8 string and replaces all characters with the equivalent in 7-bit ASCII. It returns a plain ASCII string usable in URLs. """ - try: - str = str.encode('utf-8') - except: - pass mfrom = "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîï" to = "AAAAAAECEEEEIIIIDNOOOOOOUUUUYSaaaaaaaceeeeiiii" mfrom += "ñòóôõöøùúûüýÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģ" diff --git a/setup.py b/setup.py index b019b85..b9f0d9f 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,8 @@ to HTML ''' +from __future__ import unicode_literals +from __future__ import absolute_import from setuptools import setup import mediawiki @@ -29,6 +31,7 @@ zip_safe=False, platforms='any', install_requires=[ + 'six', ], classifiers=[ 'Development Status :: 4 - Beta', @@ -37,6 +40,7 @@ 'License :: OSI Approved :: GNU General Public License (GPL)', 'Operating System :: OS Independent', 'Programming Language :: Python', + 'Programming Language :: Python :: 3', 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 'Topic :: Software Development :: Libraries :: Python Modules' ], diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..677743a --- /dev/null +++ b/tox.ini @@ -0,0 +1,8 @@ +[tox] +envlist = py{27,37} + +[testenv] +deps = + six +commands = + /bin/sh -c 'cd mediawiki/doc && python generate_syntax_demo.py | diff -u syntax.html -'