diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..84d28ed
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.tox
\ No newline at end of file
diff --git a/README.rst b/README.rst
index db68eb0..447de02 100644
--- a/README.rst
+++ b/README.rst
@@ -20,7 +20,7 @@ In order to use this tool to render wikitext into HTML in a Python program, you
source += line
wiki_content = wiki2html(source, True)
- print wiki_content
+ print(wiki_content)
Doc about Syntax
diff --git a/mediawiki/__init__.py b/mediawiki/__init__.py
index 9f54a16..62961ad 100644
--- a/mediawiki/__init__.py
+++ b/mediawiki/__init__.py
@@ -18,7 +18,9 @@
"""
-from wiki import *
+from __future__ import unicode_literals
+from __future__ import absolute_import
+from .wiki import *
__author__ = "Raimon Esteve Table of Contents
-Basic Wiki Editing
+Basic Wiki Ëditing
You can italicize text by putting 2
apostrophes on each side.
3 apostrophes will embolden the text.
@@ -38,6 +38,7 @@
5 apostrophes will embolden and italicize
the text.
(4 apostrophes don't do anything special -- there's just 'one left over'.)
+
unicodË
Links
You can give link to the other Web page over the Internet easily Visit Google
@@ -102,7 +103,7 @@
Table
|}
-
+
| Web site
| Link
diff --git a/mediawiki/wiki.py b/mediawiki/wiki.py
index f1db94f..d49d61c 100644
--- a/mediawiki/wiki.py
+++ b/mediawiki/wiki.py
@@ -18,15 +18,11 @@
"""
+from __future__ import unicode_literals
+from __future__ import absolute_import
import re
-import random
-import locale
-from base64 import b64encode
-from base64 import b64decode
-from StringIO import StringIO
-
-import wikimarkup
+from . import wikimarkup
_image = re.compile(r'img:(.*)\.(.*)', re.UNICODE)
_attach = re.compile(r'attach:(.*)\.(.*)', re.UNICODE)
diff --git a/mediawiki/wikimarkup/__init__.py b/mediawiki/wikimarkup/__init__.py
index b7d86b6..8676635 100644
--- a/mediawiki/wikimarkup/__init__.py
+++ b/mediawiki/wikimarkup/__init__.py
@@ -17,9 +17,14 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see .
"""
-
+from __future__ import unicode_literals
+from __future__ import absolute_import
+from collections import OrderedDict
import re, random, locale
from base64 import b64encode, b64decode
+import six
+from six.moves import range
+from six.moves import zip
# a few patterns we use later
@@ -32,394 +37,394 @@
MW_COLON_STATE_COMMENTDASH = 6
MW_COLON_STATE_COMMENTDASHDASH = 7
-_attributePat = re.compile(ur'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:"([^<"]*)"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@[\]^_`{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE)
-_space = re.compile(ur'\s+', re.UNICODE)
-_closePrePat = re.compile(u"]*?)(/?>)([^<]*)$', re.UNICODE)
+_attributePat = re.compile(r'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:"([^<"]*)"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@[\]^_`{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE)
+_space = re.compile(r'\s+', re.UNICODE)
+_closePrePat = re.compile("]*?)(/?>)([^<]*)$', re.UNICODE)
_htmlpairs = ( # Tags that must be closed
- u'b', u'del', u'i', u'ins', u'u', u'font', u'big', u'small', u'sub', u'sup', u'h1',
- u'h2', u'h3', u'h4', u'h5', u'h6', u'cite', u'code', u'em', u's',
- u'strike', u'strong', u'tt', u'var', u'div', u'center',
- u'blockquote', u'ol', u'ul', u'dl', u'table', u'caption', u'pre',
- u'ruby', u'rt' , u'rb' , u'rp', u'p', u'span', u'u',
+ 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
+ 'strike', 'strong', 'tt', 'var', 'div', 'center',
+ 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
+ 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u',
)
_htmlsingle = (
- u'br', u'hr', u'li', u'dt', u'dd', u'img',
+ 'br', 'hr', 'li', 'dt', 'dd', 'img',
)
_htmlsingleonly = ( # Elements that cannot have close tags
- u'br', u'hr', u'img',
+ 'br', 'hr', 'img',
)
_htmlnest = ( # Tags that can be nested--??
- u'table', u'tr', u'td', u'th', u'div', u'blockquote', u'ol', u'ul',
- u'dl', u'font', u'big', u'small', u'sub', u'sup', u'span', u'img',
+ 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
+ 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span', 'img',
)
_tabletags = ( # Can only appear inside table
- u'td', u'th', u'tr',
+ 'td', 'th', 'tr',
)
_htmllist = ( # Tags used by list
- u'ul', u'ol',
+ 'ul', 'ol',
)
_listtags = ( # Tags that can appear in a list
- u'li',
+ 'li',
)
_htmlsingleallowed = _htmlsingle + _tabletags
_htmlelements = _htmlsingle + _htmlpairs + _htmlnest
_htmlEntities = {
- u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180,
- u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501,
- u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
- u'aring': 229,
- u'asymp': 8776,
- u'Atilde': 195,
- u'atilde': 227,
- u'Auml': 196,
- u'auml': 228,
- u'bdquo': 8222,
- u'Beta': 914,
- u'beta': 946,
- u'brvbar': 166,
- u'bull': 8226,
- u'cap': 8745,
- u'Ccedil': 199,
- u'ccedil': 231,
- u'cedil': 184,
- u'cent': 162,
- u'Chi': 935,
- u'chi': 967,
- u'circ': 710,
- u'clubs': 9827,
- u'cong': 8773,
- u'copy': 169,
- u'crarr': 8629,
- u'cup': 8746,
- u'curren': 164,
- u'dagger': 8224,
- u'Dagger': 8225,
- u'darr': 8595,
- u'dArr': 8659,
- u'deg': 176,
- u'Delta': 916,
- u'delta': 948,
- u'diams': 9830,
- u'divide': 247,
- u'Eacute': 201,
- u'eacute': 233,
- u'Ecirc': 202,
- u'ecirc': 234,
- u'Egrave': 200,
- u'egrave': 232,
- u'empty': 8709,
- u'emsp': 8195,
- u'ensp': 8194,
- u'Epsilon': 917,
- u'epsilon': 949,
- u'equiv': 8801,
- u'Eta': 919,
- u'eta': 951,
- u'ETH': 208,
- u'eth': 240,
- u'Euml': 203,
- u'euml': 235,
- u'euro': 8364,
- u'exist': 8707,
- u'fnof': 402,
- u'forall': 8704,
- u'frac12': 189,
- u'frac14': 188,
- u'frac34': 190,
- u'frasl': 8260,
- u'Gamma': 915,
- u'gamma': 947,
- u'ge': 8805,
- u'gt': 62,
- u'harr': 8596,
- u'hArr': 8660,
- u'hearts': 9829,
- u'hellip': 8230,
- u'Iacute': 205,
- u'iacute': 237,
- u'Icirc': 206,
- u'icirc': 238,
- u'iexcl': 161,
- u'Igrave': 204,
- u'igrave': 236,
- u'image': 8465,
- u'infin': 8734,
- u'int': 8747,
- u'Iota': 921,
- u'iota': 953,
- u'iquest': 191,
- u'isin': 8712,
- u'Iuml': 207,
- u'iuml': 239,
- u'Kappa': 922,
- u'kappa': 954,
- u'Lambda': 923,
- u'lambda': 955,
- u'lang': 9001,
- u'laquo': 171,
- u'larr': 8592,
- u'lArr': 8656,
- u'lceil': 8968,
- u'ldquo': 8220,
- u'le': 8804,
- u'lfloor': 8970,
- u'lowast': 8727,
- u'loz': 9674,
- u'lrm': 8206,
- u'lsaquo': 8249,
- u'lsquo': 8216,
- u'lt': 60,
- u'macr': 175,
- u'mdash': 8212,
- u'micro': 181,
- u'middot': 183,
- u'minus': 8722,
- u'Mu': 924,
- u'mu': 956,
- u'nabla': 8711,
- u'nbsp': 160,
- u'ndash': 8211,
- u'ne': 8800,
- u'ni': 8715,
- u'not': 172,
- u'notin': 8713,
- u'nsub': 8836,
- u'Ntilde': 209,
- u'ntilde': 241,
- u'Nu': 925,
- u'nu': 957,
- u'Oacute': 211,
- u'oacute': 243,
- u'Ocirc': 212,
- u'ocirc': 244,
- u'OElig': 338,
- u'oelig': 339,
- u'Ograve': 210,
- u'ograve': 242,
- u'oline': 8254,
- u'Omega': 937,
- u'omega': 969,
- u'Omicron': 927,
- u'omicron': 959,
- u'oplus': 8853,
- u'or': 8744,
- u'ordf': 170,
- u'ordm': 186,
- u'Oslash': 216,
- u'oslash': 248,
- u'Otilde': 213,
- u'otilde': 245,
- u'otimes': 8855,
- u'Ouml': 214,
- u'ouml': 246,
- u'para': 182,
- u'part': 8706,
- u'permil': 8240,
- u'perp': 8869,
- u'Phi': 934,
- u'phi': 966,
- u'Pi': 928,
- u'pi': 960,
- u'piv': 982,
- u'plusmn': 177,
- u'pound': 163,
- u'prime': 8242,
- u'Prime': 8243,
- u'prod': 8719,
- u'prop': 8733,
- u'Psi': 936,
- u'psi': 968,
- u'quot': 34,
- u'radic': 8730,
- u'rang': 9002,
- u'raquo': 187,
- u'rarr': 8594,
- u'rArr': 8658,
- u'rceil': 8969,
- u'rdquo': 8221,
- u'real': 8476,
- u'reg': 174,
- u'rfloor': 8971,
- u'Rho': 929,
- u'rho': 961,
- u'rlm': 8207,
- u'rsaquo': 8250,
- u'rsquo': 8217,
- u'sbquo': 8218,
- u'Scaron': 352,
- u'scaron': 353,
- u'sdot': 8901,
- u'sect': 167,
- u'shy': 173,
- u'Sigma': 931,
- u'sigma': 963,
- u'sigmaf': 962,
- u'sim': 8764,
- u'spades': 9824,
- u'sub': 8834,
- u'sube': 8838,
- u'sum': 8721,
- u'sup': 8835,
- u'sup1': 185,
- u'sup2': 178,
- u'sup3': 179,
- u'supe': 8839,
- u'szlig': 223,
- u'Tau': 932,
- u'tau': 964,
- u'there4': 8756,
- u'Theta': 920,
- u'theta': 952,
- u'thetasym': 977,
- u'thinsp': 8201,
- u'THORN': 222,
- u'thorn': 254,
- u'tilde': 732,
- u'times': 215,
- u'trade': 8482,
- u'Uacute': 218,
- u'uacute': 250,
- u'uarr': 8593,
- u'uArr': 8657,
- u'Ucirc': 219,
- u'ucirc': 251,
- u'Ugrave': 217,
- u'ugrave': 249,
- u'uml': 168,
- u'upsih': 978,
- u'Upsilon': 933,
- u'upsilon': 965,
- u'Uuml': 220,
- u'uuml': 252,
- u'weierp': 8472,
- u'Xi': 926,
- u'xi': 958,
- u'Yacute': 221,
- u'yacute': 253,
- u'yen': 165,
- u'Yuml': 376,
- u'yuml': 255,
- u'Zeta': 918,
- u'zeta': 950,
- u'zwj': 8205,
- u'zwnj': 8204
+ 'Aacute': 193, 'aacute': 225, 'Acirc': 194, 'acirc': 226, 'acute': 180,
+ 'AElig': 198, 'aelig': 230, 'Agrave': 192, 'agrave': 224, 'alefsym': 8501,
+ 'Alpha': 913, 'alpha': 945, 'amp': 38, 'and': 8743, 'ang': 8736, 'Aring': 197,
+ 'aring': 229,
+ 'asymp': 8776,
+ 'Atilde': 195,
+ 'atilde': 227,
+ 'Auml': 196,
+ 'auml': 228,
+ 'bdquo': 8222,
+ 'Beta': 914,
+ 'beta': 946,
+ 'brvbar': 166,
+ 'bull': 8226,
+ 'cap': 8745,
+ 'Ccedil': 199,
+ 'ccedil': 231,
+ 'cedil': 184,
+ 'cent': 162,
+ 'Chi': 935,
+ 'chi': 967,
+ 'circ': 710,
+ 'clubs': 9827,
+ 'cong': 8773,
+ 'copy': 169,
+ 'crarr': 8629,
+ 'cup': 8746,
+ 'curren': 164,
+ 'dagger': 8224,
+ 'Dagger': 8225,
+ 'darr': 8595,
+ 'dArr': 8659,
+ 'deg': 176,
+ 'Delta': 916,
+ 'delta': 948,
+ 'diams': 9830,
+ 'divide': 247,
+ 'Eacute': 201,
+ 'eacute': 233,
+ 'Ecirc': 202,
+ 'ecirc': 234,
+ 'Egrave': 200,
+ 'egrave': 232,
+ 'empty': 8709,
+ 'emsp': 8195,
+ 'ensp': 8194,
+ 'Epsilon': 917,
+ 'epsilon': 949,
+ 'equiv': 8801,
+ 'Eta': 919,
+ 'eta': 951,
+ 'ETH': 208,
+ 'eth': 240,
+ 'Euml': 203,
+ 'euml': 235,
+ 'euro': 8364,
+ 'exist': 8707,
+ 'fnof': 402,
+ 'forall': 8704,
+ 'frac12': 189,
+ 'frac14': 188,
+ 'frac34': 190,
+ 'frasl': 8260,
+ 'Gamma': 915,
+ 'gamma': 947,
+ 'ge': 8805,
+ 'gt': 62,
+ 'harr': 8596,
+ 'hArr': 8660,
+ 'hearts': 9829,
+ 'hellip': 8230,
+ 'Iacute': 205,
+ 'iacute': 237,
+ 'Icirc': 206,
+ 'icirc': 238,
+ 'iexcl': 161,
+ 'Igrave': 204,
+ 'igrave': 236,
+ 'image': 8465,
+ 'infin': 8734,
+ 'int': 8747,
+ 'Iota': 921,
+ 'iota': 953,
+ 'iquest': 191,
+ 'isin': 8712,
+ 'Iuml': 207,
+ 'iuml': 239,
+ 'Kappa': 922,
+ 'kappa': 954,
+ 'Lambda': 923,
+ 'lambda': 955,
+ 'lang': 9001,
+ 'laquo': 171,
+ 'larr': 8592,
+ 'lArr': 8656,
+ 'lceil': 8968,
+ 'ldquo': 8220,
+ 'le': 8804,
+ 'lfloor': 8970,
+ 'lowast': 8727,
+ 'loz': 9674,
+ 'lrm': 8206,
+ 'lsaquo': 8249,
+ 'lsquo': 8216,
+ 'lt': 60,
+ 'macr': 175,
+ 'mdash': 8212,
+ 'micro': 181,
+ 'middot': 183,
+ 'minus': 8722,
+ 'Mu': 924,
+ 'mu': 956,
+ 'nabla': 8711,
+ 'nbsp': 160,
+ 'ndash': 8211,
+ 'ne': 8800,
+ 'ni': 8715,
+ 'not': 172,
+ 'notin': 8713,
+ 'nsub': 8836,
+ 'Ntilde': 209,
+ 'ntilde': 241,
+ 'Nu': 925,
+ 'nu': 957,
+ 'Oacute': 211,
+ 'oacute': 243,
+ 'Ocirc': 212,
+ 'ocirc': 244,
+ 'OElig': 338,
+ 'oelig': 339,
+ 'Ograve': 210,
+ 'ograve': 242,
+ 'oline': 8254,
+ 'Omega': 937,
+ 'omega': 969,
+ 'Omicron': 927,
+ 'omicron': 959,
+ 'oplus': 8853,
+ 'or': 8744,
+ 'ordf': 170,
+ 'ordm': 186,
+ 'Oslash': 216,
+ 'oslash': 248,
+ 'Otilde': 213,
+ 'otilde': 245,
+ 'otimes': 8855,
+ 'Ouml': 214,
+ 'ouml': 246,
+ 'para': 182,
+ 'part': 8706,
+ 'permil': 8240,
+ 'perp': 8869,
+ 'Phi': 934,
+ 'phi': 966,
+ 'Pi': 928,
+ 'pi': 960,
+ 'piv': 982,
+ 'plusmn': 177,
+ 'pound': 163,
+ 'prime': 8242,
+ 'Prime': 8243,
+ 'prod': 8719,
+ 'prop': 8733,
+ 'Psi': 936,
+ 'psi': 968,
+ 'quot': 34,
+ 'radic': 8730,
+ 'rang': 9002,
+ 'raquo': 187,
+ 'rarr': 8594,
+ 'rArr': 8658,
+ 'rceil': 8969,
+ 'rdquo': 8221,
+ 'real': 8476,
+ 'reg': 174,
+ 'rfloor': 8971,
+ 'Rho': 929,
+ 'rho': 961,
+ 'rlm': 8207,
+ 'rsaquo': 8250,
+ 'rsquo': 8217,
+ 'sbquo': 8218,
+ 'Scaron': 352,
+ 'scaron': 353,
+ 'sdot': 8901,
+ 'sect': 167,
+ 'shy': 173,
+ 'Sigma': 931,
+ 'sigma': 963,
+ 'sigmaf': 962,
+ 'sim': 8764,
+ 'spades': 9824,
+ 'sub': 8834,
+ 'sube': 8838,
+ 'sum': 8721,
+ 'sup': 8835,
+ 'sup1': 185,
+ 'sup2': 178,
+ 'sup3': 179,
+ 'supe': 8839,
+ 'szlig': 223,
+ 'Tau': 932,
+ 'tau': 964,
+ 'there4': 8756,
+ 'Theta': 920,
+ 'theta': 952,
+ 'thetasym': 977,
+ 'thinsp': 8201,
+ 'THORN': 222,
+ 'thorn': 254,
+ 'tilde': 732,
+ 'times': 215,
+ 'trade': 8482,
+ 'Uacute': 218,
+ 'uacute': 250,
+ 'uarr': 8593,
+ 'uArr': 8657,
+ 'Ucirc': 219,
+ 'ucirc': 251,
+ 'Ugrave': 217,
+ 'ugrave': 249,
+ 'uml': 168,
+ 'upsih': 978,
+ 'Upsilon': 933,
+ 'upsilon': 965,
+ 'Uuml': 220,
+ 'uuml': 252,
+ 'weierp': 8472,
+ 'Xi': 926,
+ 'xi': 958,
+ 'Yacute': 221,
+ 'yacute': 253,
+ 'yen': 165,
+ 'Yuml': 376,
+ 'yuml': 255,
+ 'Zeta': 918,
+ 'zeta': 950,
+ 'zwj': 8205,
+ 'zwnj': 8204
}
-_charRefsPat = re.compile(ur'''(&([A-Za-z0-9]+);|([0-9]+);|[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE)
-_cssCommentPat = re.compile(ur'''\*.*?\*''', re.UNICODE)
-_toUTFPat = re.compile(ur'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE)
-_hackPat = re.compile(ur'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE)
-_hrPat = re.compile(u'''^-----*''', re.UNICODE | re.MULTILINE)
-_h1Pat = re.compile(u'^=(.+)=\s*$', re.UNICODE | re.MULTILINE)
-_h2Pat = re.compile(u'^==(.+)==\s*$', re.UNICODE | re.MULTILINE)
-_h3Pat = re.compile(u'^===(.+)===\s*$', re.UNICODE | re.MULTILINE)
-_h4Pat = re.compile(u'^====(.+)====\s*$', re.UNICODE | re.MULTILINE)
-_h5Pat = re.compile(u'^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE)
-_h6Pat = re.compile(u'^======(.+)======\s*$', re.UNICODE | re.MULTILINE)
-_quotePat = re.compile(u"""(''+)""", re.UNICODE)
-_removePat = re.compile(ur'\b(' + ur'|'.join((u"a", u"an", u"as", u"at", u"before", u"but", u"by", u"for", u"from",
- u"is", u"in", u"into", u"like", u"of", u"off", u"on", u"onto", u"per",
- u"since", u"than", u"the", u"this", u"that", u"to", u"up", u"via",
- u"with")) + ur')\b', re.UNICODE | re.IGNORECASE)
-_nonWordSpaceDashPat = re.compile(ur'[^\w\s\-\./]', re.UNICODE)
-_multiSpacePat = re.compile(ur'[\s\-_\./]+', re.UNICODE)
-_spacePat = re.compile(ur' ', re.UNICODE)
-_linkPat = re.compile(ur'^(?:([A-Za-z0-9]+):)?([^\|]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL)
-_bracketedLinkPat = re.compile(ur'(?:\[((?:mailto:|irc://|https?://|ftp://|/)[^<>\]\[' + u"\x00-\x20\x7f" + ur']*)\s*(.*?)\])', re.UNICODE)
-_protocolPat = re.compile(ur'(\b(?:mailto:|irc://|https?://|ftp://))', re.UNICODE)
-_specialUrlPat = re.compile(ur'^([^<>\]\[' + u"\x00-\x20\x7f" + ur']+)(.*)$', re.UNICODE)
-_protocolsPat = re.compile(ur'^(mailto:|irc://|https?://|ftp://)$', re.UNICODE)
-_controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']]', re.UNICODE)
-_hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE)
-_stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE)
-_zomgPat = re.compile(ur'^(:*)\{\|(.*)$', re.UNICODE)
-_headerPat = re.compile(ur"<[Hh]([1-6])(.*?)>(.*?)[Hh][1-6] *>", re.UNICODE)
-_templateSectionPat = re.compile(ur"", re.UNICODE)
-_tagPat = re.compile(ur"<.*?>", re.UNICODE)
+_charRefsPat = re.compile(r'''(&([A-Za-z0-9]+);|([0-9]+);|[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE)
+_cssCommentPat = re.compile(r'''\*.*?\*''', re.UNICODE)
+_toUTFPat = re.compile(r'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE)
+_hackPat = re.compile(r'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE)
+_hrPat = re.compile('''^-----*''', re.UNICODE | re.MULTILINE)
+_h1Pat = re.compile('^=(.+)=\s*$', re.UNICODE | re.MULTILINE)
+_h2Pat = re.compile('^==(.+)==\s*$', re.UNICODE | re.MULTILINE)
+_h3Pat = re.compile('^===(.+)===\s*$', re.UNICODE | re.MULTILINE)
+_h4Pat = re.compile('^====(.+)====\s*$', re.UNICODE | re.MULTILINE)
+_h5Pat = re.compile('^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE)
+_h6Pat = re.compile('^======(.+)======\s*$', re.UNICODE | re.MULTILINE)
+_quotePat = re.compile("""(''+)""", re.UNICODE)
+_removePat = re.compile(r'\b(' + r'|'.join(("a", "an", "as", "at", "before", "but", "by", "for", "from",
+ "is", "in", "into", "like", "of", "off", "on", "onto", "per",
+ "since", "than", "the", "this", "that", "to", "up", "via",
+ "with")) + r')\b', re.UNICODE | re.IGNORECASE)
+_nonWordSpaceDashPat = re.compile(r'[^\w\s\-\./]', re.UNICODE)
+_multiSpacePat = re.compile(r'[\s\-_\./]+', re.UNICODE)
+_spacePat = re.compile(r' ', re.UNICODE)
+_linkPat = re.compile(r'^(?:([A-Za-z0-9]+):)?([^\|]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL)
+_bracketedLinkPat = re.compile(r'(?:\[((?:mailto:|irc://|https?://|ftp://|/)[^<>\]\[' + "\x00-\x20\x7f" + r']*)\s*(.*?)\])', re.UNICODE)
+_protocolPat = re.compile(r'(\b(?:mailto:|irc://|https?://|ftp://))', re.UNICODE)
+_specialUrlPat = re.compile(r'^([^<>\]\[' + "\x00-\x20\x7f" + r']+)(.*)$', re.UNICODE)
+_protocolsPat = re.compile(r'^(mailto:|irc://|https?://|ftp://)$', re.UNICODE)
+_controlCharsPat = re.compile(r'[\]\[<>"' + "\\x00-\\x20\\x7F" + r']]', re.UNICODE)
+_hostnamePat = re.compile(r'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE)
+_stripPat = re.compile('\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE)
+_zomgPat = re.compile(r'^(:*)\{\|(.*)$', re.UNICODE)
+_headerPat = re.compile(r"<[Hh]([1-6])(.*?)>(.*?)[Hh][1-6] *>", re.UNICODE)
+_templateSectionPat = re.compile(r"", re.UNICODE)
+_tagPat = re.compile(r"<.*?>", re.UNICODE)
_startRegexHash = {}
_endRegexHash = {}
-_endCommentPat = re.compile(ur'(-->)', re.UNICODE)
+_endCommentPat = re.compile(r'(-->)', re.UNICODE)
_extractTagsAndParams_n = 1
-_guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\302\273)', re.UNICODE)
-_guillemetRightPat = re.compile(ur'(\302\253) ', re.UNICODE)
+_guillemetLeftPat = re.compile(r'(.) (\?|:|;|!|\302\273)', re.UNICODE)
+_guillemetRightPat = re.compile(r'(\302\253) ', re.UNICODE)
def setupAttributeWhitelist():
- common = ( u'id', u'class', u'lang', u'dir', u'title', u'style' )
- block = common + (u'align',)
- tablealign = ( u'align', u'char', u'charoff', u'valign' )
- tablecell = ( u'abbr',
- u'axis',
- u'headers',
- u'scope',
- u'rowspan',
- u'colspan',
- u'nowrap', # deprecated
- u'width', # deprecated
- u'height', # deprecated
- u'bgcolor' # deprecated
+ common = ( 'id', 'class', 'lang', 'dir', 'title', 'style' )
+ block = common + ('align',)
+ tablealign = ( 'align', 'char', 'charoff', 'valign' )
+ tablecell = ( 'abbr',
+ 'axis',
+ 'headers',
+ 'scope',
+ 'rowspan',
+ 'colspan',
+ 'nowrap', # deprecated
+ 'width', # deprecated
+ 'height', # deprecated
+ 'bgcolor' # deprecated
)
return {
- u'div': block,
- u'center': common, # deprecated
- u'span': block, # ??
- u'h1': block,
- u'h2': block,
- u'h3': block,
- u'h4': block,
- u'h5': block,
- u'h6': block,
- u'em': common,
- u'strong': common,
- u'cite': common,
- u'code': common,
- u'var': common,
- u'img': common + (u'src', u'alt', u'width', u'height',),
- u'blockquote': common + (u'cite',),
- u'sub': common,
- u'sup': common,
- u'p': block,
- u'br': (u'id', u'class', u'title', u'style', u'clear',),
- u'pre': common + (u'width',),
- u'ins': common + (u'cite', u'datetime'),
- u'del': common + (u'cite', u'datetime'),
- u'ul': common + (u'type',),
- u'ol': common + (u'type', u'start'),
- u'li': common + (u'type', u'value'),
- u'dl': common,
- u'dd': common,
- u'dt': common,
- u'table': common + ( u'summary', u'width', u'border', u'frame',
- u'rules', u'cellspacing', u'cellpadding',
- u'align', u'bgcolor',
+ 'div': block,
+ 'center': common, # deprecated
+ 'span': block, # ??
+ 'h1': block,
+ 'h2': block,
+ 'h3': block,
+ 'h4': block,
+ 'h5': block,
+ 'h6': block,
+ 'em': common,
+ 'strong': common,
+ 'cite': common,
+ 'code': common,
+ 'var': common,
+ 'img': common + ('src', 'alt', 'width', 'height',),
+ 'blockquote': common + ('cite',),
+ 'sub': common,
+ 'sup': common,
+ 'p': block,
+ 'br': ('id', 'class', 'title', 'style', 'clear',),
+ 'pre': common + ('width',),
+ 'ins': common + ('cite', 'datetime'),
+ 'del': common + ('cite', 'datetime'),
+ 'ul': common + ('type',),
+ 'ol': common + ('type', 'start'),
+ 'li': common + ('type', 'value'),
+ 'dl': common,
+ 'dd': common,
+ 'dt': common,
+ 'table': common + ( 'summary', 'width', 'border', 'frame',
+ 'rules', 'cellspacing', 'cellpadding',
+ 'align', 'bgcolor',
),
- u'caption': common + (u'align',),
- u'thead': common + tablealign,
- u'tfoot': common + tablealign,
- u'tbody': common + tablealign,
- u'colgroup': common + ( u'span', u'width' ) + tablealign,
- u'col': common + ( u'span', u'width' ) + tablealign,
- u'tr': common + ( u'bgcolor', ) + tablealign,
- u'td': common + tablecell + tablealign,
- u'th': common + tablecell + tablealign,
- u'tt': common,
- u'b': common,
- u'i': common,
- u'big': common,
- u'small': common,
- u'strike': common,
- u's': common,
- u'u': common,
- u'font': common + ( u'size', u'color', u'face' ),
- u'hr': common + ( u'noshade', u'size', u'width' ),
- u'ruby': common,
- u'rb': common,
- u'rt': common, #array_merge( $common, array( 'rbspan' ) ),
- u'rp': common,
+ 'caption': common + ('align',),
+ 'thead': common + tablealign,
+ 'tfoot': common + tablealign,
+ 'tbody': common + tablealign,
+ 'colgroup': common + ( 'span', 'width' ) + tablealign,
+ 'col': common + ( 'span', 'width' ) + tablealign,
+ 'tr': common + ( 'bgcolor', ) + tablealign,
+ 'td': common + tablecell + tablealign,
+ 'th': common + tablecell + tablealign,
+ 'tt': common,
+ 'b': common,
+ 'i': common,
+ 'big': common,
+ 'small': common,
+ 'strike': common,
+ 's': common,
+ 'u': common,
+ 'font': common + ( 'size', 'color', 'face' ),
+ 'hr': common + ( 'noshade', 'size', 'width' ),
+ 'ruby': common,
+ 'rb': common,
+ 'rt': common, #array_merge( $common, array( 'rbspan' ) ),
+ 'rp': common,
}
_whitelist = setupAttributeWhitelist()
_page_cache = {}
@@ -430,7 +435,7 @@ def registerTagHook(tag, function):
class BaseParser(object):
def __init__(self):
- self.uniq_prefix = u"\x07UNIQ" + unicode(random.randint(1, 1000000000))
+ self.uniq_prefix = "\x07UNIQ" + six.text_type(random.randint(1, 1000000000))
self.strip_state = {}
self.arg_stack = []
self.env = env
@@ -466,8 +471,8 @@ def retrieve_object(self, namespace, key, default=None):
def parse(self, text):
utf8 = isinstance(text, str)
text = to_unicode(text)
- if text[-1:] != u'\n':
- text = text + u'\n'
+ if text[-1:] != '\n':
+ text = text + '\n'
taggedNewline = True
else:
taggedNewline = False
@@ -481,9 +486,9 @@ def parse(self, text):
text = self.fixtags(text)
text = self.doBlockLevels(text, True)
text = self.unstripNoWiki(text)
- text = text.split(u'\n')
- text = u'\n'.join(text)
- if taggedNewline and text[-1:] == u'\n':
+ text = text.split('\n')
+ text = '\n'.join(text)
+ if taggedNewline and text[-1:] == '\n':
text = text[:-1]
if utf8:
return text.encode("utf-8")
@@ -494,7 +499,7 @@ def strip(self, text, stripcomments=False, dontstrip=[]):
commentState = {}
- elements = ['nowiki',] + mTagHooks.keys()
+ elements = ['nowiki',] + list(mTagHooks.keys())
if True: #wgRawHtml
elements.append('html')
@@ -510,20 +515,20 @@ def strip(self, text, stripcomments=False, dontstrip=[]):
element, content, params, tag = matches[marker]
if render:
tagName = element.lower()
- if tagName == u'!--':
+ if tagName == '!--':
# comment
output = tag
- if tag[-3:] != u'-->':
+ if tag[-3:] != '-->':
output += "-->"
- elif tagName == u'html':
+ elif tagName == 'html':
output = content
- elif tagName == u'nowiki':
- output = content.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>')
+ elif tagName == 'nowiki':
+ output = content.replace('&', '&').replace('<', '<').replace('>', '>')
else:
if tagName in mTagHooks:
output = mTagHooks[tagName](self, content, params)
else:
- output = content.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>')
+ output = content.replace('&', '&').replace('<', '<').replace('>', '>')
else:
# Just stripping tags; keep the source
output = tag
@@ -532,9 +537,9 @@ def strip(self, text, stripcomments=False, dontstrip=[]):
# it won't do it itself
output = self.unstrip(output)
- if not stripcomments and element == u'!--':
+ if not stripcomments and element == '!--':
commentState[marker] = output
- elif element == u'html' or element == u'nowiki':
+ elif element == 'html' or element == 'nowiki':
if 'nowiki' not in self.strip_state:
self.strip_state['nowiki'] = {}
self.strip_state['nowiki'][marker] = output
@@ -559,7 +564,7 @@ def removeHtmlTags(self, text):
"""convert bad tags into HTML identities"""
sb = []
text = self.removeHtmlComments(text)
- bits = text.split(u'<')
+ bits = text.split('<')
sb.append(bits.pop(0))
tagstack = []
tablestack = tagstack
@@ -600,97 +605,97 @@ def removeHtmlTags(self, text):
# can be nested in or , skip those cases:
if ot not in _htmllist and t in _listtags:
badtag = True
- elif t == u'table':
+ elif t == 'table':
if len(tablestack) == 0:
bagtag = True
else:
tagstack = tablestack.pop()
- newparams = u''
+ newparams = ''
else:
# Keep track for later
- if t in _tabletags and u'table' not in tagstack:
+ if t in _tabletags and 'table' not in tagstack:
badtag = True
elif t in tagstack and t not in _htmlnest:
badtag = True
# Is it a self-closed htmlpair? (bug 5487)
- elif brace == u'/>' and t in _htmlpairs:
+ elif brace == '/>' and t in _htmlpairs:
badTag = True
elif t in _htmlsingleonly:
# Hack to force empty tag for uncloseable elements
- brace = u'/>'
+ brace = '/>'
elif t in _htmlsingle:
# Hack to not close $htmlsingle tags
brace = None
else:
- if t == u'table':
+ if t == 'table':
tablestack.append(tagstack)
tagstack = []
tagstack.append(t)
newparams = self.fixTagAttributes(params, t)
if not badtag:
- rest = rest.replace(u'>', u'>')
- if brace == u'/>':
- close = u' /'
+ rest = rest.replace('>', '>')
+ if brace == '/>':
+ close = ' /'
else:
- close = u''
- sb.append(u'<')
+ close = ''
+ sb.append('<')
sb.append(slash)
sb.append(t)
sb.append(newparams)
sb.append(close)
- sb.append(u'>')
+ sb.append('>')
sb.append(rest)
continue
- sb.append(u'<')
- sb.append(x.replace(u'>', u'>'))
+ sb.append('<')
+ sb.append(x.replace('>', '>'))
# Close off any remaining tags
while tagstack:
t = tagstack.pop()
- sb.append(u'')
+ sb.append('')
sb.append(t)
- sb.append(u'>\n')
- if t == u'table':
+ sb.append('>\n')
+ if t == 'table':
if not tablestack:
break
tagstack = tablestack.pop()
- return u''.join(sb)
+ return ''.join(sb)
def removeHtmlComments(self, text):
"""remove comments from given text"""
sb = []
- start = text.find(u'', start)
+ end = text.find('-->', start)
if end == -1:
break
end += 3
spaceStart = max(0, start-1)
spaceEnd = end
- while text[spaceStart] == u' ' and spaceStart > 0:
+ while text[spaceStart] == ' ' and spaceStart > 0:
spaceStart -= 1
- while text[spaceEnd] == u' ':
+ while text[spaceEnd] == ' ':
spaceEnd += 1
- if text[spaceStart] == u'\n' and text[spaceEnd] == u'\n':
+ if text[spaceStart] == '\n' and text[spaceEnd] == '\n':
sb.append(text[last:spaceStart])
- sb.append(u'\n')
+ sb.append('\n')
last = spaceEnd+1
else:
sb.append(text[last:spaceStart+1])
last = spaceEnd
- start = text.find(u''
+ result += ''
return result, mDTopen
def nextItem(self, char, mDTopen):
- if char == u'*' or char == '#':
- return u' ', None
- elif char == u':' or char == u';':
- close = u''
+ if char == '*' or char == '#':
+ return '', None
+ elif char == ':' or char == ';':
+ close = ''
if mDTopen:
close = ''
- if char == u';':
- return close + u'', True
+ if char == ';':
+ return close + '', True
else:
- return close + u'', False
- return u''
+ return close + '', False
+ return ''
def closeList(self, char, mDTopen):
- if char == u'*':
- return u'\n'
- elif char == u'#':
- return u'\n'
- elif char == u':':
+ if char == '*':
+ return '\n'
+ elif char == '#':
+ return '\n'
+ elif char == ':':
if mDTopen:
- return u'\n'
+ return '\n'
else:
- return u'\n'
+ return '\n'
else:
- return u''
+ return ''
def findColonNoLinks(self, text, before, after):
try:
@@ -1434,13 +1439,13 @@ def doBlockLevels(self, text, linestart):
# Parsing through the text line by line. The main thing
# happening here is handling of block-level elements p, pre,
# and making lists from lines starting with * # : etc.
- lastPrefix = u''
+ lastPrefix = ''
mDTopen = inBlockElem = False
prefixLength = 0
paragraphStack = False
- _closeMatchPat = re.compile(ur"( |
')
+ output.append(self.closeParagraph('') + '')
mInPre = False
- mLastSection = u'pre'
+ mLastSection = 'pre'
t = t[1:]
else:
# paragraph
- if t.strip() == u'':
+ if t.strip() == '':
if paragraphStack:
- output.append(paragraphStack + u'
')
+ output.append(paragraphStack + '
')
paragraphStack = False
- mLastSection = u'p'
+ mLastSection = 'p'
else:
- if mLastSection != u'p':
+ if mLastSection != 'p':
output.append(self.closeParagraph(mLastSection))
- mLastSection = u''
+ mLastSection = ''
mInPre = False
- paragraphStack = u''
+ paragraphStack = '
'
else:
- paragraphStack = u'
'
+ paragraphStack = '
'
else:
if paragraphStack:
output.append(paragraphStack)
paragraphStack = False
- mLastSection = u'p'
- elif mLastSection != u'p':
- output.append(self.closeParagraph(mLastSection) + u'
')
- mLastSection = u'p'
+ mLastSection = 'p'
+ elif mLastSection != 'p':
+ output.append(self.closeParagraph(mLastSection) + '
')
+ mLastSection = 'p'
mInPre = False
# somewhere above we forget to get out of pre block (bug 785)
@@ -1586,16 +1591,16 @@ def doBlockLevels(self, text, linestart):
mInPre = False
if paragraphStack == False:
- output.append(t + u"\n")
+ output.append(t + "\n")
while prefixLength:
output.append(self.closeList(pref2[prefixLength-1], mDTopen))
mDTopen = False
prefixLength -= 1
- if mLastSection != u'':
- output.append(u'' + mLastSection + u'>')
- mLastSection = u''
+ if mLastSection != '':
+ output.append('' + mLastSection + '>')
+ mLastSection = ''
return ''.join(output)
@@ -1605,10 +1610,10 @@ def __init__(self, show_toc=True):
self.show_toc = show_toc
def parse(self, text):
- utf8 = isinstance(text, str)
+ utf8 = isinstance(text, six.binary_type)
text = to_unicode(text)
- if text[-1:] != u'\n':
- text = text + u'\n'
+ if text[-1:] != '\n':
+ text = text + '\n'
taggedNewline = True
else:
taggedNewline = False
@@ -1621,32 +1626,32 @@ def parse(self, text):
text = self.parseHeaders(text)
text = self.parseAllQuotes(text)
text = self.replaceExternalLinks(text)
- if not self.show_toc and text.find(u"") == -1:
+ if not self.show_toc and text.find("") == -1:
self.show_toc = False
text = self.formatHeadings(text, True)
text = self.unstrip(text)
text = self.fixtags(text)
text = self.doBlockLevels(text, True)
text = self.unstripNoWiki(text)
- text = text.split(u'\n')
- text = u'\n'.join(text)
- if taggedNewline and text[-1:] == u'\n':
+ text = text.split('\n')
+ text = '\n'.join(text)
+ if taggedNewline and text[-1:] == '\n':
text = text[:-1]
if utf8:
return text.encode("utf-8")
return text
def checkTOC(self, text):
- if text.find(u"__NOTOC__") != -1:
- text = text.replace(u"__NOTOC__", u"")
+ if text.find("__NOTOC__") != -1:
+ text = text.replace("__NOTOC__", "")
self.show_toc = False
- if text.find(u"__TOC__") != -1:
- text = text.replace(u"__TOC__", u"")
+ if text.find("__TOC__") != -1:
+ text = text.replace("__TOC__", "")
self.show_toc = True
return text
def doTableStuff(self, text):
- t = text.split(u"\n")
+ t = text.split("\n")
td = [] # Is currently a td tag open?
ltd = [] # Was it TD or TH?
tr = [] # Is currently a tr tag open?
@@ -1654,7 +1659,7 @@ def doTableStuff(self, text):
has_opened_tr = [] # Did this table open a
element?
indent_level = 0 # indent level of the table
- for k, x in zip(range(len(t)), t):
+ for k, x in zip(list(range(len(t))), t):
x = x.strip()
fc = x[0:1]
matches = _zomgPat.match(x)
@@ -1663,96 +1668,96 @@ def doTableStuff(self, text):
attributes = self.unstripForHTML(matches.group(2))
- t[k] = u'- '*indent_level + u'
'
+ t[k] = '- '*indent_level + '
'
td.append(False)
- ltd.append(u'')
+ ltd.append('')
tr.append(False)
- ltr.append(u'')
+ ltr.append('')
has_opened_tr.append(False)
elif len(td) == 0:
pass
- elif u'|}' == x[0:2]:
- z = u"</table>" + x[2:]
+ elif '|}' == x[0:2]:
+ z = "</table>" + x[2:]
l = ltd.pop()
if not has_opened_tr.pop():
- z = u"<tr><td></td></tr>" + z
+ z = "<tr><td></td></tr>" + z
if tr.pop():
- z = u"</tr>" + z
+ z = "</tr>" + z
if td.pop():
- z = u'</' + l + u'>' + z
+ z = '</' + l + '>' + z
ltr.pop()
- t[k] = z + u'</dd></dl>'*indent_level
- elif u'|-' == x[0:2]: # Allows for |-------------
+ t[k] = z + '</dd></dl>'*indent_level
+ elif '|-' == x[0:2]: # Allows for |-------------
x = x[1:]
- while x != u'' and x[0:1] == '-':
+ while x != '' and x[0:1] == '-':
x = x[1:]
z = ''
l = ltd.pop()
has_opened_tr.pop()
has_opened_tr.append(True)
if tr.pop():
- z = u'</tr>' + z
+ z = '</tr>' + z
if td.pop():
- z = u'</' + l + u'>' + z
+ z = '</' + l + '>' + z
ltr.pop()
t[k] = z
tr.append(False)
td.append(False)
- ltd.append(u'')
+ ltd.append('')
attributes = self.unstripForHTML(x)
- ltr.append(self.fixTagAttributes(attributes, u'tr'))
- elif u'|' == fc or u'!' == fc or u'|+' == x[0:2]: # Caption
+ ltr.append(self.fixTagAttributes(attributes, 'tr'))
+ elif '|' == fc or '!' == fc or '|+' == x[0:2]: # Caption
# x is a table row
- if u'|+' == x[0:2]:
- fc = u'+'
+ if '|+' == x[0:2]:
+ fc = '+'
x = x[1:]
x = x[1:]
- if fc == u'!':
- x = x.replace(u'!!', u'||')
+ if fc == '!':
+ x = x.replace('!!', '||')
# Split up multiple cells on the same line.
# FIXME: This can result in improper nesting of tags processed
# by earlier parser steps, but should avoid splitting up eg
# attribute values containing literal "||".
- x = x.split(u'||')
+ x = x.split('||')
- t[k] = u''
+ t[k] = ''
# Loop through each table cell
for theline in x:
z = ''
- if fc != u'+':
+ if fc != '+':
tra = ltr.pop()
if not tr.pop():
- z = u'<tr' + tra + u'>\n'
+ z = '<tr' + tra + '>\n'
tr.append(True)
- ltr.append(u'')
+ ltr.append('')
has_opened_tr.pop()
has_opened_tr.append(True)
l = ltd.pop()
if td.pop():
- z = u'</' + l + u'>' + z
- if fc == u'|':
- l = u'td'
- elif fc == u'!':
- l = u'th'
- elif fc == u'+':
- l = u'caption'
+ z = '</' + l + '>' + z
+ if fc == '|':
+ l = 'td'
+ elif fc == '!':
+ l = 'th'
+ elif fc == '+':
+ l = 'caption'
else:
- l = u''
+ l = ''
ltd.append(l)
#Cell parameters
- y = theline.split(u'|', 1)
+ y = theline.split('|', 1)
# Note that a '|' inside an invalid link should not
# be mistaken as delimiting cell parameters
- if y[0].find(u'[[') != -1:
+ if y[0].find('[[') != -1:
y = [theline]
if len(y) == 1:
- y = z + u"<" + l + u">" + y[0]
+ y = z + "<" + l + ">" + y[0]
else:
attributes = self.unstripForHTML(y[0])
- y = z + u"<" + l + self.fixTagAttributes(attributes, l) + u">" + y[1]
+ y = z + "<" + l + self.fixTagAttributes(attributes, l) + ">" + y[1]
t[k] += y
td.append(True)
@@ -1760,17 +1765,17 @@ def doTableStuff(self, text):
while len(td) > 0:
l = ltd.pop()
if td.pop():
- t.append(u'</td>')
+ t.append('</td>')
if tr.pop():
- t.append(u'</tr>')
+ t.append('</tr>')
if not has_opened_tr.pop():
- t.append(u'<tr><td></td></tr>')
- t.append(u'</table>')
+ t.append('<tr><td></td></tr>')
+ t.append('</table>')
- text = u'\n'.join(t)
+ text = '\n'.join(t)
# special case: don't return empty table
- if text == u"<table><tr><td></td></tr></table>":
- text = u''
+ if text == "<table><tr><td></td></tr></table>":
+ text = ''
return text
@@ -1788,9 +1793,9 @@ def formatHeadings(self, text, isMain):
doNumberHeadings = False
showEditLink = True # Can User Edit
- if text.find(u"__NOEDITSECTION__") != -1:
+ if text.find("__NOEDITSECTION__") != -1:
showEditLink = False
- text = text.replace(u"__NOEDITSECTION__", u"")
+ text = text.replace("__NOEDITSECTION__", "")
# Get all headlines for numbering them and adding funky stuff like [edit]
# links - this is for later, but we need the number of headlines right now
@@ -1799,20 +1804,20 @@ def formatHeadings(self, text, isMain):
# if there are fewer than 4 headlines in the article, do not show TOC
# unless it's been explicitly enabled.
- enoughToc = self.show_toc and (numMatches >= 4 or text.find(u"") != -1)
+ enoughToc = self.show_toc and (numMatches >= 4 or text.find("") != -1)
# Allow user to stipulate that a page should have a "new section"
# link added via __NEWSECTIONLINK__
showNewSection = False
- if text.find(u"__NEWSECTIONLINK__") != -1:
+ if text.find("__NEWSECTIONLINK__") != -1:
showNewSection = True
- text = text.replace(u"__NEWSECTIONLINK__", u"")
+ text = text.replace("__NEWSECTIONLINK__", "")
# if the string __FORCETOC__ (not case-sensitive) occurs in the HTML,
# override above conditions and always show TOC above first header
- if text.find(u"__FORCETOC__") != -1:
+ if text.find("__FORCETOC__") != -1:
self.show_toc = True
enoughToc = True
- text = text.replace(u"__FORCETOC__", u"")
+ text = text.replace("__FORCETOC__", "")
# Never ever show TOC if no headers
if numMatches < 1:
enoughToc = False
@@ -1839,7 +1844,7 @@ def formatHeadings(self, text, isMain):
for match in matches:
headline = match[2]
istemplate = False
- templatetitle = u''
+ templatetitle = ''
templatesection = 0
numbering = []
@@ -1848,20 +1853,20 @@ def formatHeadings(self, text, isMain):
istemplate = True
templatetitle = b64decode(m[0])
templatesection = 1 + int(b64decode(m[1]))
- headline = _templateSectionPat.sub(u'', headline)
+ headline = _templateSectionPat.sub('', headline)
if toclevel:
prevlevel = level
prevtoclevel = toclevel
- level = matches[headlineCount][0]
+ level = int(matches[headlineCount][0])
if doNumberHeadings or enoughToc:
if level > prevlevel:
toclevel += 1
sublevelCount[toclevel] = 0
if toclevel < wgMaxTocLevel:
- toc.append(u'\n')
+ toc.append('\n')
elif level < prevlevel and toclevel > 1:
# Decrease TOC level, find level to jump to
@@ -1877,11 +1882,11 @@ def formatHeadings(self, text, isMain):
toclevel = i + 1
break
if toclevel < wgMaxTocLevel:
- toc.append(u"\n")
- toc.append(u"
\n\n" * max(prevtoclevel - toclevel, 0))
+ toc.append("\n")
+ toc.append("
\n\n" * max(prevtoclevel - toclevel, 0))
else:
if toclevel < wgMaxTocLevel:
- toc.append(u"\n")
+ toc.append("\n")
levelCount[toclevel] = level
@@ -1909,7 +1914,7 @@ def formatHeadings(self, text, isMain):
# $canonized_headline );
# strip out HTML
- canonized_headline = _tagPat.sub(u'', canonized_headline)
+ canonized_headline = _tagPat.sub('', canonized_headline)
tocline = canonized_headline.strip()
# Save headline for section edit hint before it's escaped
headline_hint = tocline
@@ -1928,23 +1933,23 @@ def formatHeadings(self, text, isMain):
# Don't number the heading if it is the only one (looks silly)
if doNumberHeadings and numMatches > 1:
# the two are different if the line contains a link
- headline = numbering + u' ' + headline
+ headline = numbering + ' ' + headline
# Create the anchor for linking from the TOC to the section
anchor = canonized_headline;
if refcount[headlineCount] > 1:
- anchor += u'_' + unicode(refcount[headlineCount])
+ anchor += '_' + six.text_type(refcount[headlineCount])
if enoughToc:
- toc.append(u'\n')
+ toc.append('">')
toc.append(numbering)
- toc.append(u' ')
+ toc.append(' ')
toc.append(tocline)
- toc.append(u'')
+ toc.append('')
# if showEditLink and (not istemplate or templatetitle != u""):
# if not head[headlineCount]:
@@ -1959,16 +1964,16 @@ def formatHeadings(self, text, isMain):
if headlineCount not in head:
head[headlineCount] = []
h = head[headlineCount]
- h.append(u'')
h.append(matches[headlineCount][1].strip())
h.append(headline.strip())
- h.append(u'')
+ h.append('>')
headlineCount += 1
@@ -1977,12 +1982,12 @@ def formatHeadings(self, text, isMain):
if enoughToc:
if toclevel < wgMaxTocLevel:
- toc.append(u"\n")
- toc.append(u"\n\n" * max(0, toclevel - 1))
+ toc.append("\n")
+ toc.append("\n\n" * max(0, toclevel - 1))
#TODO: use gettext
#toc.insert(0, u'' + _('Table of Contents') + '
')
- toc.insert(0, u'
Table of Contents
')
- toc.append(u'\n')
+ toc.insert(0, '
Table of Contents
')
+ toc.append('\n')
# split up and insert constructed headlines
@@ -1990,7 +1995,7 @@ def formatHeadings(self, text, isMain):
i = 0
len_blocks = len(blocks)
- forceTocPosition = text.find(u"")
+ forceTocPosition = text.find("")
full = []
while i < len_blocks:
j = i/4
@@ -2002,9 +2007,9 @@ def formatHeadings(self, text, isMain):
full += head[j]
head[j] = None
i += 4
- full = u''.join(full)
+ full = ''.join(full)
if forceTocPosition != -1:
- return full.replace(u"", u''.join(toc), 1)
+ return full.replace("", ''.join(toc), 1)
else:
return full
@@ -2061,18 +2066,18 @@ def to_unicode(text, charset=None):
# two possibilities for storing unicode strings in exception data:
try:
# custom __str__ method on the exception (e.g. PermissionError)
- return unicode(text)
+ return six.text_type(text)
except UnicodeError:
# unicode arguments given to the exception (e.g. parse_date)
return ' '.join([to_unicode(arg) for arg in text.args])
- return unicode(text)
+ return six.text_type(text)
if charset:
- return unicode(text, charset, 'replace')
+ return six.ensure_text(text, charset, 'replace')
else:
try:
- return unicode(text, 'utf-8')
+ return six.ensure_text(text, 'utf-8')
except UnicodeError:
- return unicode(text, locale.getpreferredencoding(), 'replace')
+ return six.ensure_text(text, locale.getpreferredencoding(), 'replace')
# tag hooks
mTagHooks = {}
@@ -2090,12 +2095,12 @@ def to_unicode(text, charset=None):
from cgi import escape
def hook_quote(env, body, attributes={}):
- text = [u'<blockquote>']
+ text = ['<blockquote>']
if 'cite' in attributes:
- text.append(u"%s wrote:\n" % escape(attributes['cite']))
+ text.append("%s wrote:\n" % escape(attributes['cite']))
text.append(body.strip())
- text.append(u'</blockquote>')
- return u'\n'.join(text)
+ text.append('</blockquote>')
+ return '\n'.join(text)
registerTagHook('quote', hook_quote)
def safe_name(name=None, remove_slashes=True):
@@ -2115,10 +2120,6 @@ def str2url(str):
Takes a UTF-8 string and replaces all characters with the equivalent in 7-bit
ASCII. It returns a plain ASCII string usable in URLs.
"""
- try:
- str = str.encode('utf-8')
- except:
- pass
mfrom = "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîï"
to = "AAAAAAECEEEEIIIIDNOOOOOOUUUUYSaaaaaaaceeeeiiii"
mfrom += "ñòóôõöøùúûüýÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģ"
diff --git a/setup.py b/setup.py
index b019b85..b9f0d9f 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,8 @@
to HTML
'''
+from __future__ import unicode_literals
+from __future__ import absolute_import
from setuptools import setup
import mediawiki
@@ -29,6 +31,7 @@
zip_safe=False,
platforms='any',
install_requires=[
+ 'six',
],
classifiers=[
'Development Status :: 4 - Beta',
@@ -37,6 +40,7 @@
'License :: OSI Approved :: GNU General Public License (GPL)',
'Operating System :: OS Independent',
'Programming Language :: Python',
+ 'Programming Language :: Python :: 3',
'Topic :: Internet :: WWW/HTTP :: Dynamic Content',
'Topic :: Software Development :: Libraries :: Python Modules'
],
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..677743a
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,8 @@
+[tox]
+envlist = py{27,37}
+
+[testenv]
+deps =
+ six
+commands =
+ /bin/sh -c 'cd mediawiki/doc && python generate_syntax_demo.py | diff -u syntax.html -'