From 1aa8a08a968ecc7c103fbc7cb7b821706b7c98a5 Mon Sep 17 00:00:00 2001 From: mszurap Date: Wed, 13 Jun 2012 21:44:00 +0200 Subject: [PATCH] Format additional special characters to unicode characters. --- util.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/util.py b/util.py index 61ff2a5..c81d1e2 100644 --- a/util.py +++ b/util.py @@ -241,7 +241,16 @@ def func(match): except Exception: return match.group(0) return entity.sub(func, text) - + +def replace_entities3(text): + entity = re.compile(r'&#x([abcdefABCDEF0-9]{2});') + def func(match): + try: + return unichr(int('0x'+match.group(1),0)) + except Exception: + return match.group(0) + return entity.sub(func, text) + def remove_markup(text): html = re.compile(r'<[^>]+>') return html.sub(' ', text) @@ -250,8 +259,10 @@ def format(text, max_length=400): previous = '' while text != previous: previous = text + text = text.replace('&amp;', '&') text = replace_entities1(text) text = replace_entities2(text) + text = replace_entities3(text) text = remove_markup(text) text = ' '.join(text.split()) if len(text) > max_length: