Skip to content

Commit 1625edb

Browse files
committed
Preprocess and sanitize with html5lib
Still lots more to do, as html5lib's sanitization likes to escape tags instead of dropping them. I ended up fixing up the html5lib docs while working on this: html5lib/html5lib-python#332
1 parent 33acfb9 commit 1625edb

File tree

5 files changed

+288
-15
lines changed

5 files changed

+288
-15
lines changed

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# is not compatible.
1818
'feedparser == 5.2.1',
1919
'simplejson >= 2.1.0', # for JSONEncoderForHTML
20+
'html5lib == 0.999999999',
2021
],
2122
tests_require=[
2223
'mock',

yarrharr/fetch.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import hashlib
3636

3737
import attr
38+
from django.db import transaction
3839
from django.utils import timezone
3940
import feedparser
4041
try:
@@ -49,11 +50,17 @@
4950
import pytz
5051

5152
from .models import Feed
53+
from .sanitize import html_to_text, sanitize_html
5254

5355

5456
log = Logger()
5557

5658

59+
# Disable feedparser's HTML sanitization, as it drops important information
60+
# (like YouTube embeds). We do our own sanitization with html5lib.
61+
feedparser.SANITIZE_HTML = False
62+
63+
5764
@attr.s(slots=True, frozen=True)
5865
class BadStatus(object):
5966
"""
@@ -110,7 +117,7 @@ class MaybeUpdated(object):
110117
def persist(self, feed):
111118
feed.last_checked = timezone.now()
112119
feed.error = u''
113-
feed.feed_title = self.feed_title
120+
feed.feed_title = html_to_text(self.feed_title)
114121
feed.site_url = self.site_url
115122
feed.etag = self.etag
116123
feed.last_modified = self.last_modified
@@ -140,29 +147,29 @@ def _upsert_article(self, feed, upsert):
140147
read=False,
141148
fave=False,
142149
author=upsert.author,
143-
title=upsert.title,
150+
title=html_to_text(upsert.title),
144151
url=upsert.url,
145152
# Sometimes feeds lack dates on entries (e.g.
146153
# <http://antirez.com/rss>); in this case default to the
147154
# current date so that they get the date the feed was fetched.
148155
date=upsert.date or timezone.now(),
149156
guid=upsert.guid or None,
150157
raw_content=upsert.raw_content,
151-
content=upsert.raw_content,
158+
content=sanitize_html(upsert.raw_content),
152159
)
153160
created.save()
154161
log.debug(" created {created}", created=created)
155162
else:
156163
match.author = upsert.author
157-
match.title = upsert.title
164+
match.title = html_to_text(upsert.title)
158165
match.url = upsert.url
159166
if upsert.date:
160167
# The feed may not give a date. In that case leave the date
161168
# that was assigned when the entry was first discovered.
162169
match.date = upsert.date
163170
match.guid = upsert.guid
164171
match.raw_content = upsert.raw_content
165-
match.content = upsert.raw_content
172+
match.content = sanitize_html(upsert.raw_content)
166173
match.save()
167174
log.debug(" updated {updated}", updated=match)
168175

@@ -228,7 +235,7 @@ def poll(reactor, max_fetch=5):
228235
log.debug("Polled {feed} -> {outcome}", feed=feed, outcome=outcome)
229236
except Exception:
230237
log.failure("Failed to poll {feed}", feed=feed)
231-
outcomes.append((feed, PollError()))
238+
outcomes.append((feed.id, PollError()))
232239

233240
try:
234241
yield deferToThread(persist_outcomes, outcomes)
@@ -332,9 +339,18 @@ def persist_outcomes(outcomes):
332339
This function is called in a thread to update the database after a poll.
333340
334341
:param outcomes:
342+
:class:`list` of (feed_id, outcome) tuples, where each `outcome` is an
343+
object with a ``persist(feed)`` method.
335344
"""
336-
for feed, outcome in outcomes:
337-
outcome.persist(feed)
345+
for feed_id, outcome in outcomes:
346+
with transaction.atomic():
347+
try:
348+
feed = Feed.objects.get(id=feed_id)
349+
except Feed.DoesNotExist:
350+
# The feed was deleted while we were polling it. Discard
351+
# any update as it doesn't matter any more.
352+
continue
353+
outcome.persist(feed)
338354

339355

340356
def schedule(feed):

yarrharr/sanitize.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright © 2017 Tom Most <twm@freecog.net>
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU General Public License as published by
6+
# the Free Software Foundation, either version 3 of the License, or
7+
# (at your option) any later version.
8+
#
9+
# This program is distributed in the hope that it will be useful,
10+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
# GNU General Public License for more details.
13+
#
14+
# You should have received a copy of the GNU General Public License
15+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
#
17+
# Additional permission under GNU GPL version 3 section 7
18+
#
19+
# If you modify this Program, or any covered work, by linking or
20+
# combining it with OpenSSL (or a modified version of that library),
21+
# containing parts covered by the terms of the OpenSSL License, the
22+
# licensors of this Program grant you additional permission to convey
23+
# the resulting work. Corresponding Source for a non-source form of
24+
# such a combination shall include the source code for the parts of
25+
# OpenSSL used as well as that of the covered work.
26+
27+
import html5lib
28+
from html5lib.constants import namespaces
29+
from html5lib.filters.base import Filter as BaseFilter
30+
31+
STYLE_TAG = '{http://www.w3.org/1999/xhtml}style'
32+
SCRIPT_TAG = '{http://www.w3.org/1999/xhtml}script'
33+
OBJECT_TAG = '{http://www.w3.org/1999/xhtml}object'
34+
35+
36+
def html_to_text(html):
37+
"""
38+
Extract the text from the given HTML fragment.
39+
"""
40+
tree = html5lib.parseFragment(html)
41+
bits = []
42+
43+
def visit(el):
44+
if el.tag != STYLE_TAG and el.tag != SCRIPT_TAG:
45+
if el.text is not None:
46+
bits.append(el.text)
47+
for child in el:
48+
visit(child)
49+
if el.tail is not None:
50+
bits.append(el.tail)
51+
52+
visit(tree)
53+
return u''.join(bits)
54+
55+
56+
def sanitize_html(html):
57+
"""
58+
Make the given HTML string safe to display in a Yarrharr page.
59+
"""
60+
tree = html5lib.parseFragment(html)
61+
w = html5lib.getTreeWalker('etree')
62+
s = html5lib.serializer.HTMLSerializer(sanitize=True)
63+
return s.render(_ElideFilter(_ReplaceObjectFilter(w(tree))))
64+
65+
66+
class _ElideFilter(BaseFilter):
67+
"""
68+
``<script>`` and ``<style>`` tags are dropped entirely, including their
69+
content.
70+
"""
71+
_elide_tags = frozenset((
72+
(namespaces['html'], 'script'),
73+
(namespaces['html'], 'style'),
74+
))
75+
76+
def __iter__(self):
77+
elide = 0
78+
elide_ns = None
79+
elide_name = None
80+
for token in BaseFilter.__iter__(self):
81+
token_type = token['type']
82+
if elide:
83+
if token_type == 'EndTag' and token['name'] == elide_name and token['namespace'] == elide_ns:
84+
elide -= 1
85+
if token_type == 'StartTag' and token['name'] == elide_name and token['namespace'] == elide_ns:
86+
elide += 1
87+
continue # Drop the token
88+
else:
89+
if token_type == 'StartTag':
90+
if (token['namespace'], token['name']) in self._elide_tags:
91+
elide += 1
92+
elide_name = token['name']
93+
elide_ns = token['namespace']
94+
continue # Drop this token.
95+
yield token
96+
97+
98+
class _ReplaceObjectFilter(BaseFilter):
99+
"""
100+
``<object>`` tags are replaced with their content.
101+
"""
102+
def __iter__(self):
103+
html_ns = namespaces['html']
104+
nest = 0
105+
for token in BaseFilter.__iter__(self):
106+
token_type = token['type']
107+
# Drop <param> when inside <object>. We don't handle nesting
108+
# properly, but they're not valid anywhere else so that's not
109+
# a problem.
110+
if nest >= 1 and token_type == 'EmptyTag' and token['name'] == 'param' and token['namespace'] == html_ns:
111+
continue
112+
113+
if token_type == 'EndTag' and token['name'] == 'object' and token['namespace'] == html_ns:
114+
nest -= 1
115+
continue
116+
117+
if token_type == 'StartTag' and token['name'] == 'object' and token['namespace'] == html_ns:
118+
nest += 1
119+
continue
120+
121+
yield token

yarrharr/tests/test_fetch.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,7 @@ def test_persist_new_article(self):
387387
title=u'Blah Blah',
388388
url=u'https://example.com/blah-blah',
389389
raw_content=u'<p>Hello, world!</p>',
390-
content=u'<p>Hello, world!</p>',
390+
content=u'<p>Hello, world!',
391391
)
392392

393393
def test_persist_article_lacking_date(self):
@@ -466,24 +466,25 @@ def test_persist_article_guid_match(self):
466466
title=u'Blah Blah',
467467
url=u'https://example.com/blah-blah',
468468
raw_content=u'<p>Hello, world!</p>',
469-
content=u'<p>Hello, world!</p>',
469+
content=u'<p>Hello, world!',
470470
)
471471

472472
def test_persist_article_sanitize(self):
473473
"""
474474
The HTML associated with an article is sanitized when it is persisted.
475475
"""
476476
mu = MaybeUpdated(
477-
feed_title=u'Example',
477+
feed_title=u'<b>Example</b>',
478478
site_url=u'https://example.com/',
479479
articles=[
480480
ArticleUpsert(
481481
author=u'Joe Bloggs',
482-
title=u'Blah Blah',
482+
title=u'Blah <i>Blah</i>',
483483
url=u'https://example.com/blah-blah',
484484
date=timezone.now(),
485485
guid=u'49e3c525-724c-44d8-ad0c-d78bd216d003',
486-
raw_content=u'<p>Hello, world!',
486+
raw_content=u'<p>Hello, <style>...</style>world'
487+
u'<script type="text/javascript">alert("lololol")</script>!',
487488
),
488489
],
489490
etag=b'"etag"',
@@ -493,9 +494,12 @@ def test_persist_article_sanitize(self):
493494

494495
mu.persist(self.feed)
495496

497+
self.assertEqual(u'Example', self.feed.title)
496498
[article] = self.feed.articles.all()
497499
self.assertFields(
498500
article,
499-
raw_content=u'<p>Hello, world!',
500-
content=u'<p>Hello, world!</p>',
501+
title=u'Blah Blah',
502+
raw_content=u'<p>Hello, <style>...</style>world'
503+
u'<script type="text/javascript">alert("lololol")</script>!',
504+
content=u'<p>Hello, world!',
501505
)

0 commit comments

Comments
 (0)