Preprocess and sanitize with html5lib

twm · twm · commit 1625edbe5369 · 2017-04-15T15:51:45.000-07:00
Still lots more to do, as html5lib's sanitization likes to escape tags instead of dropping them. I ended up fixing up the html5lib docs while working on this: html5lib/html5lib-python#332
diff --git a/setup.py b/setup.py
@@ -17,6 +17,7 @@
         # is not compatible.
         'feedparser == 5.2.1',
         'simplejson >= 2.1.0',  # for JSONEncoderForHTML
+        'html5lib == 0.999999999',
     ],
     tests_require=[
         'mock',
diff --git a/yarrharr/fetch.py b/yarrharr/fetch.py
@@ -35,6 +35,7 @@
 import hashlib
 
 import attr
+from django.db import transaction
 from django.utils import timezone
 import feedparser
 try:
@@ -49,11 +50,17 @@
 import pytz
 
 from .models import Feed
+from .sanitize import html_to_text, sanitize_html
 
 
 log = Logger()
 
 
+# Disable feedparser's HTML sanitization, as it drops important information
+# (like YouTube embeds). We do our own sanitization with html5lib.
+feedparser.SANITIZE_HTML = False
+
+
 @attr.s(slots=True, frozen=True)
 class BadStatus(object):
     """
@@ -110,7 +117,7 @@ class MaybeUpdated(object):
     def persist(self, feed):
         feed.last_checked = timezone.now()
         feed.error = u''
-        feed.feed_title = self.feed_title
+        feed.feed_title = html_to_text(self.feed_title)
         feed.site_url = self.site_url
         feed.etag = self.etag
         feed.last_modified = self.last_modified
@@ -140,29 +147,29 @@ def _upsert_article(self, feed, upsert):
                 read=False,
                 fave=False,
                 author=upsert.author,
-                title=upsert.title,
+                title=html_to_text(upsert.title),
                 url=upsert.url,
                 # Sometimes feeds lack dates on entries (e.g.
                 # <http://antirez.com/rss>); in this case default to the
                 # current date so that they get the date the feed was fetched.
                 date=upsert.date or timezone.now(),
                 guid=upsert.guid or None,
                 raw_content=upsert.raw_content,
-                content=upsert.raw_content,
+                content=sanitize_html(upsert.raw_content),
             )
             created.save()
             log.debug("  created {created}", created=created)
         else:
             match.author = upsert.author
-            match.title = upsert.title
+            match.title = html_to_text(upsert.title)
             match.url = upsert.url
             if upsert.date:
                 # The feed may not give a date. In that case leave the date
                 # that was assigned when the entry was first discovered.
                 match.date = upsert.date
             match.guid = upsert.guid
             match.raw_content = upsert.raw_content
-            match.content = upsert.raw_content
+            match.content = sanitize_html(upsert.raw_content)
             match.save()
             log.debug("  updated {updated}", updated=match)
 
@@ -228,7 +235,7 @@ def poll(reactor, max_fetch=5):
             log.debug("Polled {feed} -> {outcome}", feed=feed, outcome=outcome)
         except Exception:
             log.failure("Failed to poll {feed}", feed=feed)
-            outcomes.append((feed, PollError()))
+            outcomes.append((feed.id, PollError()))
 
     try:
         yield deferToThread(persist_outcomes, outcomes)
@@ -332,9 +339,18 @@ def persist_outcomes(outcomes):
     This function is called in a thread to update the database after a poll.
 
     :param outcomes:
+        :class:`list` of (feed_id, outcome) tuples, where each `outcome` is an
+        object with a ``persist(feed)`` method.
     """
-    for feed, outcome in outcomes:
-        outcome.persist(feed)
+    for feed_id, outcome in outcomes:
+        with transaction.atomic():
+            try:
+                feed = Feed.objects.get(id=feed_id)
+            except Feed.DoesNotExist:
+                # The feed was deleted while we were polling it. Discard
+                # any update as it doesn't matter any more.
+                continue
+            outcome.persist(feed)
 
 
 def schedule(feed):
diff --git a/yarrharr/sanitize.py b/yarrharr/sanitize.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+# Copyright © 2017 Tom Most <twm@freecog.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# Additional permission under GNU GPL version 3 section 7
+#
+# If you modify this Program, or any covered work, by linking or
+# combining it with OpenSSL (or a modified version of that library),
+# containing parts covered by the terms of the OpenSSL License, the
+# licensors of this Program grant you additional permission to convey
+# the resulting work.  Corresponding Source for a non-source form of
+# such a combination shall include the source code for the parts of
+# OpenSSL used as well as that of the covered work.
+
+import html5lib
+from html5lib.constants import namespaces
+from html5lib.filters.base import Filter as BaseFilter
+
+STYLE_TAG = '{http://www.w3.org/1999/xhtml}style'
+SCRIPT_TAG = '{http://www.w3.org/1999/xhtml}script'
+OBJECT_TAG = '{http://www.w3.org/1999/xhtml}object'
+
+
+def html_to_text(html):
+    """
+    Extract the text of an HTML fragment, skipping style, script and comments.
+    """
+    bits = []
+
+    def visit(el):
+        # Comment nodes have a callable tag; their .text is not page text.
+        if not callable(el.tag) and el.tag not in (STYLE_TAG, SCRIPT_TAG):
+            if el.text is not None:
+                bits.append(el.text)
+            for child in el:
+                visit(child)
+        if el.tail is not None:
+            bits.append(el.tail)
+
+    visit(html5lib.parseFragment(html))
+    return u''.join(bits)
+
+
+def sanitize_html(html):
+    """
+    Render the HTML fragment in a form safe to display in a Yarrharr page.
+    """
+    walk = html5lib.getTreeWalker('etree')
+    tokens = _ReplaceObjectFilter(walk(html5lib.parseFragment(html)))
+    serializer = html5lib.serializer.HTMLSerializer(sanitize=True)
+    return serializer.render(_ElideFilter(tokens))
+
+
+class _ElideFilter(BaseFilter):
+    """
+    Removes ``<script>`` and ``<style>`` elements from the token stream,
+    together with all of their content.
+    """
+    _elide_tags = frozenset((
+        (namespaces['html'], 'script'),
+        (namespaces['html'], 'style'),
+    ))
+
+    def __iter__(self):
+        depth = 0  # Number of elided elements currently open.
+        target = None  # (namespace, name) of the element being elided.
+        for token in BaseFilter.__iter__(self):
+            kind = token['type']
+            # Name and namespace are only read from start and end tag
+            # tokens; no other token type is matched against them.
+            if kind in ('StartTag', 'EndTag'):
+                key = (token['namespace'], token['name'])
+            else:
+                key = None
+            if depth == 0:
+                if kind == 'StartTag' and key in self._elide_tags:
+                    depth, target = 1, key
+                else:
+                    yield token
+            elif key == target:
+                # A nested tag of the same kind deepens the elision; its
+                # matching end tag unwinds it. Either way it is dropped.
+                depth += 1 if kind == 'StartTag' else -1
+
+
+class _ReplaceObjectFilter(BaseFilter):
+    """
+    Unwraps ``<object>`` elements: the tags themselves are removed from the
+    token stream while the tokens between them are passed through, except
+    for ``<param>`` tags inside them, which are dropped as well.
+    """
+    def __iter__(self):
+        html_ns = namespaces['html']
+        depth = 0  # Number of <object> elements currently open.
+        for token in BaseFilter.__iter__(self):
+            kind = token['type']
+            if kind in ('StartTag', 'EndTag'):
+                # Swallow the <object> tag itself, tracking the nesting.
+                if token['name'] == 'object' and token['namespace'] == html_ns:
+                    depth += 1 if kind == 'StartTag' else -1
+                    continue
+            elif kind == 'EmptyTag' and depth >= 1:
+                # Drop <param> anywhere inside an <object>. Nesting is not
+                # handled exactly, but <param> is not valid anywhere else
+                # so that is not a problem.
+                if token['name'] == 'param' and token['namespace'] == html_ns:
+                    continue
+
+            yield token
diff --git a/yarrharr/tests/test_fetch.py b/yarrharr/tests/test_fetch.py
@@ -387,7 +387,7 @@ def test_persist_new_article(self):
             title=u'Blah Blah',
             url=u'https://example.com/blah-blah',
             raw_content=u'<p>Hello, world!</p>',
-            content=u'<p>Hello, world!</p>',
+            content=u'<p>Hello, world!',
         )
 
     def test_persist_article_lacking_date(self):
@@ -466,24 +466,25 @@ def test_persist_article_guid_match(self):
             title=u'Blah Blah',
             url=u'https://example.com/blah-blah',
             raw_content=u'<p>Hello, world!</p>',
-            content=u'<p>Hello, world!</p>',
+            content=u'<p>Hello, world!',
         )
 
     def test_persist_article_sanitize(self):
         """
         The HTML associated with an article is sanitized when it is persisted.
         """
         mu = MaybeUpdated(
-            feed_title=u'Example',
+            feed_title=u'<b>Example</b>',
             site_url=u'https://example.com/',
             articles=[
                 ArticleUpsert(
                     author=u'Joe Bloggs',
-                    title=u'Blah Blah',
+                    title=u'Blah <i>Blah</i>',
                     url=u'https://example.com/blah-blah',
                     date=timezone.now(),
                     guid=u'49e3c525-724c-44d8-ad0c-d78bd216d003',
-                    raw_content=u'<p>Hello, world!',
+                    raw_content=u'<p>Hello, <style>...</style>world'
+                                u'<script type="text/javascript">alert("lololol")</script>!',
                 ),
             ],
             etag=b'"etag"',
@@ -493,9 +494,12 @@ def test_persist_article_sanitize(self):
 
         mu.persist(self.feed)
 
+        self.assertEqual(u'Example', self.feed.title)
         [article] = self.feed.articles.all()
         self.assertFields(
             article,
-            raw_content=u'<p>Hello, world!',
-            content=u'<p>Hello, world!</p>',
+            title=u'Blah Blah',
+            raw_content=u'<p>Hello, <style>...</style>world'
+                        u'<script type="text/javascript">alert("lololol")</script>!',
+            content=u'<p>Hello, world!',
         )
diff --git a/yarrharr/tests/test_sanitize.py b/yarrharr/tests/test_sanitize.py