use html5lib's sanitizer, supersedes 3713d5e

Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-supplied HTML. Instead of reinventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unit tests. This feature is only available in html5lib 0.99(9) and later; earlier releases just escape disallowed tags.
2014-01-13 18:14:29 +01:00 · 2014-01-13 18:14:29 +01:00 · 3a1f92b8bd
commit 3a1f92b8bd
parent 3713d5e8ee
3 changed files with 71 additions and 88 deletions
--- a/isso/utils/init.py
+++ b/isso/utils/init.py
@ -14,7 +14,6 @@ try:
 except ImportError:
    from HTMLParser import HTMLParser, HTMLParseError

-from werkzeug.utils import escape
 from werkzeug.wrappers import Request, Response
 from werkzeug.exceptions import BadRequest

@ -23,8 +22,6 @@ try:
 except ImportError:
    import ipaddr as ipaddress

-import misaka
-

 def anonymize(remote_addr):
    """
@ -127,86 +124,6 @@ class JSONResponse(Response):
            json.dumps(obj).encode("utf-8"), *args, **kwargs)


-class Sanitizer(HTMLParser, object):
-    """Sanitize HTML output: remove unsafe HTML tags such as iframe or
-    script based on a whitelist of allowed tags."""
-
-    safe = set([
-        "p", "a", "pre", "blockquote",
-        "h1", "h2", "h3", "h4", "h5", "h6",
-        "em", "sub", "sup", "del", "ins", "math",
-        "dl", "ol", "ul", "li"])
-
-    @classmethod
-    def format(cls, attrs):
-        res = []
-        for key, value in attrs:
-            if value is None:
-                res.append(key)
-            else:
-                res.append(u'{0}="{1}"'.format(key, escape(value)))
-        return ' '.join(res)
-
-    def __init__(self, html):
-        super(Sanitizer, self).__init__()
-        self.result = io.StringIO()
-        self.feed(html)
-        self.result.seek(0)
-
-    def handle_starttag(self, tag, attrs):
-        if tag in Sanitizer.safe:
-            self.result.write(u"<" + tag)
-            if attrs:
-                self.result.write(" " + Sanitizer.format(attrs))
-            self.result.write(u">")
-
-    def handle_data(self, data):
-        self.result.write(data)
-
-    def handle_endtag(self, tag):
-        if tag in Sanitizer.safe:
-            self.result.write(u"</" + tag + ">")
-
-    def handle_startendtag(self, tag, attrs):
-        if tag in Sanitizer.safe:
-            self.result.write(u"<" + tag)
-            if attrs:
-                self.result.write(" " + Sanitizer.format(attrs))
-            self.result.write(u"/>")
-
-    def handle_entityref(self, name):
-        self.result.write(u'&' + name + ';')
-
-    def handle_charref(self, char):
-        self.result.write(u'&#' + char + ';')
-
-
-def markdown(text):
-    """Convert Markdown to (safe) HTML.
-
-    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
-    '<p><em>Ohai!</em></p>'
-    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
-    '<p><em>Hi</em></p>'
-    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
-    "<p>alert('Onoe')</p>"
-    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
-    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
-    """
-
-    # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link
-    exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK
-
-    # remove HTML tags, skip <img> (for now) and only render "safe" protocols
-    html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
-
-    rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
-    if not rv.startswith("<p>") and not rv.endswith("</p>"):
-        rv = "<p>" + rv + "</p>"
-
-    return Sanitizer(rv).result.read()
-
-
 def origin(hosts):

    hosts = [x.rstrip("/") for x in hosts]
--- a/isso/utils/html.py
+++ b/isso/utils/html.py
@ -0,0 +1,66 @@
+# -*- encoding: utf-8 -*-
+
+import html5lib
+
+from html5lib.sanitizer import HTMLSanitizer
+from html5lib.serializer import HTMLSerializer
+from html5lib.treewalkers import getTreeWalker
+
+import misaka
+
+
+class MarkdownSanitizer(HTMLSanitizer):
+
+    # elements found in Sundown's HTML serializer [1] except for the <img> tag,
+    # because images are not generated anyway.
+    #
+    # [1] https://github.com/vmg/sundown/blob/master/html/html.c
+    allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
+                        "pre", "code", "blockquote",
+                        "del", "ins", "strong", "em",
+                        "h1", "h2", "h3", "h4", "h5", "h6",
+                        "table", "thead", "tbody", "th", "td"]
+
+    # href for <a> and align for <table>
+    allowed_attributes = ["align", "href"]
+
+    # remove disallowed tokens from the output
+    def disallowed_token(self, token, token_type):
+        return None
+
+
+def sanitize(document):
+
+    parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer)
+    domtree = parser.parseFragment(document)
+
+    stream = html5lib.treewalkers.getTreeWalker('etree')(domtree)
+    serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
+
+    return serializer.render(stream)
+
+
+def markdown(text):
+    """Convert Markdown to (safe) HTML.
+
+    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
+    '<p><em>Ohai!</em></p>'
+    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
+    '<p><em>Hi</em></p>'
+    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
+    "<p>alert('Onoe')</p>"
+    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
+    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
+    """
+
+    # ~~strike through~~, superscript: 2^(nd) and http://example.org/ auto-link
+    exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK
+
+    # remove HTML tags, skip <img> (for now) and only render "safe" protocols
+    html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
+
+    rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
+    if not rv.startswith("<p>") and not rv.endswith("</p>"):
+        rv = "<p>" + rv + "</p>"
+
+    return sanitize(rv)
--- a/isso/views/comments.py
+++ b/isso/views/comments.py
@ -16,7 +16,7 @@ from werkzeug.exceptions import BadRequest, Forbidden, NotFound
 from isso.compat import text_type as str

 from isso import utils, local
-from isso.utils import http, parse, markdown, JSONResponse as JSON
+from isso.utils import http, parse, html, JSONResponse as JSON
 from isso.utils.crypto import pbkdf2
 from isso.views import requires

@ -163,7 +163,7 @@ class API(object):
            value=self.isso.sign([rv["id"], sha1(rv["text"])]),
            max_age=self.conf.getint('max-age'))

-        rv["text"] = markdown(rv["text"])
+        rv["text"] = html.markdown(rv["text"])
        rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8")

        self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash'])
@ -189,7 +189,7 @@ class API(object):
            rv.pop(key)

        if request.args.get('plain', '0') == '0':
-            rv['text'] = markdown(rv['text'])
+            rv['text'] = html.markdown(rv['text'])

        return JSON(rv, 200)

@ -230,7 +230,7 @@ class API(object):
                value=self.isso.sign([rv["id"], sha1(rv["text"])]),
                max_age=self.conf.getint('max-age'))

-        rv["text"] = markdown(rv["text"])
+        rv["text"] = html.markdown(rv["text"])

        resp = JSON(rv, 200)
        resp.headers.add("Set-Cookie", cookie(str(rv["id"])))
@ -336,7 +336,7 @@ class API(object):

        if request.args.get('plain', '0') == '0':
            for item in rv:
-                item['text'] = markdown(item['text'])
+                item['text'] = html.markdown(item['text'])

        return JSON(rv, 200)