From 3713d5e8eea8006c2b912810af40d2045e1f0eeb Mon Sep 17 00:00:00 2001
From: Martin Zimmermann <info@posativ.org>
Date: Sun, 12 Jan 2014 12:52:33 +0100
Subject: [PATCH] allow raw HTML markup for a few (whitelisted) tags

To be compatible with comments from Disqus (and with users unfamiliar
with Markdown), Misaka no longer strips raw HTML from user input.
Instead, the generated HTML is post-processed: every tag that is not
whitelisted (i.e. a tag that Markdown itself cannot produce) is
discarded, while the text inside such a tag is kept.

Whitelist: p, a, pre, blockquote, h1-h6, em, sub, sup, del, ins, math,
           dl, ol, ul, li

This commit also strips the unnecessary trailing newline that
Misaka/Sundown appends to the generated HTML.
---
 isso/utils/__init__.py | 73 ++++++++++++++++++++++++++++++++++++++++--
 specs/test_comments.py | 10 +++---
 2 files changed, 75 insertions(+), 8 deletions(-)
diff --git a/isso/utils/__init__.py b/isso/utils/__init__.py
index a8327c3..5d4ec7f 100644
--- a/isso/utils/__init__.py
+++ b/isso/utils/__init__.py
@@ -5,9 +5,16 @@ from __future__ import division
 import pkg_resources
 werkzeug = pkg_resources.get_distribution("werkzeug")
 
+import io
 import json
 import hashlib
 
+try:
+    from html.parser import HTMLParser, HTMLParseError
+except ImportError:
+    from HTMLParser import HTMLParser, HTMLParseError
+
+from werkzeug.utils import escape
 from werkzeug.wrappers import Request, Response
 from werkzeug.exceptions import BadRequest
 
@@ -120,13 +127,69 @@ class JSONResponse(Response):
             json.dumps(obj).encode("utf-8"), *args, **kwargs)
 
 
+class Sanitizer(HTMLParser, object):
+    """Sanitize HTML output: drop the tags (e.g. iframe, script) that are not in
+    the `safe` whitelist; all text and the attributes of safe tags are kept."""
+
+    safe = set([  # tag names that may appear in the output
+        "p", "a", "pre", "blockquote",
+        "h1", "h2", "h3", "h4", "h5", "h6",
+        "em", "sub", "sup", "del", "ins", "math",
+        "dl", "ol", "ul", "li"])
+
+    @classmethod
+    def format(cls, attrs):  # render (name, value) pairs as an attribute string
+        res = []
+        for key, value in attrs:
+            if value is None:  # attribute given without a value
+                res.append(key)
+            else:  # NOTE(review): every attribute is emitted (value escaped), none is filtered
+                res.append(u'{0}="{1}"'.format(key, escape(value)))
+        return ' '.join(res)
+
+    def __init__(self, html):  # parses `html` right away; output is in self.result
+        super(Sanitizer, self).__init__()
+        self.result = io.StringIO()
+        self.feed(html)
+        self.result.seek(0)  # rewind so the sanitized HTML can be read()
+
+    def handle_starttag(self, tag, attrs):  # opening tag: emitted only if whitelisted
+        if tag in Sanitizer.safe:
+            self.result.write(u"<" + tag)
+            if attrs:
+                self.result.write(" " + Sanitizer.format(attrs))
+            self.result.write(u">")
+
+    def handle_data(self, data):  # text is always kept, even inside a dropped tag
+        self.result.write(data)
+
+    def handle_endtag(self, tag):  # closing tag: emitted only if whitelisted
+        if tag in Sanitizer.safe:
+            self.result.write(u"</" + tag + ">")
+
+    def handle_startendtag(self, tag, attrs):  # self-closing form (<tag ... />)
+        if tag in Sanitizer.safe:
+            self.result.write(u"<" + tag)
+            if attrs:
+                self.result.write(" " + Sanitizer.format(attrs))
+            self.result.write(u"/>")
+
+    def handle_entityref(self, name):  # re-emit named entity (&name;) unchanged
+        self.result.write(u'&' + name + ';')
+
+    def handle_charref(self, char):  # re-emit numeric reference (&#N;) unchanged
+        self.result.write(u'&#' + char + ';')
+
+
 def markdown(text):
     """Convert Markdown to (safe) HTML.
 
     >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
     '<p><em>Ohai!</em></p>'
+    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
+    '<p><em>Hi</em></p>'
     >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
-    '<p>alert(&#39;Onoe&#39;)</p>'
+    "<p>alert('Onoe')</p>"
     >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
     '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
     """
@@ -135,9 +198,13 @@ def markdown(text):
     exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK
 
     # remove HTML tags, skip <img> (for now) and only render "safe" protocols
-    html = misaka.HTML_SKIP_HTML | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
+    html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
+
+    rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
+    if not rv.startswith("<p>") and not rv.endswith("</p>"):
+        rv = "<p>" + rv + "</p>"
 
-    return misaka.html(text, extensions=exts, render_flags=html).strip("\n")
+    return Sanitizer(rv).result.read()
 
 
 def origin(hosts):
diff --git a/specs/test_comments.py b/specs/test_comments.py
index 10704d8..7e19911 100644
--- a/specs/test_comments.py
+++ b/specs/test_comments.py
@@ -54,7 +54,7 @@ class TestComments(unittest.TestCase):
         rv = loads(r.data)
 
         assert rv['id'] == 1
-        assert rv['text'] == '<p>Lorem ipsum ...</p>\n'
+        assert rv['text'] == '<p>Lorem ipsum ...</p>'
 
     def testCreate(self):
 
@@ -66,7 +66,7 @@ class TestComments(unittest.TestCase):
         rv = loads(rv.data)
 
         assert rv["mode"] == 1
-        assert rv["text"] == '<p>Lorem ipsum ...</p>\n'
+        assert rv["text"] == '<p>Lorem ipsum ...</p>'
 
     def textCreateWithNonAsciiText(self):
 
@@ -78,7 +78,7 @@ class TestComments(unittest.TestCase):
         rv = loads(rv.data)
 
         assert rv["mode"] == 1
-        assert rv["text"] == '<p>Здравствуй, мир!</p>\n'
+        assert rv["text"] == '<p>Здравствуй, мир!</p>'
 
     def testCreateMultiple(self):
 
@@ -262,10 +262,10 @@ class TestComments(unittest.TestCase):
         self.post('/new?uri=test', data=json.dumps({"text": "Tpyo"}))
 
         self.put('/id/1', data=json.dumps({"text": "Tyop"}))
-        assert loads(self.get('/id/1').data)["text"] == "<p>Tyop</p>\n"
+        assert loads(self.get('/id/1').data)["text"] == "<p>Tyop</p>"
 
         self.put('/id/1', data=json.dumps({"text": "Typo"}))
-        assert loads(self.get('/id/1').data)["text"] == "<p>Typo</p>\n"
+        assert loads(self.get('/id/1').data)["text"] == "<p>Typo</p>"
 
     def testDeleteCommentRemovesThread(self):