From d93d77c8c7ef9deb022b5cdaff2abcf7a26aa6af Mon Sep 17 00:00:00 2001
From: Martin Zimmermann <info@posativ.org>
Date: Tue, 18 Feb 2014 16:52:26 +0100
Subject: [PATCH] refactor markup and sanitization code

This commit introduces a new configuration section [markup] to refine
Misaka's Markdown extensions (by default strikethrough, superscript and
autolink).

Furthermore, you can set custom HTML elements/attributes that are
allowed, e.g. to enable images, set

  [markup]
  allowed-elements = img
  allowed-attributes = src

The refactoring separates HTML sanitization from the Markdown -> HTML
conversion and makes it possible to add new markup languages such as
BBCode or reStructuredText.
---
 isso/__init__.py       |  6 ++-
 isso/compat.py         |  2 +
 isso/core.py           |  6 ++-
 isso/utils/html.py     | 86 +++++++++++++++++++++++-------------------
 isso/views/comments.py |  8 ++--
 5 files changed, 64 insertions(+), 44 deletions(-)
diff --git a/isso/__init__.py b/isso/__init__.py
index c77b07e..6d48af4 100644
--- a/isso/__init__.py
+++ b/isso/__init__.py
@@ -64,7 +64,7 @@ local_manager = LocalManager([local])
 
 from isso import db, migrate, wsgi, ext, views
 from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config
-from isso.utils import parse, http, JSONRequest, origin
+from isso.utils import parse, http, JSONRequest, origin, html
 from isso.views import comments
 
 from isso.ext.notifications import Stdout, SMTP
@@ -86,6 +86,7 @@ class Isso(object):
         self.conf = conf
         self.db = db.SQLite3(conf.get('general', 'dbpath'), conf)
         self.signer = URLSafeTimedSerializer(conf.get('general', 'session-key'))
+        self.markup = html.Markup(conf.section('markup'))
 
         super(Isso, self).__init__(conf)
 
@@ -102,6 +103,9 @@ class Isso(object):
         views.Info(self)
         comments.API(self)
 
+    def render(self, text):
+        return self.markup.render(text)  # self.markup is an html.Markup
+
     def sign(self, obj):
         return self.signer.dumps(obj)
 
diff --git a/isso/compat.py b/isso/compat.py
index 772d561..ac09826 100644
--- a/isso/compat.py
+++ b/isso/compat.py
@@ -6,6 +6,7 @@ PY2K = sys.version_info[0] == 2
 if not PY2K:
 
     map, zip, filter = map, zip, filter
+    from functools import reduce
 
     text_type = str
     string_types = (str, )
@@ -15,6 +16,7 @@ else:
 
     from itertools import imap, izip, ifilter
     map, zip, filter = imap, izip, ifilter
+    reduce = reduce
 
     text_type = unicode
     string_types = (str, unicode)
diff --git a/isso/core.py b/isso/core.py
index bb0c1eb..ce2abf2 100644
--- a/isso/core.py
+++ b/isso/core.py
@@ -132,7 +132,11 @@ class Config:
         "enabled = true",
         "ratelimit = 2",
         "direct-reply = 3",
-        "reply-to-self = false"
+        "reply-to-self = false",
+        "[markup]",
+        "options = strikethrough, superscript, autolink",
+        "allowed-elements = ",
+        "allowed-attributes = "
     ]
 
     @classmethod
diff --git a/isso/utils/html.py b/isso/utils/html.py
index 34a4dc7..5650bfd 100644
--- a/isso/utils/html.py
+++ b/isso/utils/html.py
@@ -1,9 +1,12 @@
 # -*- encoding: utf-8 -*-
 
 import pkg_resources
+import operator
+
+from isso.compat import reduce
 
 import html5lib
-setattr(html5lib, "version", pkg_resources.get_distribution("html5lib").version)
+html5lib_version = pkg_resources.get_distribution("html5lib").version
 
 from html5lib.sanitizer import HTMLSanitizer
 from html5lib.serializer import HTMLSerializer
@@ -12,59 +15,66 @@ from html5lib.treewalkers import getTreeWalker
 import misaka
 
 
-class MarkdownSanitizer(HTMLSanitizer):
+def Sanitizer(elements, attributes):  # returns an HTMLSanitizer subclass
 
-    # attributes found in Sundown's HTML serializer [1] except for <img> tag,
-    # because images are not generated anyways.
-    #
-    # [1] https://github.com/vmg/sundown/blob/master/html/html.c
-    allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
-                        "pre", "code", "blockquote",
-                        "del", "ins", "strong", "em",
-                        "h1", "h2", "h3", "h4", "h5", "h6",
-                        "table", "thead", "tbody", "th", "td"]
+    class Inner(HTMLSanitizer):
 
-    # href for <a> and align for <table>
-    allowed_attributes = ["align", "href"]
+        # elements found in Sundown's HTML serializer [1] except for <img>;
+        # additional tags (e.g. img) are appended from `elements`.
+        #
+        # [1] https://github.com/vmg/sundown/blob/master/html/html.c
+        allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
+                            "pre", "code", "blockquote",
+                            "del", "ins", "strong", "em",
+                            "h1", "h2", "h3", "h4", "h5", "h6",
+                            "table", "thead", "tbody", "th", "td"] + elements
 
-    # remove disallowed tokens from the output
-    def disallowed_token(self, token, token_type):
-        return None
+        # href for <a>, align for <table>, plus the given `attributes`
+        allowed_attributes = ["align", "href"] + attributes
+
+        # remove disallowed tokens from the output
+        def disallowed_token(self, token, token_type):
+            return None
+
+    return Inner
 
 
-def sanitize(document):
+def sanitize(tokenizer, document):  # tokenizer: class from Sanitizer()
 
-    parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer)
+    parser = html5lib.HTMLParser(tokenizer=tokenizer)
     domtree = parser.parseFragment(document)
 
-    builder = "simpletree" if html5lib.version == "0.95" else "etree"
+    builder = "simpletree" if html5lib_version == "0.95" else "etree"
     stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
     serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
 
     return serializer.render(stream)
 
 
-def markdown(text):
-    """Convert Markdown to (safe) HTML.
+def Markdown(extensions=("strikethrough", "superscript", "autolink")):
+    """Return a text -> HTML function using the named Misaka extensions."""
-    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
-    '<p><em>Ohai!</em></p>'
-    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
-    '<p><em>Hi</em></p>'
-    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
-    "<p>alert('Onoe')</p>"
-    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
-    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
-    """
+    flags = reduce(operator.or_, map(  # OR, so a repeated name stays set
+        lambda ext: getattr(misaka, 'EXT_' + ext.upper()), extensions), 0)
 
-    # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link
-    exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK
+    def inner(text):
+        rv = misaka.html(text, extensions=flags).rstrip("\n")
+        if not rv.startswith("<p>") and not rv.endswith("</p>"):
+            return "<p>" + rv + "</p>"
+        return rv
 
-    # remove HTML tags, skip <img> (for now) and only render "safe" protocols
-    html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
+    return inner
 
-    rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
-    if not rv.startswith("<p>") and not rv.endswith("</p>"):
-        rv = "<p>" + rv + "</p>"
 
-    return sanitize(rv)
+class Markup(object):
+    """Markdown -> sanitized HTML renderer built from a config section."""
+    def __init__(self, conf):
+        # conf: section object providing getlist() for the keys read below
+        parser = Markdown(conf.getlist("options"))
+        sanitizer = Sanitizer(
+            conf.getlist("allowed-elements"),
+            conf.getlist("allowed-attributes"))
+
+        self._render = lambda text: sanitize(sanitizer, parser(text))
+
+    def render(self, text):
+        return self._render(text)
diff --git a/isso/views/comments.py b/isso/views/comments.py
index 1262170..c231ead 100644
--- a/isso/views/comments.py
+++ b/isso/views/comments.py
@@ -163,7 +163,7 @@ class API(object):
             value=self.isso.sign([rv["id"], sha1(rv["text"])]),
             max_age=self.conf.getint('max-age'))
 
-        rv["text"] = html.markdown(rv["text"])
+        rv["text"] = self.isso.render(rv["text"])
         rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8")
 
         self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash'])
@@ -189,7 +189,7 @@ class API(object):
             rv.pop(key)
 
         if request.args.get('plain', '0') == '0':
-            rv['text'] = html.markdown(rv['text'])
+            rv['text'] = self.isso.render(rv['text'])
 
         return JSON(rv, 200)
 
@@ -230,7 +230,7 @@ class API(object):
                 value=self.isso.sign([rv["id"], sha1(rv["text"])]),
                 max_age=self.conf.getint('max-age'))
 
-        rv["text"] = html.markdown(rv["text"])
+        rv["text"] = self.isso.render(rv["text"])
 
         resp = JSON(rv, 200)
         resp.headers.add("Set-Cookie", cookie(str(rv["id"])))
@@ -336,7 +336,7 @@ class API(object):
 
         if request.args.get('plain', '0') == '0':
             for item in rv:
-                item['text'] = html.markdown(item['text'])
+                item['text'] = self.isso.render(item['text'])
 
         return JSON(rv, 200)