Merge branch 'feature/configurable-markdown', closes #62

2014-02-18 17:36:09 +01:00 · 2014-02-18 17:36:09 +01:00 · 9272e7390f
commit 9272e7390f
parent 1b0a74e188 c6214e31d7
9 changed files with 187 additions and 44 deletions
--- a/docs/_static/css/site.scss
+++ b/docs/_static/css/site.scss
@ -350,6 +350,10 @@ main {
    margin-left: 1.2em;
  }

+  dl {
+    margin-bottom: 0.4em;
+  }
+
  .admonition {

    p + p {
--- a/docs/docs/configuration/server.rst
+++ b/docs/docs/configuration/server.rst
@ -221,6 +221,37 @@ reply-to-self

    Do not forget to configure the client.

+Markup
+------
+
+Customize markup and sanitized HTML. Currently, only Markdown (via Misaka) is
+supported, but new languages are relatively easy to add.
+
+.. code-block:: ini
+
+    [markup]
+    options = strikethrough, superscript, autolink
+    allowed-elements =
+    allowed-attributes =
+
+options
+    `Misaka-specific Markdown extensions <http://misaka.61924.nl/api/>`_. Any
+    flag starting with ``EXT_`` can be used; omit the prefix, comma-separated.
+
+allowed-elements
+    Additional HTML tags to allow in the generated output, comma-separated. By
+    default, only *a*, *blockquote*, *br*, *code*, *del*, *em*, *h1*, *h2*,
+    *h3*, *h4*, *h5*, *h6*, *hr*, *ins*, *li*, *ol*, *p*, *pre*, *strong*,
+    *table*, *tbody*, *td*, *th*, *thead* and *ul* are allowed.
+
+allowed-attributes
+    Additional HTML attributes (independent from elements) to allow in the
+    generated output, comma-separated. By default, only *align* and *href* are
+    allowed.
+
+To allow images in comments, you just need to add ``allowed-elements = img`` and
+``allowed-attributes = src``.
+

 Appendum
 --------
--- a/docs/isso.example.cfg
+++ b/docs/isso.example.cfg
@ -110,3 +110,22 @@ direct-reply = 3
 # comment. After the editing timeframe is gone, commenters can reply to their
 # own comments anyways.  Do not forget to configure the client.
 reply-to-self = false
+
+
+[markup]
+# Customize markup and sanitized HTML. Currently, only Markdown (via Misaka) is
+# supported, but new languages are relatively easy to add.
+
+# Misaka-specific Markdown extensions. Any flag starting with EXT_ can be used;
+# omit the prefix and separate multiple names with commas.
+options = strikethrough, superscript, autolink
+
+# Additional HTML tags to allow in the generated output, comma-separated. By
+# default, only a, blockquote, br, code, del, em, h1, h2, h3, h4, h5, h6, hr,
+# ins, li, ol, p, pre, strong, table, tbody, td, th, thead and ul are allowed.
+allowed-elements =
+
+# Additional HTML attributes (independent from elements) to allow in the
+# generated output, comma-separated. By default, only align and href are
+# allowed.
+allowed-attributes =
--- a/isso/__init__.py
+++ b/isso/__init__.py
@ -64,7 +64,7 @@ local_manager = LocalManager([local])

 from isso import db, migrate, wsgi, ext, views
 from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config
-from isso.utils import parse, http, JSONRequest, origin
+from isso.utils import parse, http, JSONRequest, origin, html
 from isso.views import comments

 from isso.ext.notifications import Stdout, SMTP
@ -86,6 +86,7 @@ class Isso(object):
        self.conf = conf
        self.db = db.SQLite3(conf.get('general', 'dbpath'), conf)
        self.signer = URLSafeTimedSerializer(conf.get('general', 'session-key'))
+        self.markup = html.Markup(conf.section('markup'))

        super(Isso, self).__init__(conf)

@ -102,6 +103,9 @@ class Isso(object):
        views.Info(self)
        comments.API(self)

+    def render(self, text):
+        return self.markup.render(text)
+
    def sign(self, obj):
        return self.signer.dumps(obj)

--- a/isso/compat.py
+++ b/isso/compat.py
@ -6,6 +6,7 @@ PY2K = sys.version_info[0] == 2
 if not PY2K:

    map, zip, filter = map, zip, filter
+    from functools import reduce

    text_type = str
    string_types = (str, )
@ -15,6 +16,7 @@ else:

    from itertools import imap, izip, ifilter
    map, zip, filter = imap, izip, ifilter
+    reduce = reduce

    text_type = unicode
    string_types = (str, unicode)
--- a/isso/core.py
+++ b/isso/core.py
@ -44,6 +44,9 @@ class Section:
    def getint(self, key):
        return self.conf.getint(self.section, key)

+    def getlist(self, key):
+        return self.conf.getlist(self.section, key)
+
    def getiter(self, key):
        return self.conf.getiter(self.section, key)

@ -62,6 +65,7 @@ class IssoParser(ConfigParser):
    ... [foo]
    ... bar = 1h
    ... baz = 12
+    ... spam = a, b, cdef
    ... bla =
    ...     spam
    ...     ham
@ -71,6 +75,8 @@ class IssoParser(ConfigParser):
    3600
    >>> parser.getint("foo", "baz")
    12
+    >>> parser.getlist("foo", "spam")  # doctest: +IGNORE_UNICODE
+    ['a', 'b', 'cdef']
    >>> list(parser.getiter("foo", "bla"))  # doctest: +IGNORE_UNICODE
    ['spam', 'ham']
    >>> list(parser.getiter("foo", "asd"))  # doctest: +IGNORE_UNICODE
@ -92,6 +98,9 @@ class IssoParser(ConfigParser):
            except AttributeError:
                return int(IssoParser._total_seconds(delta))

+    def getlist(self, section, key):  # comma-separated value -> list, blanks dropped
+        return [x for x in map(str.strip, self.get(section, key).split(',')) if x]
+
    def getiter(self, section, key):
        for item in map(str.strip, self.get(section, key).split('\n')):
            if item:
@ -123,7 +132,11 @@ class Config:
        "enabled = true",
        "ratelimit = 2",
        "direct-reply = 3",
-        "reply-to-self = false"
+        "reply-to-self = false",
+        "[markup]",
+        "options = strikethrough, superscript, autolink",
+        "allowed-elements = ",
+        "allowed-attributes = "
    ]

    @classmethod
--- a/isso/utils/html.py
+++ b/isso/utils/html.py
@ -1,9 +1,12 @@
 # -*- encoding: utf-8 -*-

 import pkg_resources
+import operator
+
+from isso.compat import reduce

 import html5lib
-setattr(html5lib, "version", pkg_resources.get_distribution("html5lib").version)
+html5lib_version = pkg_resources.get_distribution("html5lib").version

 from html5lib.sanitizer import HTMLSanitizer
 from html5lib.serializer import HTMLSerializer
@ -12,59 +15,66 @@ from html5lib.treewalkers import getTreeWalker
 import misaka


-class MarkdownSanitizer(HTMLSanitizer):
+def Sanitizer(elements, attributes):

-    # attributes found in Sundown's HTML serializer [1] except for <img> tag,
-    # because images are not generated anyways.
-    #
-    # [1] https://github.com/vmg/sundown/blob/master/html/html.c
-    allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
-                        "pre", "code", "blockquote",
-                        "del", "ins", "strong", "em",
-                        "h1", "h2", "h3", "h4", "h5", "h6",
-                        "table", "thead", "tbody", "th", "td"]
+    class Inner(HTMLSanitizer):

-    # href for <a> and align for <table>
-    allowed_attributes = ["align", "href"]
+        # attributes found in Sundown's HTML serializer [1] except for <img> tag,
+        # because images are not generated anyways.
+        #
+        # [1] https://github.com/vmg/sundown/blob/master/html/html.c
+        allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
+                            "pre", "code", "blockquote",
+                            "del", "ins", "strong", "em",
+                            "h1", "h2", "h3", "h4", "h5", "h6",
+                            "table", "thead", "tbody", "th", "td"] + elements

-    # remove disallowed tokens from the output
-    def disallowed_token(self, token, token_type):
-        return None
+        # href for <a> and align for <table>
+        allowed_attributes = ["align", "href"] + attributes
+
+        # remove disallowed tokens from the output
+        def disallowed_token(self, token, token_type):
+            return None
+
+    return Inner


-def sanitize(document):
+def sanitize(tokenizer, document):

-    parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer)
+    parser = html5lib.HTMLParser(tokenizer=tokenizer)
    domtree = parser.parseFragment(document)

-    builder = "simpletree" if html5lib.version == "0.95" else "etree"
+    builder = "simpletree" if html5lib_version == "0.95" else "etree"
    stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
    serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)

    return serializer.render(stream)


-def markdown(text):
-    """Convert Markdown to (safe) HTML.
+def Markdown(extensions=("strikethrough", "superscript", "autolink")):

-    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
-    '<p><em>Ohai!</em></p>'
-    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
-    '<p><em>Hi</em></p>'
-    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
-    "<p>alert('Onoe')</p>"
-    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
-    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
-    """
+    flags = reduce(operator.or_, (  # OR (not XOR) so duplicates cannot cancel
+        getattr(misaka, 'EXT_' + ext.upper()) for ext in extensions if ext), 0)

-    # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link
-    exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK
+    def inner(text):
+        rv = misaka.html(text, extensions=flags).rstrip("\n")
+        if not rv.startswith("<p>") and not rv.endswith("</p>"):  # wrap bare output
+            return "<p>" + rv + "</p>"
+        return rv

-    # remove HTML tags, skip <img> (for now) and only render "safe" protocols
-    html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
+    return inner

-    rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
-    if not rv.startswith("<p>") and not rv.endswith("</p>"):
-        rv = "<p>" + rv + "</p>"

-    return sanitize(rv)
+class Markup(object):
+
+    def __init__(self, conf):
+
+        parser = Markdown(conf.getlist("options"))
+        sanitizer = Sanitizer(
+            conf.getlist("allowed-elements"),
+            conf.getlist("allowed-attributes"))
+
+        self._render = lambda text: sanitize(sanitizer, parser(text))
+
+    def render(self, text):
+        return self._render(text)
--- a/isso/views/comments.py
+++ b/isso/views/comments.py
@ -163,7 +163,7 @@ class API(object):
            value=self.isso.sign([rv["id"], sha1(rv["text"])]),
            max_age=self.conf.getint('max-age'))

-        rv["text"] = html.markdown(rv["text"])
+        rv["text"] = self.isso.render(rv["text"])
        rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8")

        self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash'])
@ -189,7 +189,7 @@ class API(object):
            rv.pop(key)

        if request.args.get('plain', '0') == '0':
-            rv['text'] = html.markdown(rv['text'])
+            rv['text'] = self.isso.render(rv['text'])

        return JSON(rv, 200)

@ -230,7 +230,7 @@ class API(object):
                value=self.isso.sign([rv["id"], sha1(rv["text"])]),
                max_age=self.conf.getint('max-age'))

-        rv["text"] = html.markdown(rv["text"])
+        rv["text"] = self.isso.render(rv["text"])

        resp = JSON(rv, 200)
        resp.headers.add("Set-Cookie", cookie(str(rv["id"])))
@ -336,7 +336,7 @@ class API(object):

        if request.args.get('plain', '0') == '0':
            for item in rv:
-                item['text'] = html.markdown(item['text'])
+                item['text'] = self.isso.render(item['text'])

        return JSON(rv, 200)

--- a/specs/test_html.py
+++ b/specs/test_html.py
@ -0,0 +1,60 @@
+
+try:
+    import unittest2 as unittest
+except ImportError:
+    import unittest
+
+
+from isso.core import Config
+from isso.utils import html
+
+
+class TestHTML(unittest.TestCase):
+
+    def test_markdown(self):
+        convert = html.Markdown(extensions=())
+        examples = [
+            ("*Ohai!*", "<p><em>Ohai!</em></p>"),
+            ("<em>Hi</em>", "<p><em>Hi</em></p>"),
+            ("http://example.org/", '<p>http://example.org/</p>')]
+
+        for (input, expected) in examples:
+            self.assertEqual(convert(input), expected)
+
+    def test_markdown_extensions(self):
+        convert = html.Markdown(extensions=("strikethrough", "superscript"))
+        examples = [
+            ("~~strike~~ through", "<p><del>strike</del> through</p>"),
+            ("sup^(script)", "<p>sup<sup>script</sup></p>")]
+
+        for (input, expected) in examples:
+            self.assertEqual(convert(input), expected)
+
+    @unittest.skipIf(html.html5lib_version == "0.95", "backport")
+    def test_sanitizer(self):
+        sanitizer = html.Sanitizer(elements=[], attributes=[])
+        examples = [
+            ('Look: <img src="..." />', 'Look: '),
+            ('<a href="http://example.org/">Ha</a>', '<a href="http://example.org/">Ha</a>'),
+            ('<a href="sms:+1234567890">Ha</a>', '<a>Ha</a>'),
+            ('<p style="visibility: hidden;">Test</p>', '<p>Test</p>'),
+            ('<script>alert("Onoe")</script>', 'alert("Onoe")')]
+
+        for (input, expected) in examples:
+            self.assertEqual(html.sanitize(sanitizer, input), expected)
+
+    @unittest.skipIf(html.html5lib_version == "0.95", "backport")
+    def test_sanitizer_extensions(self):
+        sanitizer = html.Sanitizer(elements=["img"], attributes=["src"])
+        examples = [
+            ('<img src="cat.gif" />', '<img src="cat.gif">'),
+            ('<script src="doge.js"></script>', '')]
+
+        for (input, expected) in examples:
+            self.assertEqual(html.sanitize(sanitizer, input), expected)
+
+    def test_render(self):
+        conf = Config.load(None).section("markup")
+        renderer = html.Markup(conf).render
+        self.assertEqual(renderer("http://example.org/ and sms:+1234567890"),
+                         '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>')