From d93d77c8c7ef9deb022b5cdaff2abcf7a26aa6af Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Tue, 18 Feb 2014 16:52:26 +0100 Subject: [PATCH] refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactoring separates HTML sanitization from the Markdown -> HTML conversion and makes it possible to add new markup languages such as BBCode or reStructuredText. --- isso/__init__.py | 6 ++- isso/compat.py | 2 + isso/core.py | 6 ++- isso/utils/html.py | 86 +++++++++++++++++++++++------------------- isso/views/comments.py | 8 ++-- 5 files changed, 64 insertions(+), 44 deletions(-) diff --git a/isso/__init__.py b/isso/__init__.py index c77b07e..6d48af4 100644 --- a/isso/__init__.py +++ b/isso/__init__.py @@ -64,7 +64,7 @@ local_manager = LocalManager([local]) from isso import db, migrate, wsgi, ext, views from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config -from isso.utils import parse, http, JSONRequest, origin +from isso.utils import parse, http, JSONRequest, origin, html from isso.views import comments from isso.ext.notifications import Stdout, SMTP @@ -86,6 +86,7 @@ class Isso(object): self.conf = conf self.db = db.SQLite3(conf.get('general', 'dbpath'), conf) self.signer = URLSafeTimedSerializer(conf.get('general', 'session-key')) + self.markup = html.Markup(conf.section('markup')) super(Isso, self).__init__(conf) @@ -102,6 +103,9 @@ class Isso(object): views.Info(self) comments.API(self) + def render(self, text): + return self.markup.render(text) + def sign(self, obj): return self.signer.dumps(obj) diff --git a/isso/compat.py b/isso/compat.py index 772d561..ac09826 100644 --- a/isso/compat.py +++ b/isso/compat.py @@ -6,6 +6,7 @@ PY2K = sys.version_info[0] == 2 if
not PY2K: map, zip, filter = map, zip, filter + from functools import reduce text_type = str string_types = (str, ) @@ -15,6 +16,7 @@ else: from itertools import imap, izip, ifilter map, zip, filter = imap, izip, ifilter + reduce = reduce text_type = unicode string_types = (str, unicode) diff --git a/isso/core.py b/isso/core.py index bb0c1eb..ce2abf2 100644 --- a/isso/core.py +++ b/isso/core.py @@ -132,7 +132,11 @@ class Config: "enabled = true", "ratelimit = 2", "direct-reply = 3", - "reply-to-self = false" + "reply-to-self = false", + "[markup]", + "options = strikethrough, superscript, autolink", + "allowed-elements = ", + "allowed-attributes = " ] @classmethod diff --git a/isso/utils/html.py b/isso/utils/html.py index 34a4dc7..5650bfd 100644 --- a/isso/utils/html.py +++ b/isso/utils/html.py @@ -1,9 +1,12 @@ # -*- encoding: utf-8 -*- import pkg_resources +import operator + +from isso.compat import reduce import html5lib -setattr(html5lib, "version", pkg_resources.get_distribution("html5lib").version) +html5lib_version = pkg_resources.get_distribution("html5lib").version from html5lib.sanitizer import HTMLSanitizer from html5lib.serializer import HTMLSerializer @@ -12,59 +15,66 @@ from html5lib.treewalkers import getTreeWalker import misaka -class MarkdownSanitizer(HTMLSanitizer): +def Sanitizer(elements, attributes): - # attributes found in Sundown's HTML serializer [1] except for <img> tag, - # because images are not generated anyways. - # - # [1] https://github.com/vmg/sundown/blob/master/html/html.c - allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", - "pre", "code", "blockquote", - "del", "ins", "strong", "em", - "h1", "h2", "h3", "h4", "h5", "h6", - "table", "thead", "tbody", "th", "td"] + class Inner(HTMLSanitizer): - # href for <a> and align for <table> - allowed_attributes = ["align", "href"] + # attributes found in Sundown's HTML serializer [1] except for <img> tag, + # because images are not generated anyways.
+ # + # [1] https://github.com/vmg/sundown/blob/master/html/html.c + allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", + "pre", "code", "blockquote", + "del", "ins", "strong", "em", + "h1", "h2", "h3", "h4", "h5", "h6", + "table", "thead", "tbody", "th", "td"] + elements - # remove disallowed tokens from the output - def disallowed_token(self, token, token_type): - return None + # href for <a> and align for <table>
+ allowed_attributes = ["align", "href"] + attributes + + # remove disallowed tokens from the output + def disallowed_token(self, token, token_type): + return None + + return Inner -def sanitize(document): +def sanitize(tokenizer, document): - parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer) + parser = html5lib.HTMLParser(tokenizer=tokenizer) domtree = parser.parseFragment(document) - builder = "simpletree" if html5lib.version == "0.95" else "etree" + builder = "simpletree" if html5lib_version == "0.95" else "etree" stream = html5lib.treewalkers.getTreeWalker(builder)(domtree) serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False) return serializer.render(stream) -def markdown(text): - """Convert Markdown to (safe) HTML. +def Markdown(extensions=("strikethrough", "superscript", "autolink")): - >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE - '

Ohai!

' - >>> markdown("Hi") # doctest: +IGNORE_UNICODE - '

Hi

' - >>> markdown("") # doctest: +IGNORE_UNICODE - "

alert('Onoe')

" - >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE - '

http://example.org/ and sms:+1234567890

' - """ + flags = reduce(operator.xor, map( + lambda ext: getattr(misaka, 'EXT_' + ext.upper()), extensions), 0) - # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link - exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK + def inner(text): + rv = misaka.html(text, extensions=flags).rstrip("\n") + if not rv.endswith("

") and not rv.endswith("

"): + return "

" + rv + "

" + return rv - # remove HTML tags, skip (for now) and only render "safe" protocols - html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK + return inner - rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n") - if not rv.startswith("

") and not rv.endswith("

"): - rv = "

" + rv + "

" - return sanitize(rv) +class Markup(object): + + def __init__(self, conf): + + parser = Markdown(conf.getlist("options")) + sanitizer = Sanitizer( + conf.getlist("allowed-elements"), + conf.getlist("allowed-attributes")) + + self._render = lambda text: sanitize(sanitizer, parser(text)) + + def render(self, text): + return self._render(text) diff --git a/isso/views/comments.py b/isso/views/comments.py index 1262170..c231ead 100644 --- a/isso/views/comments.py +++ b/isso/views/comments.py @@ -163,7 +163,7 @@ class API(object): value=self.isso.sign([rv["id"], sha1(rv["text"])]), max_age=self.conf.getint('max-age')) - rv["text"] = html.markdown(rv["text"]) + rv["text"] = self.isso.render(rv["text"]) rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8") self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash']) @@ -189,7 +189,7 @@ class API(object): rv.pop(key) if request.args.get('plain', '0') == '0': - rv['text'] = html.markdown(rv['text']) + rv['text'] = self.isso.render(rv['text']) return JSON(rv, 200) @@ -230,7 +230,7 @@ class API(object): value=self.isso.sign([rv["id"], sha1(rv["text"])]), max_age=self.conf.getint('max-age')) - rv["text"] = html.markdown(rv["text"]) + rv["text"] = self.isso.render(rv["text"]) resp = JSON(rv, 200) resp.headers.add("Set-Cookie", cookie(str(rv["id"]))) @@ -336,7 +336,7 @@ class API(object): if request.args.get('plain', '0') == '0': for item in rv: - item['text'] = html.markdown(item['text']) + item['text'] = self.isso.render(item['text']) return JSON(rv, 200)