refactor markup and sanitization code

This commit introduces a new configuration section [markup] to configure
which of Misaka's Markdown extensions are enabled (by default
strikethrough, superscript and autolink).

Furthermore, you can allow additional HTML elements and attributes on top
of the built-in defaults. For example, to enable images, set:

  [markup]
  allowed-elements = img
  allowed-attributes = src

The refactoring separates HTML sanitization from the Markdown-to-HTML
conversion, which makes it possible to add other markup languages such as
BBCode or reStructuredText.
This commit is contained in:
Martin Zimmermann 2014-02-18 16:52:26 +01:00
parent 6071a85787
commit d93d77c8c7
5 changed files with 64 additions and 44 deletions

View File

@ -64,7 +64,7 @@ local_manager = LocalManager([local])
from isso import db, migrate, wsgi, ext, views from isso import db, migrate, wsgi, ext, views
from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config
from isso.utils import parse, http, JSONRequest, origin from isso.utils import parse, http, JSONRequest, origin, html
from isso.views import comments from isso.views import comments
from isso.ext.notifications import Stdout, SMTP from isso.ext.notifications import Stdout, SMTP
@ -86,6 +86,7 @@ class Isso(object):
self.conf = conf self.conf = conf
self.db = db.SQLite3(conf.get('general', 'dbpath'), conf) self.db = db.SQLite3(conf.get('general', 'dbpath'), conf)
self.signer = URLSafeTimedSerializer(conf.get('general', 'session-key')) self.signer = URLSafeTimedSerializer(conf.get('general', 'session-key'))
self.markup = html.Markup(conf.section('markup'))
super(Isso, self).__init__(conf) super(Isso, self).__init__(conf)
@ -102,6 +103,9 @@ class Isso(object):
views.Info(self) views.Info(self)
comments.API(self) comments.API(self)
def render(self, text):
    """Render *text* by delegating to the configured markup object.

    :param text: raw comment text to render.
    :return: whatever ``self.markup.render`` returns for *text*.
    """
    markup = self.markup
    return markup.render(text)
def sign(self, obj): def sign(self, obj):
return self.signer.dumps(obj) return self.signer.dumps(obj)

View File

@ -6,6 +6,7 @@ PY2K = sys.version_info[0] == 2
if not PY2K: if not PY2K:
map, zip, filter = map, zip, filter map, zip, filter = map, zip, filter
from functools import reduce
text_type = str text_type = str
string_types = (str, ) string_types = (str, )
@ -15,6 +16,7 @@ else:
from itertools import imap, izip, ifilter from itertools import imap, izip, ifilter
map, zip, filter = imap, izip, ifilter map, zip, filter = imap, izip, ifilter
reduce = reduce
text_type = unicode text_type = unicode
string_types = (str, unicode) string_types = (str, unicode)

View File

@ -132,7 +132,11 @@ class Config:
"enabled = true", "enabled = true",
"ratelimit = 2", "ratelimit = 2",
"direct-reply = 3", "direct-reply = 3",
"reply-to-self = false" "reply-to-self = false",
"[markup]",
"options = strikethrough, superscript, autolink",
"allowed-elements = ",
"allowed-attributes = "
] ]
@classmethod @classmethod

View File

@ -1,9 +1,12 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
import pkg_resources import pkg_resources
import operator
from isso.compat import reduce
import html5lib import html5lib
setattr(html5lib, "version", pkg_resources.get_distribution("html5lib").version) html5lib_version = pkg_resources.get_distribution("html5lib").version
from html5lib.sanitizer import HTMLSanitizer from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer import HTMLSerializer from html5lib.serializer import HTMLSerializer
@ -12,59 +15,66 @@ from html5lib.treewalkers import getTreeWalker
import misaka import misaka
class MarkdownSanitizer(HTMLSanitizer): def Sanitizer(elements, attributes):
# attributes found in Sundown's HTML serializer [1] except for <img> tag, class Inner(HTMLSanitizer):
# because images are not generated anyways.
#
# [1] https://github.com/vmg/sundown/blob/master/html/html.c
allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
"pre", "code", "blockquote",
"del", "ins", "strong", "em",
"h1", "h2", "h3", "h4", "h5", "h6",
"table", "thead", "tbody", "th", "td"]
# href for <a> and align for <table> # attributes found in Sundown's HTML serializer [1] except for <img> tag,
allowed_attributes = ["align", "href"] # because images are not generated anyways.
#
# [1] https://github.com/vmg/sundown/blob/master/html/html.c
allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
"pre", "code", "blockquote",
"del", "ins", "strong", "em",
"h1", "h2", "h3", "h4", "h5", "h6",
"table", "thead", "tbody", "th", "td"] + elements
# remove disallowed tokens from the output # href for <a> and align for <table>
def disallowed_token(self, token, token_type): allowed_attributes = ["align", "href"] + attributes
return None
# remove disallowed tokens from the output
def disallowed_token(self, token, token_type):
return None
return Inner
def sanitize(document): def sanitize(tokenizer, document):
parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer) parser = html5lib.HTMLParser(tokenizer=tokenizer)
domtree = parser.parseFragment(document) domtree = parser.parseFragment(document)
builder = "simpletree" if html5lib.version == "0.95" else "etree" builder = "simpletree" if html5lib_version == "0.95" else "etree"
stream = html5lib.treewalkers.getTreeWalker(builder)(domtree) stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False) serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
return serializer.render(stream) return serializer.render(stream)
def markdown(text): def Markdown(extensions=("strikethrough", "superscript", "autolink")):
"""Convert Markdown to (safe) HTML.
>>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE flags = reduce(operator.xor, map(
'<p><em>Ohai!</em></p>' lambda ext: getattr(misaka, 'EXT_' + ext.upper()), extensions), 0)
>>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
'<p><em>Hi</em></p>'
>>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
"<p>alert('Onoe')</p>"
>>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
'<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
"""
# ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link def inner(text):
exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK rv = misaka.html(text, extensions=flags).rstrip("\n")
if not rv.endswith("<p>") and not rv.endswith("</p>"):
return "<p>" + rv + "</p>"
return rv
# remove HTML tags, skip <img> (for now) and only render "safe" protocols return inner
html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
if not rv.startswith("<p>") and not rv.endswith("</p>"):
rv = "<p>" + rv + "</p>"
return sanitize(rv) class Markup(object):
def __init__(self, conf):
parser = Markdown(conf.getlist("options"))
sanitizer = Sanitizer(
conf.getlist("allowed-elements"),
conf.getlist("allowed-attributes"))
self._render = lambda text: sanitize(sanitizer, parser(text))
def render(self, text):
return self._render(text)

View File

@ -163,7 +163,7 @@ class API(object):
value=self.isso.sign([rv["id"], sha1(rv["text"])]), value=self.isso.sign([rv["id"], sha1(rv["text"])]),
max_age=self.conf.getint('max-age')) max_age=self.conf.getint('max-age'))
rv["text"] = html.markdown(rv["text"]) rv["text"] = self.isso.render(rv["text"])
rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8") rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8")
self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash']) self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash'])
@ -189,7 +189,7 @@ class API(object):
rv.pop(key) rv.pop(key)
if request.args.get('plain', '0') == '0': if request.args.get('plain', '0') == '0':
rv['text'] = html.markdown(rv['text']) rv['text'] = self.isso.render(rv['text'])
return JSON(rv, 200) return JSON(rv, 200)
@ -230,7 +230,7 @@ class API(object):
value=self.isso.sign([rv["id"], sha1(rv["text"])]), value=self.isso.sign([rv["id"], sha1(rv["text"])]),
max_age=self.conf.getint('max-age')) max_age=self.conf.getint('max-age'))
rv["text"] = html.markdown(rv["text"]) rv["text"] = self.isso.render(rv["text"])
resp = JSON(rv, 200) resp = JSON(rv, 200)
resp.headers.add("Set-Cookie", cookie(str(rv["id"]))) resp.headers.add("Set-Cookie", cookie(str(rv["id"])))
@ -336,7 +336,7 @@ class API(object):
if request.args.get('plain', '0') == '0': if request.args.get('plain', '0') == '0':
for item in rv: for item in rv:
item['text'] = html.markdown(item['text']) item['text'] = self.isso.render(item['text'])
return JSON(rv, 200) return JSON(rv, 200)