refactor markup and sanitization code

This commit introduces a new configuration section [markup] to configure
which of Misaka's Markdown extensions are enabled (by default:
strikethrough, superscript and autolink).

Furthermore, you can allow additional HTML elements and attributes
beyond the built-in whitelist. For example, to enable images, set:

  [markup]
  allowed-elements = img
  allowed-attributes = src

The refactoring separates HTML sanitization from the Markdown-to-HTML
conversion, which makes it possible to add other markup languages such
as BBCode or reStructuredText.
This commit is contained in:
Martin Zimmermann 2014-02-18 16:52:26 +01:00
parent 6071a85787
commit d93d77c8c7
5 changed files with 64 additions and 44 deletions

View File

@ -64,7 +64,7 @@ local_manager = LocalManager([local])
from isso import db, migrate, wsgi, ext, views
from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config
from isso.utils import parse, http, JSONRequest, origin
from isso.utils import parse, http, JSONRequest, origin, html
from isso.views import comments
from isso.ext.notifications import Stdout, SMTP
@ -86,6 +86,7 @@ class Isso(object):
self.conf = conf
self.db = db.SQLite3(conf.get('general', 'dbpath'), conf)
self.signer = URLSafeTimedSerializer(conf.get('general', 'session-key'))
self.markup = html.Markup(conf.section('markup'))
super(Isso, self).__init__(conf)
@ -102,6 +103,9 @@ class Isso(object):
views.Info(self)
comments.API(self)
def render(self, text):
    """Render ``text`` by delegating to the ``self.markup`` renderer."""
    renderer = self.markup
    return renderer.render(text)
def sign(self, obj):
    """Return the result of ``self.signer.dumps`` applied to ``obj``."""
    serializer = self.signer
    return serializer.dumps(obj)

View File

@ -6,6 +6,7 @@ PY2K = sys.version_info[0] == 2
if not PY2K:
map, zip, filter = map, zip, filter
from functools import reduce
text_type = str
string_types = (str, )
@ -15,6 +16,7 @@ else:
from itertools import imap, izip, ifilter
map, zip, filter = imap, izip, ifilter
reduce = reduce
text_type = unicode
string_types = (str, unicode)

View File

@ -132,7 +132,11 @@ class Config:
"enabled = true",
"ratelimit = 2",
"direct-reply = 3",
"reply-to-self = false"
"reply-to-self = false",
"[markup]",
"options = strikethrough, superscript, autolink",
"allowed-elements = ",
"allowed-attributes = "
]
@classmethod

View File

@ -1,9 +1,12 @@
# -*- encoding: utf-8 -*-
import pkg_resources
import operator
from isso.compat import reduce
import html5lib
setattr(html5lib, "version", pkg_resources.get_distribution("html5lib").version)
html5lib_version = pkg_resources.get_distribution("html5lib").version
from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer import HTMLSerializer
@ -12,59 +15,66 @@ from html5lib.treewalkers import getTreeWalker
import misaka
class MarkdownSanitizer(HTMLSanitizer):
def Sanitizer(elements, attributes):
# attributes found in Sundown's HTML serializer [1] except for <img> tag,
# because images are not generated anyways.
#
# [1] https://github.com/vmg/sundown/blob/master/html/html.c
allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
"pre", "code", "blockquote",
"del", "ins", "strong", "em",
"h1", "h2", "h3", "h4", "h5", "h6",
"table", "thead", "tbody", "th", "td"]
class Inner(HTMLSanitizer):
# href for <a> and align for <table>
allowed_attributes = ["align", "href"]
# attributes found in Sundown's HTML serializer [1] except for <img> tag,
# because images are not generated anyways.
#
# [1] https://github.com/vmg/sundown/blob/master/html/html.c
allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
"pre", "code", "blockquote",
"del", "ins", "strong", "em",
"h1", "h2", "h3", "h4", "h5", "h6",
"table", "thead", "tbody", "th", "td"] + elements
# remove disallowed tokens from the output
def disallowed_token(self, token, token_type):
return None
# href for <a> and align for <table>
allowed_attributes = ["align", "href"] + attributes
# remove disallowed tokens from the output
def disallowed_token(self, token, token_type):
return None
return Inner
def sanitize(document):
def sanitize(tokenizer, document):
parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer)
parser = html5lib.HTMLParser(tokenizer=tokenizer)
domtree = parser.parseFragment(document)
builder = "simpletree" if html5lib.version == "0.95" else "etree"
builder = "simpletree" if html5lib_version == "0.95" else "etree"
stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
return serializer.render(stream)
def markdown(text):
"""Convert Markdown to (safe) HTML.
def Markdown(extensions=("strikethrough", "superscript", "autolink")):
>>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
'<p><em>Ohai!</em></p>'
>>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
'<p><em>Hi</em></p>'
>>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
"<p>alert('Onoe')</p>"
>>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
'<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
"""
flags = reduce(operator.xor, map(
lambda ext: getattr(misaka, 'EXT_' + ext.upper()), extensions), 0)
# ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link
exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK
def inner(text):
    """Convert Markdown ``text`` to HTML using the enclosing ``flags``.

    The trailing newlines emitted by ``misaka.html`` are stripped.  If the
    result neither opens with ``<p>`` nor closes with ``</p>`` it is
    wrapped in a paragraph; otherwise it is returned as is.
    """
    rv = misaka.html(text, extensions=flags).rstrip("\n")
    # The opening tag has to be tested with ``startswith``: checking
    # ``endswith("<p>")`` is practically never true and caused output such
    # as "<p>..</p>\n<ul>..</ul>" to be wrapped in a second paragraph.
    if not rv.startswith("<p>") and not rv.endswith("</p>"):
        return "<p>" + rv + "</p>"
    return rv
# remove HTML tags, skip <img> (for now) and only render "safe" protocols
html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
return inner
rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
if not rv.startswith("<p>") and not rv.endswith("</p>"):
rv = "<p>" + rv + "</p>"
return sanitize(rv)
class Markup(object):
    """Markup pipeline assembled from a configuration section.

    ``conf`` must provide ``getlist``; the keys ``options``,
    ``allowed-elements`` and ``allowed-attributes`` are read from it.
    """

    def __init__(self, conf):
        # Markdown -> HTML converter built from the configured options.
        to_html = Markdown(conf.getlist("options"))
        # Sanitizer class extended by the configured elements/attributes.
        tokenizer = Sanitizer(
            conf.getlist("allowed-elements"),
            conf.getlist("allowed-attributes"))

        def _render(text):
            # Convert first, then sanitize the generated HTML.
            return sanitize(tokenizer, to_html(text))

        self._render = _render

    def render(self, text):
        """Return ``text`` converted and sanitized by the pipeline."""
        return self._render(text)

View File

@ -163,7 +163,7 @@ class API(object):
value=self.isso.sign([rv["id"], sha1(rv["text"])]),
max_age=self.conf.getint('max-age'))
rv["text"] = html.markdown(rv["text"])
rv["text"] = self.isso.render(rv["text"])
rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8")
self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash'])
@ -189,7 +189,7 @@ class API(object):
rv.pop(key)
if request.args.get('plain', '0') == '0':
rv['text'] = html.markdown(rv['text'])
rv['text'] = self.isso.render(rv['text'])
return JSON(rv, 200)
@ -230,7 +230,7 @@ class API(object):
value=self.isso.sign([rv["id"], sha1(rv["text"])]),
max_age=self.conf.getint('max-age'))
rv["text"] = html.markdown(rv["text"])
rv["text"] = self.isso.render(rv["text"])
resp = JSON(rv, 200)
resp.headers.add("Set-Cookie", cookie(str(rv["id"])))
@ -336,7 +336,7 @@ class API(object):
if request.args.get('plain', '0') == '0':
for item in rv:
item['text'] = html.markdown(item['text'])
item['text'] = self.isso.render(item['text'])
return JSON(rv, 200)