use html5lib's sanitizer, supersedes 3713d5e

Python's HTMLParser is smart enough to filter malicious tags but fails
to repair invalid, user-inputted HTML. Instead of re-inventing the
wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all
tags generated by Sundown.

Disallowed tags are discarded from the output to match the previous
unit tests. Discarding is only available with html5lib 0.99(9) and
later; earlier releases merely escape disallowed tags.
This commit is contained in:
Martin Zimmermann 2014-01-13 18:14:29 +01:00
parent 3713d5e8ee
commit 3a1f92b8bd
3 changed files with 71 additions and 88 deletions

View File

@ -14,7 +14,6 @@ try:
except ImportError:
from HTMLParser import HTMLParser, HTMLParseError
from werkzeug.utils import escape
from werkzeug.wrappers import Request, Response
from werkzeug.exceptions import BadRequest
@ -23,8 +22,6 @@ try:
except ImportError:
import ipaddr as ipaddress
import misaka
def anonymize(remote_addr):
"""
@ -127,86 +124,6 @@ class JSONResponse(Response):
json.dumps(obj).encode("utf-8"), *args, **kwargs)
class Sanitizer(HTMLParser, object):
    """Sanitize HTML output: remove unsafe HTML tags such as iframe or
    script based on a whitelist of allowed tags.

    The markup is parsed on construction and the filtered output is
    exposed as the file-like attribute ``result``, rewound to the start::

        Sanitizer(html).result.read()

    Tags that are not in :attr:`safe` are dropped, their text content is
    kept.  Text content is escaped on output, so raw text (for example the
    body of a dropped ``<script>`` element) can never re-introduce markup.
    """

    # tags that are copied to the output; everything else is discarded
    safe = set([
        "p", "a", "pre", "blockquote",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "em", "sub", "sup", "del", "ins", "math",
        "dl", "ol", "ul", "li"])

    @classmethod
    def format(cls, attrs):
        """Serialize `attrs`, a list of ``(key, value)`` pairs, into an
        attribute string.  A value of ``None`` yields a bare key, any
        other value is passed through ``escape`` and double-quoted."""
        res = []
        for key, value in attrs:
            if value is None:
                res.append(key)
            else:
                res.append(u'{0}="{1}"'.format(key, escape(value)))
        return ' '.join(res)

    def __init__(self, html):
        super(Sanitizer, self).__init__()
        # Python 3.4+ can decode character references into the text handed
        # to handle_data.  Keep them routed through handle_entityref and
        # handle_charref instead, so they are written back verbatim.  On
        # older parsers this attribute is simply unused.
        self.convert_charrefs = False
        self.result = io.StringIO()
        self.feed(html)
        self.result.seek(0)

    def _write_tag(self, tag, attrs, close):
        """Write an opening tag terminated by `close` if `tag` is safe."""
        if tag not in Sanitizer.safe:
            return
        self.result.write(u"<" + tag)
        if attrs:
            self.result.write(u" " + Sanitizer.format(attrs))
        self.result.write(close)

    def handle_starttag(self, tag, attrs):
        self._write_tag(tag, attrs, u">")

    def handle_data(self, data):
        # Text must never be emitted raw: the body of a dropped <script> or
        # <style> element arrives here unparsed and may contain tags.
        # Quotes are left alone, they are harmless in text content.
        for char, entity in ((u"&", u"&amp;"), (u"<", u"&lt;"), (u">", u"&gt;")):
            data = data.replace(char, entity)
        self.result.write(data)

    def handle_endtag(self, tag):
        if tag in Sanitizer.safe:
            self.result.write(u"</" + tag + u">")

    def handle_startendtag(self, tag, attrs):
        self._write_tag(tag, attrs, u"/>")

    def handle_entityref(self, name):
        # already an escaped reference, pass it through unchanged
        self.result.write(u'&' + name + u';')

    def handle_charref(self, char):
        # already an escaped reference, pass it through unchanged
        self.result.write(u'&#' + char + u';')
def markdown(text):
    """Render Markdown `text` to HTML and strip every tag that is not
    whitelisted by :class:`Sanitizer`.

    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
    '<p><em>Ohai!</em></p>'
    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
    '<p><em>Hi</em></p>'
    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
    "<p>alert('Onoe')</p>"
    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
    """
    # enabled syntax: ~~strike through~~, superscript (2^(nd)) and
    # auto-linking of bare URLs such as http://example.org/
    extensions = (misaka.EXT_STRIKETHROUGH
                  | misaka.EXT_SUPERSCRIPT
                  | misaka.EXT_AUTOLINK)

    # renderer flags: HTML_SKIP_STYLE, HTML_SKIP_IMAGES (no <img> for now)
    # and HTML_SAFELINK (only render "safe" protocols)
    render_flags = (misaka.HTML_SKIP_STYLE
                    | misaka.HTML_SKIP_IMAGES
                    | misaka.HTML_SAFELINK)

    rendered = misaka.html(text, extensions=extensions, render_flags=render_flags)
    rendered = rendered.rstrip("\n")

    # wrap in a paragraph unless the output already opens with <p> or
    # closes with </p>
    if not (rendered.startswith("<p>") or rendered.endswith("</p>")):
        rendered = "<p>" + rendered + "</p>"

    return Sanitizer(rendered).result.read()
def origin(hosts):
hosts = [x.rstrip("/") for x in hosts]

66
isso/utils/html.py Normal file
View File

@ -0,0 +1,66 @@
# -*- encoding: utf-8 -*-
import html5lib
from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer import HTMLSerializer
from html5lib.treewalkers import getTreeWalker
import misaka
class MarkdownSanitizer(HTMLSanitizer):
    """html5lib sanitizer restricted to the markup the Markdown renderer
    is expected to produce.

    Only the elements and attributes whitelisted below are kept; tokens
    for any other element are removed from the output entirely.
    """

    # Elements found in Sundown's HTML serializer [1] except for the <img>
    # tag, because images are not generated anyways.  <sup> is required by
    # the superscript extension enabled in markdown(), <tr> completes the
    # set of table elements.
    #
    # [1] https://github.com/vmg/sundown/blob/master/html/html.c
    allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
                        "pre", "code", "blockquote",
                        "del", "ins", "strong", "em", "sup",
                        "h1", "h2", "h3", "h4", "h5", "h6",
                        "table", "thead", "tbody", "tr", "th", "td"]

    # href for <a> and align inside tables
    allowed_attributes = ["align", "href"]

    def disallowed_token(self, token, token_type):
        """Remove disallowed tokens from the output.

        Returning ``None`` discards the token instead of emitting an
        escaped copy of it.
        """
        return None
def sanitize(document):
    """Parse the HTML fragment `document` with :class:`MarkdownSanitizer`
    as tokenizer and return it re-serialized as a string.

    Attribute values are always quoted and optional tags are never
    omitted in the serialized output.
    """
    fragment = html5lib.HTMLParser(
        tokenizer=MarkdownSanitizer).parseFragment(document)

    # walk the parsed etree fragment and feed the token stream to the
    # serializer
    walk = getTreeWalker('etree')
    return HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False).render(walk(fragment))
def markdown(text):
    """Render Markdown `text` to HTML and pass the result through
    :func:`sanitize`, which keeps only whitelisted markup.

    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
    '<p><em>Ohai!</em></p>'
    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
    '<p><em>Hi</em></p>'
    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
    "<p>alert('Onoe')</p>"
    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
    """
    # enabled syntax: ~~strike through~~, superscript (2^(nd)) and
    # auto-linking of bare URLs such as http://example.org/
    extensions = (misaka.EXT_STRIKETHROUGH
                  | misaka.EXT_SUPERSCRIPT
                  | misaka.EXT_AUTOLINK)

    # renderer flags: HTML_SKIP_STYLE, HTML_SKIP_IMAGES (no <img> for now)
    # and HTML_SAFELINK (only render "safe" protocols)
    render_flags = (misaka.HTML_SKIP_STYLE
                    | misaka.HTML_SKIP_IMAGES
                    | misaka.HTML_SAFELINK)

    rendered = misaka.html(text, extensions=extensions, render_flags=render_flags)
    rendered = rendered.rstrip("\n")

    # wrap in a paragraph unless the output already opens with <p> or
    # closes with </p>
    if not (rendered.startswith("<p>") or rendered.endswith("</p>")):
        rendered = "<p>" + rendered + "</p>"

    return sanitize(rendered)

View File

@ -16,7 +16,7 @@ from werkzeug.exceptions import BadRequest, Forbidden, NotFound
from isso.compat import text_type as str
from isso import utils, local
from isso.utils import http, parse, markdown, JSONResponse as JSON
from isso.utils import http, parse, html, JSONResponse as JSON
from isso.utils.crypto import pbkdf2
from isso.views import requires
@ -163,7 +163,7 @@ class API(object):
value=self.isso.sign([rv["id"], sha1(rv["text"])]),
max_age=self.conf.getint('max-age'))
rv["text"] = markdown(rv["text"])
rv["text"] = html.markdown(rv["text"])
rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8")
self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash'])
@ -189,7 +189,7 @@ class API(object):
rv.pop(key)
if request.args.get('plain', '0') == '0':
rv['text'] = markdown(rv['text'])
rv['text'] = html.markdown(rv['text'])
return JSON(rv, 200)
@ -230,7 +230,7 @@ class API(object):
value=self.isso.sign([rv["id"], sha1(rv["text"])]),
max_age=self.conf.getint('max-age'))
rv["text"] = markdown(rv["text"])
rv["text"] = html.markdown(rv["text"])
resp = JSON(rv, 200)
resp.headers.add("Set-Cookie", cookie(str(rv["id"])))
@ -336,7 +336,7 @@ class API(object):
if request.args.get('plain', '0') == '0':
for item in rv:
item['text'] = markdown(item['text'])
item['text'] = html.markdown(item['text'])
return JSON(rv, 200)