use html5lib's sanitizer, supersedes 3713d5e
Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-supplied HTML. Instead of reinventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unit tests. Discarding is only available with html5lib 0.99(9) and later; earlier releases just escape disallowed tags.
This commit is contained in:
parent
3713d5e8ee
commit
3a1f92b8bd
@ -14,7 +14,6 @@ try:
|
||||
except ImportError:
|
||||
from HTMLParser import HTMLParser, HTMLParseError
|
||||
|
||||
from werkzeug.utils import escape
|
||||
from werkzeug.wrappers import Request, Response
|
||||
from werkzeug.exceptions import BadRequest
|
||||
|
||||
@ -23,8 +22,6 @@ try:
|
||||
except ImportError:
|
||||
import ipaddr as ipaddress
|
||||
|
||||
import misaka
|
||||
|
||||
|
||||
def anonymize(remote_addr):
|
||||
"""
|
||||
@ -127,86 +124,6 @@ class JSONResponse(Response):
|
||||
json.dumps(obj).encode("utf-8"), *args, **kwargs)
|
||||
|
||||
|
||||
class Sanitizer(HTMLParser, object):
    """Whitelist-based HTML filter built on top of HTMLParser.

    The markup is fed to the parser on construction; everything that
    survives the filter is written to ``self.result`` (an ``io.StringIO``
    rewound to position 0, ready to be read). Tags not listed in ``safe``
    are dropped while their text content is kept.
    """

    # tags that are copied to the output, anything else is discarded
    safe = set([
        "h1", "h2", "h3", "h4", "h5", "h6",
        "p", "a", "pre", "blockquote",
        "dl", "ol", "ul", "li",
        "em", "sub", "sup", "del", "ins", "math"])

    @classmethod
    def format(cls, attrs):
        """Serialize (key, value) attribute pairs into a single string.

        Attributes without a value are emitted as the bare key, all other
        values are escaped and wrapped in double quotes.
        """
        return ' '.join([
            key if value is None
            else u'{0}="{1}"'.format(key, escape(value))
            for key, value in attrs])

    def __init__(self, html):
        super(Sanitizer, self).__init__()
        self.result = io.StringIO()
        self.feed(html)
        # rewind so callers can immediately read the sanitized markup
        self.result.seek(0)

    def _open_tag(self, tag, attrs, closing):
        """Write an opening tag terminated by `closing` if `tag` is safe."""
        if tag not in Sanitizer.safe:
            return

        out = self.result
        out.write(u"<" + tag)
        if attrs:
            out.write(" " + Sanitizer.format(attrs))
        out.write(closing)

    def handle_starttag(self, tag, attrs):
        self._open_tag(tag, attrs, u">")

    def handle_startendtag(self, tag, attrs):
        self._open_tag(tag, attrs, u"/>")

    def handle_endtag(self, tag):
        if tag not in Sanitizer.safe:
            return
        self.result.write(u"</" + tag + ">")

    def handle_data(self, data):
        # text nodes are passed through unchanged
        self.result.write(data)

    def handle_entityref(self, name):
        # re-emit named entities, e.g. &amp;
        self.result.write(u'&' + name + ';')

    def handle_charref(self, char):
        # re-emit numeric character references, e.g. &#62;
        self.result.write(u'&#' + char + ';')
|
||||
|
||||
|
||||
def markdown(text):
    """Convert Markdown to (safe) HTML.

    The text is rendered with misaka, wrapped into a paragraph if the
    rendered markup neither starts with ``<p>`` nor ends with ``</p>``,
    and finally filtered through :class:`Sanitizer`.

    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
    '<p><em>Ohai!</em></p>'
    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
    '<p><em>Hi</em></p>'
    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
    "<p>alert('Onoe')</p>"
    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
    """

    # ~~strike through~~, superscript: 2^(nd) and http://example.org/ auto-link
    extensions = (misaka.EXT_STRIKETHROUGH
                  | misaka.EXT_SUPERSCRIPT
                  | misaka.EXT_AUTOLINK)

    # skip <style> and <img> (for now) and only render "safe" protocols
    render_flags = (misaka.HTML_SKIP_STYLE
                    | misaka.HTML_SKIP_IMAGES
                    | misaka.HTML_SAFELINK)

    rendered = misaka.html(text, extensions=extensions, render_flags=render_flags)
    rendered = rendered.rstrip("\n")

    # only wrap when the markup has no paragraph at either end
    has_paragraph_edge = rendered.startswith("<p>") or rendered.endswith("</p>")
    if not has_paragraph_edge:
        rendered = "<p>" + rendered + "</p>"

    return Sanitizer(rendered).result.read()
|
||||
|
||||
|
||||
def origin(hosts):
|
||||
|
||||
hosts = [x.rstrip("/") for x in hosts]
|
||||
|
66
isso/utils/html.py
Normal file
66
isso/utils/html.py
Normal file
@ -0,0 +1,66 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
import html5lib
|
||||
|
||||
from html5lib.sanitizer import HTMLSanitizer
|
||||
from html5lib.serializer import HTMLSerializer
|
||||
from html5lib.treewalkers import getTreeWalker
|
||||
|
||||
import misaka
|
||||
|
||||
|
||||
class MarkdownSanitizer(HTMLSanitizer):
    """html5lib sanitizer restricted to the markup Sundown generates.

    Only the elements and attributes listed below are kept. Tokens that
    are not allowed are removed from the output instead of being escaped.
    NOTE(review): per the commit description, removal requires html5lib
    0.99(9) or later; earlier releases escape disallowed tags instead.
    """

    # elements found in Sundown's HTML serializer [1] except for the <img>
    # tag, because images are not generated anyways. <sup> is listed because
    # markdown() enables the superscript extension, and <tr> because table
    # rows would otherwise be stripped from the allowed table elements.
    #
    # [1] https://github.com/vmg/sundown/blob/master/html/html.c
    allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
                        "pre", "code", "blockquote",
                        "del", "ins", "strong", "em", "sup",
                        "h1", "h2", "h3", "h4", "h5", "h6",
                        "table", "thead", "tbody", "tr", "th", "td"]

    # href for <a> and align for table cells (<th>, <td>)
    allowed_attributes = ["align", "href"]

    def disallowed_token(self, token, token_type):
        """Drop a disallowed token by returning None instead of escaping it."""
        return None
|
||||
|
||||
|
||||
def sanitize(document):
    """Return `document` re-serialized after filtering it with MarkdownSanitizer.

    The HTML fragment is parsed by html5lib using :class:`MarkdownSanitizer`
    as tokenizer, the resulting tree is walked with the ``etree`` tree
    walker and rendered back to a string with quoted attribute values and
    without omitting optional tags.
    """

    fragment = html5lib.HTMLParser(
        tokenizer=MarkdownSanitizer).parseFragment(document)

    walk = html5lib.treewalkers.getTreeWalker('etree')
    renderer = HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False)

    return renderer.render(walk(fragment))
|
||||
|
||||
|
||||
def markdown(text):
    """Convert Markdown to (safe) HTML.

    The text is rendered with misaka, wrapped into a paragraph if the
    rendered markup neither starts with ``<p>`` nor ends with ``</p>``,
    and finally passed through :func:`sanitize`.

    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
    '<p><em>Ohai!</em></p>'
    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
    '<p><em>Hi</em></p>'
    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
    "<p>alert('Onoe')</p>"
    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
    """

    # ~~strike through~~, superscript: 2^(nd) and http://example.org/ auto-link
    extensions = (misaka.EXT_STRIKETHROUGH
                  | misaka.EXT_SUPERSCRIPT
                  | misaka.EXT_AUTOLINK)

    # skip <style> and <img> (for now) and only render "safe" protocols
    render_flags = (misaka.HTML_SKIP_STYLE
                    | misaka.HTML_SKIP_IMAGES
                    | misaka.HTML_SAFELINK)

    rendered = misaka.html(text, extensions=extensions, render_flags=render_flags)
    rendered = rendered.rstrip("\n")

    # only wrap when the markup has no paragraph at either end
    has_paragraph_edge = rendered.startswith("<p>") or rendered.endswith("</p>")
    if not has_paragraph_edge:
        rendered = "<p>" + rendered + "</p>"

    return sanitize(rendered)
|
@ -16,7 +16,7 @@ from werkzeug.exceptions import BadRequest, Forbidden, NotFound
|
||||
from isso.compat import text_type as str
|
||||
|
||||
from isso import utils, local
|
||||
from isso.utils import http, parse, markdown, JSONResponse as JSON
|
||||
from isso.utils import http, parse, html, JSONResponse as JSON
|
||||
from isso.utils.crypto import pbkdf2
|
||||
from isso.views import requires
|
||||
|
||||
@ -163,7 +163,7 @@ class API(object):
|
||||
value=self.isso.sign([rv["id"], sha1(rv["text"])]),
|
||||
max_age=self.conf.getint('max-age'))
|
||||
|
||||
rv["text"] = markdown(rv["text"])
|
||||
rv["text"] = html.markdown(rv["text"])
|
||||
rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8")
|
||||
|
||||
self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash'])
|
||||
@ -189,7 +189,7 @@ class API(object):
|
||||
rv.pop(key)
|
||||
|
||||
if request.args.get('plain', '0') == '0':
|
||||
rv['text'] = markdown(rv['text'])
|
||||
rv['text'] = html.markdown(rv['text'])
|
||||
|
||||
return JSON(rv, 200)
|
||||
|
||||
@ -230,7 +230,7 @@ class API(object):
|
||||
value=self.isso.sign([rv["id"], sha1(rv["text"])]),
|
||||
max_age=self.conf.getint('max-age'))
|
||||
|
||||
rv["text"] = markdown(rv["text"])
|
||||
rv["text"] = html.markdown(rv["text"])
|
||||
|
||||
resp = JSON(rv, 200)
|
||||
resp.headers.add("Set-Cookie", cookie(str(rv["id"])))
|
||||
@ -336,7 +336,7 @@ class API(object):
|
||||
|
||||
if request.args.get('plain', '0') == '0':
|
||||
for item in rv:
|
||||
item['text'] = markdown(item['text'])
|
||||
item['text'] = html.markdown(item['text'])
|
||||
|
||||
return JSON(rv, 200)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user