Use bleach to sanitize HTML; allows use with newer versions of html5lib.

This fixes #296
pull/314/head
Jelmer Vernooij 7 years ago committed by Jelmer Vernooij
parent 7bfe36bfdf
commit 2790604e35
No known key found for this signature in database
GPG Key ID: 1DF7EADF3B648883

@ -63,7 +63,6 @@ class TestHTML(unittest.TestCase):
print("Hello, World")
</code></pre>""")
@unittest.skipIf(html.HTML5LIB_VERSION <= html.HTML5LIB_SIMPLETREE, "backport")
def test_sanitizer(self):
sanitizer = html.Sanitizer(elements=[], attributes=[])
examples = [
@ -74,9 +73,8 @@ class TestHTML(unittest.TestCase):
('<script>alert("Onoe")</script>', 'alert("Onoe")')]
for (input, expected) in examples:
self.assertEqual(html.sanitize(sanitizer, input), expected)
self.assertEqual(sanitizer.sanitize(input), expected)
@unittest.skipIf(html.HTML5LIB_VERSION <= html.HTML5LIB_SIMPLETREE, "backport")
def test_sanitizer_extensions(self):
sanitizer = html.Sanitizer(elements=["img"], attributes=["src"])
examples = [
@ -84,7 +82,7 @@ class TestHTML(unittest.TestCase):
('<script src="doge.js"></script>', '')]
for (input, expected) in examples:
self.assertEqual(html.sanitize(sanitizer, input), expected)
self.assertEqual(sanitizer.sanitize(input), expected)
def test_render(self):
conf = config.new({

@ -7,56 +7,36 @@ import pkg_resources
from distutils.version import LooseVersion as Version
HTML5LIB_VERSION = Version(pkg_resources.get_distribution("html5lib").version)
HTML5LIB_SIMPLETREE = Version("0.95")
from isso.compat import reduce
import html5lib
from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer import HTMLSerializer
import bleach
import misaka
def Sanitizer(elements, attributes):
class Inner(HTMLSanitizer):
# elements found in Sundown's HTML serializer [1], except for the <img> tag,
# because images are not generated anyway.
#
# [1] https://github.com/vmg/sundown/blob/master/html/html.c
allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
"pre", "code", "blockquote",
"del", "ins", "strong", "em",
"h1", "h2", "h3", "h4", "h5", "h6",
"table", "thead", "tbody", "th", "td"] + elements
# href for <a> and align for <table>
allowed_attributes = ["align", "href"] + attributes
# remove disallowed tokens from the output
def disallowed_token(self, token, token_type):
return None
return Inner
# elements found in Sundown's HTML serializer [1], except for the <img> tag,
# because images are not generated anyway.
#
# [1] https://github.com/vmg/sundown/blob/master/html/html.c
ALLOWED_ELEMENTS = ["a", "p", "hr", "br", "ol", "ul", "li",
"pre", "code", "blockquote",
"del", "ins", "strong", "em",
"h1", "h2", "h3", "h4", "h5", "h6",
"table", "thead", "tbody", "th", "td"]
def sanitize(tokenizer, document):
# href for <a> and align for <table>
ALLOWED_ATTRIBUTES = ["align", "href"]
parser = html5lib.HTMLParser(tokenizer=tokenizer)
domtree = parser.parseFragment(document)
if HTML5LIB_VERSION > HTML5LIB_SIMPLETREE:
builder = "etree"
else:
builder = "simpletree"
class Sanitizer(object):
stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
def __init__(self, elements, attributes):
self.elements = ALLOWED_ELEMENTS + elements
self.attributes = ALLOWED_ATTRIBUTES + attributes
return serializer.render(stream)
def sanitize(self, text):
return bleach.clean(text, tags=self.elements,
attributes=self.attributes, strip=True)
def Markdown(extensions=("strikethrough", "superscript", "autolink")):
@ -96,7 +76,7 @@ class Markup(object):
conf.getlist("allowed-elements"),
conf.getlist("allowed-attributes"))
self._render = lambda text: sanitize(sanitizer, parser(text))
self._render = lambda text: sanitizer.sanitize(parser(text))
def render(self, text):
return self._render(text)

@ -5,7 +5,7 @@ import sys
from setuptools import setup, find_packages
requires = ['itsdangerous', 'misaka>=1.0,<2.0', 'html5lib==0.9999999']
requires = ['itsdangerous', 'misaka>=1.0,<2.0', 'html5lib', 'bleach']
if (3, 0) <= sys.version_info < (3, 3):
raise SystemExit("Python 3.0, 3.1 and 3.2 are not supported")

@ -23,7 +23,8 @@ deps =
[testenv:debian]
deps=
html5lib==0.95
bleach
html5lib
ipaddr==2.1.10
itsdangerous==0.22
misaka==1.0.2

Loading…
Cancel
Save