diff --git a/isso/tests/test_html.py b/isso/tests/test_html.py index 2b3b15c..6eb037b 100644 --- a/isso/tests/test_html.py +++ b/isso/tests/test_html.py @@ -59,7 +59,6 @@ class TestHTML(unittest.TestCase): print("Hello, World") """) - @unittest.skipIf(html.HTML5LIB_VERSION <= html.HTML5LIB_SIMPLETREE, "backport") def test_sanitizer(self): sanitizer = html.Sanitizer(elements=[], attributes=[]) examples = [ @@ -73,11 +72,10 @@ class TestHTML(unittest.TestCase): for (input, expected) in examples: if isinstance(expected, list): - self.assertIn(html.sanitize(sanitizer, input), expected) + self.assertIn(sanitizer.sanitize(input), expected) else: - self.assertEqual(html.sanitize(sanitizer, input), expected) + self.assertEqual(sanitizer.sanitize(input), expected) - @unittest.skipIf(html.HTML5LIB_VERSION <= html.HTML5LIB_SIMPLETREE, "backport") def test_sanitizer_extensions(self): sanitizer = html.Sanitizer(elements=["img"], attributes=["src"]) examples = [ @@ -85,7 +83,7 @@ class TestHTML(unittest.TestCase): ('', '')] for (input, expected) in examples: - self.assertEqual(html.sanitize(sanitizer, input), expected) + self.assertEqual(sanitizer.sanitize(input), expected) def test_render(self): conf = config.new({ diff --git a/isso/utils/html.py b/isso/utils/html.py index 4acfcc0..1235b8a 100644 --- a/isso/utils/html.py +++ b/isso/utils/html.py @@ -6,61 +6,53 @@ import pkg_resources from distutils.version import LooseVersion as Version -HTML5LIB_VERSION = Version(pkg_resources.get_distribution("html5lib").version) -HTML5LIB_SIMPLETREE = Version("0.95") - -import html5lib -from html5lib.sanitizer import HTMLSanitizer -from html5lib.serializer import HTMLSerializer - +import bleach import misaka -def Sanitizer(elements, attributes): - - class Inner(HTMLSanitizer): +class Sanitizer(object): + def __init__(self, elements, attributes): # attributes found in Sundown's HTML serializer [1] # except for <img> tag, # because images are not generated anyways. 
# # [1] https://github.com/vmg/sundown/blob/master/html/html.c - allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", + self.elements = ["a", "p", "hr", "br", "ol", "ul", "li", "pre", "code", "blockquote", "del", "ins", "strong", "em", "h1", "h2", "h3", "h4", "h5", "h6", "table", "thead", "tbody", "th", "td"] + elements # href for <a> and align for <table> - allowed_attributes = ["align", "href"] + attributes - - # remove disallowed tokens from the output - def disallowed_token(self, token, token_type): - return None - - return Inner + self.attributes = ["align", "href"] + attributes -def sanitize(tokenizer, document): - parser = html5lib.HTMLParser(tokenizer=tokenizer) - domtree = parser.parseFragment(document) + def sanitize(self, text): + clean_html = bleach.clean(text, tags=self.elements, + attributes=self.attributes, strip=True) - if HTML5LIB_VERSION > HTML5LIB_SIMPLETREE: - builder = "etree" + def set_links(attrs, new=False): + href_key = (None, u'href') - for link in domtree.findall(".//{http://www.w3.org/1999/xhtml}a"): - if link.get('href', None): - link.set("rel", "nofollow noopener") + if href_key not in attrs: + return attrs + if attrs[href_key].startswith(u'mailto:'): + return attrs - else: - builder = "simpletree" + rel_key = (None, u'rel') + rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val] - stream = html5lib.treewalkers.getTreeWalker(builder)(domtree) - serializer = HTMLSerializer( - quote_attr_values=True, omit_optional_tags=False) + for value in [u'nofollow', u'noopener']: + if value not in [rel_val.lower() for rel_val in rel_values]: + rel_values.append(value) - return serializer.render(stream) + attrs[rel_key] = u' '.join(rel_values) + return attrs + + linker = bleach.linkifier.Linker(callbacks=[set_links]) + return linker.linkify(clean_html) def Markdown(extensions=("strikethrough", "superscript", "autolink", @@ -100,7 +92,7 @@ class Markup(object): conf.getlist("allowed-elements"), conf.getlist("allowed-attributes")) - 
self._render = lambda text: sanitize(sanitizer, parser(text)) + self._render = lambda text: sanitizer.sanitize(parser(text)) def render(self, text): return self._render(text) diff --git a/setup.py b/setup.py index 74b1c54..e44d3d5 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ import sys from setuptools import setup, find_packages -requires = ['itsdangerous', 'Jinja2', 'misaka>=2.0,<3.0', 'html5lib<0.9999999', - 'werkzeug>=0.9'] +requires = ['itsdangerous', 'Jinja2', 'misaka>=2.0,<3.0', 'html5lib', + 'werkzeug>=0.9', 'bleach'] if sys.version_info < (2, 7): raise SystemExit("Python 2 versions < 2.7 are not supported.") diff --git a/tox.ini b/tox.ini index 4cc7c2c..363336c 100755 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,8 @@ deps = [testenv:debian] deps= - html5lib==0.95 + bleach + html5lib ipaddr==2.1.10 itsdangerous==0.22 misaka==1.0.2