import io

try:
    from html import escape  # Python 3.2+
except ImportError:  # Python 2
    from cgi import escape

try:
    # HTMLParseError is deliberately not imported: it was removed from
    # html.parser in Python 3.5 and nothing in this block needs it.
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from HTMLParser import HTMLParser


class Sanitizer(HTMLParser, object):
    """Sanitize HTML output with a whitelist of tags and attributes.

    The markup passed to the constructor is parsed immediately.  Tags that
    are not listed in :attr:`safe` (``iframe``, ``script``, ...) are dropped
    while the text they enclose is kept; attributes that are not listed in
    :attr:`attributes` (``onclick``, ``style``, ...) are dropped; ``href``
    values with a scheme outside :attr:`schemes` (``javascript:``, ...) are
    dropped.  Text is always written back HTML-escaped, so escaped markup in
    the input can never turn into live markup in the output.

    The sanitized document is available as the readable, rewound
    ``io.StringIO`` object :attr:`result`::

        Sanitizer(u"<p>Hi<script>x</script></p>").result.read()

    :param html: (unicode) HTML text to sanitize.
    """

    # Tags that are written back to the output.
    safe = set([
        "p", "a", "pre", "blockquote",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "em", "sub", "sup", "del", "ins", "math",
        "dl", "ol", "ul", "li",
        # emitted for **bold**, `code`, hard line breaks and rules
        "strong", "code", "br", "hr"])

    # Attributes that are written back (HTMLParser lower-cases the names).
    attributes = set(["href", "title"])

    # URL schemes accepted in ``href``; scheme-less (relative) URLs are
    # accepted as well.
    schemes = set(["http", "https", "ftp", "mailto"])

    @classmethod
    def format(cls, attrs):
        """Render ``(name, value)`` pairs as an HTML attribute string.

        Values are escaped including the double quote, so a value can never
        terminate its own attribute.  A value of ``None`` renders the bare
        attribute name.  No filtering happens here.
        """
        res = []
        for key, value in attrs:
            if value is None:
                res.append(key)
            else:
                res.append(u'{0}="{1}"'.format(key, escape(value, True)))
        return u' '.join(res)

    @classmethod
    def _is_safe_url(cls, url):
        """Return True if `url` is relative or uses a whitelisted scheme."""
        # Browsers ignore whitespace and control characters inside a scheme
        # ("java\tscript:"), so strip them before looking at it.
        cleaned = u"".join(ch for ch in url if ord(ch) > 0x20)
        for pos, ch in enumerate(cleaned):
            if ch in u"/?#":
                # path, query or fragment starts before any colon: relative
                return True
            if ch == u":":
                return cleaned[:pos].lower() in cls.schemes
        return True

    @classmethod
    def _clean(cls, attrs):
        """Return only the whitelisted attributes that carry safe values."""
        kept = []
        for key, value in attrs:
            if key not in cls.attributes:
                continue
            if key == "href" and value is not None \
                    and not cls._is_safe_url(value):
                continue
            kept.append((key, value))
        return kept

    def __init__(self, html):
        try:
            # Python 3.4+: keep character references as separate events
            # instead of decoding them into the text data.
            super(Sanitizer, self).__init__(convert_charrefs=False)
        except TypeError:
            # Older parsers have no such option and never decode them.
            super(Sanitizer, self).__init__()
        self.result = io.StringIO()
        self.feed(html)
        # flush whatever the parser still buffers (e.g. a trailing "&amp")
        self.close()
        self.result.seek(0)

    def _write_tag(self, tag, attrs, end):
        """Write an opening tag terminated by `end` if `tag` is whitelisted."""
        if tag not in self.safe:
            return
        self.result.write(u"<" + tag)
        attrs = self._clean(attrs)
        if attrs:
            self.result.write(u" " + self.format(attrs))
        self.result.write(end)

    def handle_starttag(self, tag, attrs):
        self._write_tag(tag, attrs, u">")

    def handle_startendtag(self, tag, attrs):
        self._write_tag(tag, attrs, u"/>")

    def handle_endtag(self, tag):
        if tag in self.safe:
            self.result.write(u"</" + tag + u">")

    def handle_data(self, data):
        # Always re-escape text: it may contain a stray "<" or "&", or the
        # raw content of a dropped <script>/<style> element.
        self.result.write(escape(data, False))

    def handle_entityref(self, name):
        self.result.write(u'&' + name + u';')

    def handle_charref(self, char):
        self.result.write(u'&#' + char + u';')


def markdown(text):
    """Convert Markdown to (safe) HTML.

    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
    '<p><em>Ohai!</em></p>'
    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
    '<p><em>Hi</em></p>'
    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
    "<p>alert('Onoe')</p>"
    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'

    :param text: Markdown source.
    :return: the rendered HTML after it went through :class:`Sanitizer`,
        without trailing newlines.
    """
    # NOTE: `misaka` is expected to be imported at module level, outside of
    # the part of the file visible here.

    # strike-through, superscript and auto-linking of plain URLs
    exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK

    # Raw HTML is no longer skipped by the renderer (only <style> and images
    # are, and Markdown links are limited to "safe" protocols); whatever HTML
    # comes through is cleaned up by Sanitizer below.
    html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK

    # API consumers expect '<p>...</p>' without a trailing newline.
    rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")

    # Wrap output that is not paragraph-delimited on either side.
    # NOTE(review): this also wraps pure block output such as a list or a
    # heading in <p>; confirm that this is intended.
    if not rv.startswith("<p>") and not rv.endswith("</p>"):
        rv = "<p>" + rv + "</p>"

    return Sanitizer(rv).result.read()