From 3713d5e8eea8006c2b912810af40d2045e1f0eeb Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Sun, 12 Jan 2014 12:52:33 +0100 Subject: [PATCH] allow raw HTML markup for a few (whitelisted) tags To be compatible with comments from Disqus (and users unfamiliar with Markdown), Misaka no longer disables user-inputted HTML, but the generated HTML is now post-processed and all "unsafe" tags (not possible with Markdown) are discarded. Whitelist: p, a, pre, blockquote, h1-h6, em, sub, sup, del, ins, math, dl, ol, ul, li This commit also removes an unnecessary newline generated by Misaka/Sundown. --- isso/utils/__init__.py | 73 ++++++++++++++++++++++++++++++++++++++++-- specs/test_comments.py | 10 +++--- 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/isso/utils/__init__.py b/isso/utils/__init__.py index a8327c3..5d4ec7f 100644 --- a/isso/utils/__init__.py +++ b/isso/utils/__init__.py @@ -5,9 +5,16 @@ from __future__ import division import pkg_resources werkzeug = pkg_resources.get_distribution("werkzeug") +import io import json import hashlib +try: + from html.parser import HTMLParser, HTMLParseError +except ImportError: + from HTMLParser import HTMLParser, HTMLParseError + +from werkzeug.utils import escape from werkzeug.wrappers import Request, Response from werkzeug.exceptions import BadRequest @@ -120,13 +127,69 @@ class JSONResponse(Response): json.dumps(obj).encode("utf-8"), *args, **kwargs) +class Sanitizer(HTMLParser, object): + """Sanitize HTML output: remove unsafe HTML tags such as iframe or + script based on a whitelist of allowed tags.""" + + safe = set([ + "p", "a", "pre", "blockquote", + "h1", "h2", "h3", "h4", "h5", "h6", + "em", "sub", "sup", "del", "ins", "math", + "dl", "ol", "ul", "li"]) + + @classmethod + def format(cls, attrs): + res = [] + for key, value in attrs: + if value is None: + res.append(key) + else: + res.append(u'{0}="{1}"'.format(key, escape(value))) + return ' '.join(res) + + def __init__(self, html): + 
super(Sanitizer, self).__init__() + self.result = io.StringIO() + self.feed(html) + self.result.seek(0) + + def handle_starttag(self, tag, attrs): + if tag in Sanitizer.safe: + self.result.write(u"<" + tag) + if attrs: + self.result.write(" " + Sanitizer.format(attrs)) + self.result.write(u">") + + def handle_data(self, data): + self.result.write(data) + + def handle_endtag(self, tag): + if tag in Sanitizer.safe: + self.result.write(u"") + + def handle_startendtag(self, tag, attrs): + if tag in Sanitizer.safe: + self.result.write(u"<" + tag) + if attrs: + self.result.write(" " + Sanitizer.format(attrs)) + self.result.write(u"/>") + + def handle_entityref(self, name): + self.result.write(u'&' + name + ';') + + def handle_charref(self, char): + self.result.write(u'&#' + char + ';') + + def markdown(text): """Convert Markdown to (safe) HTML. >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE '

<p><em>Ohai!</em></p>

' + >>> markdown("Hi") # doctest: +IGNORE_UNICODE + '

Hi

' >>> markdown("") # doctest: +IGNORE_UNICODE - '

alert('Onoe')

' + "

<p>alert('Onoe')</p>

" >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE '

<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>

' """ @@ -135,9 +198,13 @@ def markdown(text): exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK # remove HTML tags, skip (for now) and only render "safe" protocols - html = misaka.HTML_SKIP_HTML | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK + html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK + + rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n") + if not rv.startswith("

<p>") and not rv.endswith("</p>"): + rv = "<p>" + rv + "</p>

" - return misaka.html(text, extensions=exts, render_flags=html).strip("\n") + return Sanitizer(rv).result.read() def origin(hosts): diff --git a/specs/test_comments.py b/specs/test_comments.py index 10704d8..7e19911 100644 --- a/specs/test_comments.py +++ b/specs/test_comments.py @@ -54,7 +54,7 @@ class TestComments(unittest.TestCase): rv = loads(r.data) assert rv['id'] == 1 - assert rv['text'] == '

<p>Lorem ipsum ...</p>

\n' + assert rv['text'] == '

<p>Lorem ipsum ...</p>

' def testCreate(self): @@ -66,7 +66,7 @@ class TestComments(unittest.TestCase): rv = loads(rv.data) assert rv["mode"] == 1 - assert rv["text"] == '

<p>Lorem ipsum ...</p>

\n' + assert rv["text"] == '

<p>Lorem ipsum ...</p>

' def textCreateWithNonAsciiText(self): @@ -78,7 +78,7 @@ class TestComments(unittest.TestCase): rv = loads(rv.data) assert rv["mode"] == 1 - assert rv["text"] == '

<p>Здравствуй, мир!</p>

\n' + assert rv["text"] == '

<p>Здравствуй, мир!</p>

' def testCreateMultiple(self): @@ -262,10 +262,10 @@ class TestComments(unittest.TestCase): self.post('/new?uri=test', data=json.dumps({"text": "Tpyo"})) self.put('/id/1', data=json.dumps({"text": "Tyop"})) - assert loads(self.get('/id/1').data)["text"] == "

<p>Tyop</p>

\n" + assert loads(self.get('/id/1').data)["text"] == "

<p>Tyop</p>

" self.put('/id/1', data=json.dumps({"text": "Typo"})) - assert loads(self.get('/id/1').data)["text"] == "

<p>Typo</p>

\n" + assert loads(self.get('/id/1').data)["text"] == "

<p>Typo</p>

" def testDeleteCommentRemovesThread(self):