diff --git a/isso/utils/__init__.py b/isso/utils/__init__.py index 5d4ec7f..6c94f15 100644 --- a/isso/utils/__init__.py +++ b/isso/utils/__init__.py @@ -14,7 +14,6 @@ try: except ImportError: from HTMLParser import HTMLParser, HTMLParseError -from werkzeug.utils import escape from werkzeug.wrappers import Request, Response from werkzeug.exceptions import BadRequest @@ -23,8 +22,6 @@ try: except ImportError: import ipaddr as ipaddress -import misaka - def anonymize(remote_addr): """ @@ -127,86 +124,6 @@ class JSONResponse(Response): json.dumps(obj).encode("utf-8"), *args, **kwargs) -class Sanitizer(HTMLParser, object): - """Sanitize HTML output: remove unsafe HTML tags such as iframe or - script based on a whitelist of allowed tags.""" - - safe = set([ - "p", "a", "pre", "blockquote", - "h1", "h2", "h3", "h4", "h5", "h6", - "em", "sub", "sup", "del", "ins", "math", - "dl", "ol", "ul", "li"]) - - @classmethod - def format(cls, attrs): - res = [] - for key, value in attrs: - if value is None: - res.append(key) - else: - res.append(u'{0}="{1}"'.format(key, escape(value))) - return ' '.join(res) - - def __init__(self, html): - super(Sanitizer, self).__init__() - self.result = io.StringIO() - self.feed(html) - self.result.seek(0) - - def handle_starttag(self, tag, attrs): - if tag in Sanitizer.safe: - self.result.write(u"<" + tag) - if attrs: - self.result.write(" " + Sanitizer.format(attrs)) - self.result.write(u">") - - def handle_data(self, data): - self.result.write(data) - - def handle_endtag(self, tag): - if tag in Sanitizer.safe: - self.result.write(u"</" + tag + ">") - - def handle_startendtag(self, tag, attrs): - if tag in Sanitizer.safe: - self.result.write(u"<" + tag) - if attrs: - self.result.write(" " + Sanitizer.format(attrs)) - self.result.write(u"/>") - - def handle_entityref(self, name): - self.result.write(u'&' + name + ';') - - def handle_charref(self, char): - self.result.write(u'&#' + char + ';') - - -def markdown(text): - """Convert Markdown to (safe) HTML. 
- - >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE - '<p><em>Ohai!</em></p>' - >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE - '<p><em>Hi</em></p>' - >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE - "<p>alert('Onoe')</p>" - >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE - '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>' - """ - - # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link - exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK - - # remove HTML tags, skip <img> (for now) and only render "safe" protocols - html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK - - rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n") - if not rv.startswith("<p>") and not rv.endswith("</p>"): - rv = "<p>" + rv + "</p>
" - - return Sanitizer(rv).result.read() - - def origin(hosts): hosts = [x.rstrip("/") for x in hosts] diff --git a/isso/utils/html.py b/isso/utils/html.py new file mode 100644 index 0000000..df60ad8 --- /dev/null +++ b/isso/utils/html.py @@ -0,0 +1,66 @@ +# -*- encoding: utf-8 -*- + +import html5lib + +from html5lib.sanitizer import HTMLSanitizer +from html5lib.serializer import HTMLSerializer +from html5lib.treewalkers import getTreeWalker + +import misaka + + +class MarkdownSanitizer(HTMLSanitizer): + + # attributes found in Sundown's HTML serializer [1] except for <img> tag, + # because images are not generated anyways. + # + # [1] https://github.com/vmg/sundown/blob/master/html/html.c + allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", + "pre", "code", "blockquote", + "del", "ins", "strong", "em", + "h1", "h2", "h3", "h4", "h5", "h6", + "table", "thead", "tbody", "th", "td"] + + # href for <a> and align for <table> + allowed_attributes = ["align", "href"] + + # remove disallowed tokens from the output + def disallowed_token(self, token, token_type): + return None + + +def sanitize(document): + + parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer) + domtree = parser.parseFragment(document) + + stream = html5lib.treewalkers.getTreeWalker('etree')(domtree) + serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False) + + return serializer.render(stream) + + +def markdown(text): + """Convert Markdown to (safe) HTML. 

Ohai!

' + >>> markdown("Hi") # doctest: +IGNORE_UNICODE + '

Hi

' + >>> markdown("") # doctest: +IGNORE_UNICODE + "

alert('Onoe')

" + >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE + '

http://example.org/ and sms:+1234567890

' + """ + + # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link + exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK + + # remove HTML tags, skip (for now) and only render "safe" protocols + html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK + + rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n") + if not rv.startswith("

") and not rv.endswith("

"): + rv = "

" + rv + "

" + + return sanitize(rv) diff --git a/isso/views/comments.py b/isso/views/comments.py index 492e99c..1262170 100644 --- a/isso/views/comments.py +++ b/isso/views/comments.py @@ -16,7 +16,7 @@ from werkzeug.exceptions import BadRequest, Forbidden, NotFound from isso.compat import text_type as str from isso import utils, local -from isso.utils import http, parse, markdown, JSONResponse as JSON +from isso.utils import http, parse, html, JSONResponse as JSON from isso.utils.crypto import pbkdf2 from isso.views import requires @@ -163,7 +163,7 @@ class API(object): value=self.isso.sign([rv["id"], sha1(rv["text"])]), max_age=self.conf.getint('max-age')) - rv["text"] = markdown(rv["text"]) + rv["text"] = html.markdown(rv["text"]) rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8") self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash']) @@ -189,7 +189,7 @@ class API(object): rv.pop(key) if request.args.get('plain', '0') == '0': - rv['text'] = markdown(rv['text']) + rv['text'] = html.markdown(rv['text']) return JSON(rv, 200) @@ -230,7 +230,7 @@ class API(object): value=self.isso.sign([rv["id"], sha1(rv["text"])]), max_age=self.conf.getint('max-age')) - rv["text"] = markdown(rv["text"]) + rv["text"] = html.markdown(rv["text"]) resp = JSON(rv, 200) resp.headers.add("Set-Cookie", cookie(str(rv["id"]))) @@ -336,7 +336,7 @@ class API(object): if request.args.get('plain', '0') == '0': for item in rv: - item['text'] = markdown(item['text']) + item['text'] = html.markdown(item['text']) return JSON(rv, 200)