diff --git a/isso/utils/__init__.py b/isso/utils/__init__.py
index 5d4ec7f..6c94f15 100644
--- a/isso/utils/__init__.py
+++ b/isso/utils/__init__.py
@@ -14,7 +14,6 @@ try:
except ImportError:
from HTMLParser import HTMLParser, HTMLParseError
-from werkzeug.utils import escape
from werkzeug.wrappers import Request, Response
from werkzeug.exceptions import BadRequest
@@ -23,8 +22,6 @@ try:
except ImportError:
import ipaddr as ipaddress
-import misaka
-
def anonymize(remote_addr):
"""
@@ -127,86 +124,6 @@ class JSONResponse(Response):
json.dumps(obj).encode("utf-8"), *args, **kwargs)
-class Sanitizer(HTMLParser, object):
-    """Sanitize HTML output: remove unsafe HTML tags such as iframe or
-    script based on a whitelist of allowed tags."""
-
-    safe = set([
-        "p", "a", "pre", "blockquote",
-        "h1", "h2", "h3", "h4", "h5", "h6",
-        "em", "sub", "sup", "del", "ins", "math",
-        "dl", "ol", "ul", "li"])
-
-    @classmethod
-    def format(cls, attrs):
-        res = []
-        for key, value in attrs:
-            if value is None:
-                res.append(key)
-            else:
-                res.append(u'{0}="{1}"'.format(key, escape(value)))
-        return ' '.join(res)
-
-    def __init__(self, html):
-        super(Sanitizer, self).__init__()
-        self.result = io.StringIO()
-        self.feed(html)
-        self.result.seek(0)
-
-    def handle_starttag(self, tag, attrs):
-        if tag in Sanitizer.safe:
-            self.result.write(u"<" + tag)
-            if attrs:
-                self.result.write(" " + Sanitizer.format(attrs))
-            self.result.write(u">")
-
-    def handle_data(self, data):
-        self.result.write(data)
-
-    def handle_endtag(self, tag):
-        if tag in Sanitizer.safe:
-            self.result.write(u"</" + tag + ">")
-
-    def handle_startendtag(self, tag, attrs):
-        if tag in Sanitizer.safe:
-            self.result.write(u"<" + tag)
-            if attrs:
-                self.result.write(" " + Sanitizer.format(attrs))
-            self.result.write(u"/>")
-
-    def handle_entityref(self, name):
-        self.result.write(u'&' + name + ';')
-
-    def handle_charref(self, char):
-        self.result.write(u'&#' + char + ';')
-
-
-def markdown(text):
-    """Convert Markdown to (safe) HTML.
-
-    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
-    '<p><em>Ohai!</em></p>'
-    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
-    '<p><em>Hi</em></p>'
-    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
-    "<p>alert('Onoe')</p>"
-    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
-    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
-    """
-
-    # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link
-    exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK
-
-    # remove HTML tags, skip <img> (for now) and only render "safe" protocols
-    html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
-
-    rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
-    if not rv.startswith("<p>") and not rv.endswith("</p>"):
-        rv = "<p>" + rv + "</p>"
-
-    return Sanitizer(rv).result.read()
-
-
def origin(hosts):
hosts = [x.rstrip("/") for x in hosts]
diff --git a/isso/utils/html.py b/isso/utils/html.py
new file mode 100644
index 0000000..df60ad8
--- /dev/null
+++ b/isso/utils/html.py
@@ -0,0 +1,66 @@
+# -*- encoding: utf-8 -*-
+
+import html5lib
+
+from html5lib.sanitizer import HTMLSanitizer
+from html5lib.serializer import HTMLSerializer
+from html5lib.treewalkers import getTreeWalker
+
+import misaka
+
+
+class MarkdownSanitizer(HTMLSanitizer):
+    """html5lib sanitizer that whitelists only the markup listed below."""
+    # elements found in Sundown's HTML serializer [1] except for the <img> tag,
+    # because images are not generated anyways.
+    # NOTE(review): "tr" is absent although table/th/td are allowed -- confirm.
+    # [1] https://github.com/vmg/sundown/blob/master/html/html.c
+    allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
+                        "pre", "code", "blockquote",
+                        "del", "ins", "strong", "em",
+                        "h1", "h2", "h3", "h4", "h5", "h6",
+                        "table", "thead", "tbody", "th", "td"]
+
+    # href for <a> and align for <table>
+    allowed_attributes = ["align", "href"]
+
+    # remove disallowed tokens from the output
+    def disallowed_token(self, token, token_type):
+        return None
+
+
+def sanitize(document):
+    """Parse the HTML fragment `document` through MarkdownSanitizer and re-serialize it."""
+    parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer)
+    domtree = parser.parseFragment(document)
+
+    stream = getTreeWalker('etree')(domtree)
+    serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)
+
+    return serializer.render(stream)
+
+
+def markdown(text):
+    """Convert Markdown to (safe) HTML.
+
+    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
+    '<p><em>Ohai!</em></p>'
+    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
+    '<p><em>Hi</em></p>'
+    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
+    "<p>alert('Onoe')</p>"
+    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
+    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
+    """
+
+    # ~~strike through~~, superscript: 2^(nd) and http://example.org/ auto-link
+    exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK
+
+    # remove HTML tags, skip <img> (for now) and only render "safe" protocols
+    html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
+
+    rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
+    if not rv.startswith("<p>") and not rv.endswith("</p>"):
+        rv = "<p>" + rv + "</p>"
+
+    return sanitize(rv)
diff --git a/isso/views/comments.py b/isso/views/comments.py
index 492e99c..1262170 100644
--- a/isso/views/comments.py
+++ b/isso/views/comments.py
@@ -16,7 +16,7 @@ from werkzeug.exceptions import BadRequest, Forbidden, NotFound
from isso.compat import text_type as str
from isso import utils, local
-from isso.utils import http, parse, markdown, JSONResponse as JSON
+from isso.utils import http, parse, html, JSONResponse as JSON
from isso.utils.crypto import pbkdf2
from isso.views import requires
@@ -163,7 +163,7 @@ class API(object):
value=self.isso.sign([rv["id"], sha1(rv["text"])]),
max_age=self.conf.getint('max-age'))
- rv["text"] = markdown(rv["text"])
+ rv["text"] = html.markdown(rv["text"])
rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8")
self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash'])
@@ -189,7 +189,7 @@ class API(object):
rv.pop(key)
if request.args.get('plain', '0') == '0':
- rv['text'] = markdown(rv['text'])
+ rv['text'] = html.markdown(rv['text'])
return JSON(rv, 200)
@@ -230,7 +230,7 @@ class API(object):
value=self.isso.sign([rv["id"], sha1(rv["text"])]),
max_age=self.conf.getint('max-age'))
- rv["text"] = markdown(rv["text"])
+ rv["text"] = html.markdown(rv["text"])
resp = JSON(rv, 200)
resp.headers.add("Set-Cookie", cookie(str(rv["id"])))
@@ -336,7 +336,7 @@ class API(object):
if request.args.get('plain', '0') == '0':
for item in rv:
- item['text'] = markdown(item['text'])
+ item['text'] = html.markdown(item['text'])
return JSON(rv, 200)