diff --git a/isso/utils/__init__.py b/isso/utils/__init__.py index 5d4ec7f..6c94f15 100644 --- a/isso/utils/__init__.py +++ b/isso/utils/__init__.py @@ -14,7 +14,6 @@ try: except ImportError: from HTMLParser import HTMLParser, HTMLParseError -from werkzeug.utils import escape from werkzeug.wrappers import Request, Response from werkzeug.exceptions import BadRequest @@ -23,8 +22,6 @@ try: except ImportError: import ipaddr as ipaddress -import misaka - def anonymize(remote_addr): """ @@ -127,86 +124,6 @@ class JSONResponse(Response): json.dumps(obj).encode("utf-8"), *args, **kwargs) -class Sanitizer(HTMLParser, object): - """Sanitize HTML output: remove unsafe HTML tags such as iframe or - script based on a whitelist of allowed tags.""" - - safe = set([ - "p", "a", "pre", "blockquote", - "h1", "h2", "h3", "h4", "h5", "h6", - "em", "sub", "sup", "del", "ins", "math", - "dl", "ol", "ul", "li"]) - - @classmethod - def format(cls, attrs): - res = [] - for key, value in attrs: - if value is None: - res.append(key) - else: - res.append(u'{0}="{1}"'.format(key, escape(value))) - return ' '.join(res) - - def __init__(self, html): - super(Sanitizer, self).__init__() - self.result = io.StringIO() - self.feed(html) - self.result.seek(0) - - def handle_starttag(self, tag, attrs): - if tag in Sanitizer.safe: - self.result.write(u"<" + tag) - if attrs: - self.result.write(" " + Sanitizer.format(attrs)) - self.result.write(u">") - - def handle_data(self, data): - self.result.write(data) - - def handle_endtag(self, tag): - if tag in Sanitizer.safe: - self.result.write(u"" + tag + ">") - - def handle_startendtag(self, tag, attrs): - if tag in Sanitizer.safe: - self.result.write(u"<" + tag) - if attrs: - self.result.write(" " + Sanitizer.format(attrs)) - self.result.write(u"/>") - - def handle_entityref(self, name): - self.result.write(u'&' + name + ';') - - def handle_charref(self, char): - self.result.write(u'' + char + ';') - - -def markdown(text): - """Convert Markdown to (safe) HTML. - - >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE - '
Ohai!
' - >>> markdown("Hi") # doctest: +IGNORE_UNICODE - 'Hi
' - >>> markdown("") # doctest: +IGNORE_UNICODE - "alert('Onoe')
" - >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE - 'http://example.org/ and sms:+1234567890
' - """ - - # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link - exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK - - # remove HTML tags, skip (for now) and only render "safe" protocols - html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK - - rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n") - if not rv.startswith("") and not rv.endswith("
"): - rv = "" + rv + "
" - - return Sanitizer(rv).result.read() - - def origin(hosts): hosts = [x.rstrip("/") for x in hosts] diff --git a/isso/utils/html.py b/isso/utils/html.py new file mode 100644 index 0000000..df60ad8 --- /dev/null +++ b/isso/utils/html.py @@ -0,0 +1,66 @@ +# -*- encoding: utf-8 -*- + +import html5lib + +from html5lib.sanitizer import HTMLSanitizer +from html5lib.serializer import HTMLSerializer +from html5lib.treewalkers import getTreeWalker + +import misaka + + +class MarkdownSanitizer(HTMLSanitizer): + + # attributes found in Sundown's HTML serializer [1] except for tag, + # because images are not generated anyways. + # + # [1] https://github.com/vmg/sundown/blob/master/html/html.c + allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", + "pre", "code", "blockquote", + "del", "ins", "strong", "em", + "h1", "h2", "h3", "h4", "h5", "h6", + "table", "thead", "tbody", "th", "td"] + + # href for and align for