isso/isso/utils/html.py

# -*- encoding: utf-8 -*-

import pkg_resources

import html5lib
# Record the installed html5lib release on the module itself; sanitize() reads
# html5lib.version to pick the name of the tree walker it requests.
setattr(html5lib, "version", pkg_resources.get_distribution("html5lib").version)

from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer import HTMLSerializer
from html5lib.treewalkers import getTreeWalker

import misaka


class MarkdownSanitizer(HTMLSanitizer):
    """HTMLSanitizer restricted to a whitelist of Markdown-generated markup.

    Tokens that are not whitelisted are dropped from the output (see
    ``disallowed_token``).
    """

    # Elements found in Sundown's HTML serializer [1], except for the <img>
    # tag, because images are not generated anyway.
    #
    # [1] https://github.com/vmg/sundown/blob/master/html/html.c
    allowed_elements = [
        "a", "p", "hr", "br", "ol", "ul", "li",
        "pre", "code", "blockquote",
        "del", "ins", "strong", "em",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "table", "thead", "tbody", "th", "td",
    ]

    # "href" for <a> and "align" for <table>
    allowed_attributes = [
        "align",
        "href",
    ]

    def disallowed_token(self, token, token_type):
        """Return None for every disallowed token so it is removed from the output."""
        return None


def sanitize(document):
    """Parse an HTML fragment with MarkdownSanitizer and serialize it again.

    :param document: HTML fragment to clean up
    :return: the markup rendered by HTMLSerializer from the sanitized tree
    """

    fragment = html5lib.HTMLParser(tokenizer=MarkdownSanitizer).parseFragment(document)

    # html5lib 0.95 is asked for a "simpletree" walker, any other release
    # for an "etree" one
    if html5lib.version == "0.95":
        walker_name = "simpletree"
    else:
        walker_name = "etree"

    walk = getTreeWalker(walker_name)
    tokens = walk(fragment)

    # always quote attribute values and keep optional tags in the output
    return HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False,
    ).render(tokens)


def markdown(text):
    """Render Markdown `text` to HTML and run the result through sanitize().

    >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
    '<p><em>Ohai!</em></p>'
    >>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
    '<p><em>Hi</em></p>'
    >>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
    "<p>alert('Onoe')</p>"
    >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
    '<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
    """

    # Markdown extensions: ~~strike through~~, superscript such as 2^(nd),
    # and auto-linking of bare URLs like http://example.org/
    extensions = (misaka.EXT_STRIKETHROUGH
                  | misaka.EXT_SUPERSCRIPT
                  | misaka.EXT_AUTOLINK)

    # renderer flags: skip style and <img> output (for now) and only render
    # links with "safe" protocols
    render_flags = (misaka.HTML_SKIP_STYLE
                    | misaka.HTML_SKIP_IMAGES
                    | misaka.HTML_SAFELINK)

    rendered = misaka.html(text, extensions=extensions, render_flags=render_flags)
    rendered = rendered.rstrip("\n")

    # wrap in a paragraph only when the output neither opens with <p> nor
    # closes with </p>
    if not (rendered.startswith("<p>") or rendered.endswith("</p>")):
        rendered = "<p>" + rendered + "</p>"

    return sanitize(rendered)
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago			`# -- encoding: utf-8 --`

add support for html5lib==0.95, fixes #60 The python-html5lib package in Debian Wheezy does not support `etree` as tree builder (called `simpletree` back then). 11 years ago			`import pkg_resources`

use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago			`import html5lib`
add support for html5lib==0.95, fixes #60 The python-html5lib package in Debian Wheezy does not support `etree` as tree builder (called `simpletree` back then). 11 years ago			`setattr(html5lib, "version", pkg_resources.get_distribution("html5lib").version)`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
			`from html5lib.sanitizer import HTMLSanitizer`
			`from html5lib.serializer import HTMLSerializer`
			`from html5lib.treewalkers import getTreeWalker`

			`import misaka`


			`class MarkdownSanitizer(HTMLSanitizer):`

			`# attributes found in Sundown's HTML serializer [1] except for <img> tag,`
			`# because images are not generated anyways.`
			`#`
			`# [1] https://github.com/vmg/sundown/blob/master/html/html.c`
			`allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",`
			`"pre", "code", "blockquote",`
			`"del", "ins", "strong", "em",`
			`"h1", "h2", "h3", "h4", "h5", "h6",`
			`"table", "thead", "tbody", "th", "td"]`

			`# href for <a> and align for <table>`
			`allowed_attributes = ["align", "href"]`

			`# remove disallowed tokens from the output`
			`def disallowed_token(self, token, token_type):`
			`return None`


			`def sanitize(document):`

			`parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer)`
			`domtree = parser.parseFragment(document)`

add support for html5lib==0.95, fixes #60 The python-html5lib package in Debian Wheezy does not support `etree` as tree builder (called `simpletree` back then). 11 years ago			`builder = "simpletree" if html5lib.version == "0.95" else "etree"`
			`stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago			`serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)`

			`return serializer.render(stream)`


			`def markdown(text):`
			`"""Convert Markdown to (safe) HTML.`

			`>>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE`
			`'<p><em>Ohai!</em></p>'`
			`>>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE`
			`'<p><em>Hi</em></p>'`
			`>>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE`
			`"<p>alert('Onoe')</p>"`
			`>>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE`
			`'<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'`
			`"""`

			`# ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link`
			`exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK`

			`# remove HTML tags, skip <img> (for now) and only render "safe" protocols`
			`html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK`

			`rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")`
			`if not rv.startswith("<p>") and not rv.endswith("</p>"):`
			`rv = "<p>" + rv + "</p>"`

			`return sanitize(rv)`