isso/isso/utils/html.py

# -*- encoding: utf-8 -*-

import pkg_resources
import operator

from isso.compat import reduce

import html5lib
html5lib_version = pkg_resources.get_distribution("html5lib").version

from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer import HTMLSerializer

import misaka


def Sanitizer(elements, attributes):
    """Create an html5lib sanitizer class with a Markdown-oriented whitelist.

    :param elements: extra HTML element names to allow on top of the
        built-in whitelist (any iterable of strings).
    :param attributes: extra attribute names to allow on top of ``align``
        and ``href`` (any iterable of strings).
    :return: a subclass of ``HTMLSanitizer`` (the class itself, not an
        instance), meant to be handed to ``html5lib.HTMLParser`` as its
        tokenizer.
    """

    class Inner(HTMLSanitizer):

        # Elements found in Sundown's HTML serializer [1] except for the
        # <img> tag, because images are not generated anyways. <sup> is part
        # of the list because the "superscript" extension is enabled by
        # default and its output would otherwise be stripped.
        #
        # [1] https://github.com/vmg/sundown/blob/master/html/html.c
        #
        # list(...) lets callers pass tuples or other iterables, not only
        # lists.
        allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
                            "pre", "code", "blockquote",
                            "del", "ins", "strong", "em", "sup",
                            "h1", "h2", "h3", "h4", "h5", "h6",
                            "table", "thead", "tbody", "th", "td"
                            ] + list(elements)

        # href for <a> and align for <table>
        allowed_attributes = ["align", "href"] + list(attributes)

        # remove disallowed tokens from the output
        def disallowed_token(self, token, token_type):
            return None

    return Inner


def sanitize(tokenizer, document):
    """Parse *document* as an HTML fragment and serialize it back to a string.

    :param tokenizer: tokenizer class passed to ``html5lib.HTMLParser``
        (e.g. the class returned by ``Sanitizer``).
    :param document: HTML markup to parse.
    :return: whatever ``HTMLSerializer.render`` produces for the parsed
        fragment, with attribute values quoted and optional tags kept.
    """

    fragment = html5lib.HTMLParser(tokenizer=tokenizer).parseFragment(document)

    # html5lib 0.95 is special-cased: it gets the "simpletree" tree walker,
    # every other version gets "etree".
    if html5lib_version == "0.95":
        walker_name = "simpletree"
    else:
        walker_name = "etree"

    make_walker = html5lib.treewalkers.getTreeWalker(walker_name)
    tokens = make_walker(fragment)
    renderer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)

    return renderer.render(tokens)


def Markdown(extensions=("strikethrough", "superscript", "autolink")):
    """Build a Markdown-to-HTML renderer backed by Misaka.

    :param extensions: iterable of Misaka extension names without the
        ``EXT_`` prefix; each name is upper-cased and resolved to the
        matching ``misaka.EXT_<NAME>`` constant.
    :return: a function that takes Markdown text and returns HTML.
    :raises AttributeError: if a name has no ``misaka.EXT_<NAME>`` constant.
    """

    # Combine the flags with bitwise OR, not XOR: with XOR an extension that
    # is listed twice cancels itself out and ends up silently disabled.
    flags = reduce(
        operator.or_,
        (getattr(misaka, 'EXT_' + ext.upper()) for ext in extensions),
        0)

    def inner(text):
        rv = misaka.html(text, extensions=flags).rstrip("\n")
        # Wrap the output in a paragraph unless it already ends with a
        # paragraph tag.
        if not rv.endswith(("<p>", "</p>")):
            return "<p>" + rv + "</p>"
        return rv

    return inner


class Markup(object):
    """Render user text to sanitized HTML.

    The constructor reads three lists from *conf* through ``conf.getlist``:
    ``options`` (Markdown extensions), ``allowed-elements`` and
    ``allowed-attributes`` (additions to the sanitizer whitelist).
    """

    def __init__(self, conf):

        to_html = Markdown(conf.getlist("options"))
        tokenizer = Sanitizer(
            conf.getlist("allowed-elements"),
            conf.getlist("allowed-attributes"))

        def pipeline(text):
            # Markdown -> HTML first, then sanitize the generated markup.
            return sanitize(tokenizer, to_html(text))

        self._render = pipeline

    def render(self, text):
        """Return the sanitized HTML rendering of *text*."""
        return self._render(text)
Use html5lib's sanitizer (supersedes 3713d5e). Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-supplied HTML. Instead of reinventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unit tests. This feature is only available with html5lib 0.99(9) and later; earlier releases just escape disallowed tags. 11 years ago			`# -*- encoding: utf-8 -*-`

Add support for html5lib==0.95 (fixes #60). The python-html5lib package in Debian Wheezy does not support `etree` as a tree builder (it was called `simpletree` back then). 10 years ago			`import pkg_resources`
Refactor markup and sanitization code. This commit introduces a new configuration section, [markup], to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed; e.g., to enable images, set `allowed-elements = img` and `allowed-attributes = src` in the [markup] section. The refactoring separates HTML sanitization from the Markdown -> HTML conversion and makes it possible to include new markup languages such as BBCode or reStructuredText. 10 years ago			`import operator`

			`from isso.compat import reduce`
add support for html5lib==0.95, fixes #60 The python-html5lib package in Debian Wheezy does not support `etree` as tree builder (called `simpletree` back then). 10 years ago
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago			`import html5lib`
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`html5lib_version = pkg_resources.get_distribution("html5lib").version`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
			`from html5lib.sanitizer import HTMLSanitizer`
			`from html5lib.serializer import HTMLSerializer`

			`import misaka`


refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`def Sanitizer(elements, attributes):`

			`class Inner(HTMLSanitizer):`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`# attributes found in Sundown's HTML serializer [1] except for <img> tag,`
			`# because images are not generated anyways.`
			`#`
			`# [1] https://github.com/vmg/sundown/blob/master/html/html.c`
			`allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",`
			`"pre", "code", "blockquote",`
			`"del", "ins", "strong", "em",`
			`"h1", "h2", "h3", "h4", "h5", "h6",`
			`"table", "thead", "tbody", "th", "td"] + elements`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`# href for <a> and align for <table>`
			`allowed_attributes = ["align", "href"] + attributes`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`# remove disallowed tokens from the output`
			`def disallowed_token(self, token, token_type):`
			`return None`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`return Inner`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago

refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`def sanitize(tokenizer, document):`

			`parser = html5lib.HTMLParser(tokenizer=tokenizer)`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago			`domtree = parser.parseFragment(document)`

refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`builder = "simpletree" if html5lib_version == "0.95" else "etree"`
add support for html5lib==0.95, fixes #60 The python-html5lib package in Debian Wheezy does not support `etree` as tree builder (called `simpletree` back then). 10 years ago			`stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago			`serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)`

			`return serializer.render(stream)`


refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`def Markdown(extensions=("strikethrough", "superscript", "autolink")):`

			`flags = reduce(operator.xor, map(`
			`lambda ext: getattr(misaka, 'EXT_' + ext.upper()), extensions), 0)`

			`def inner(text):`
			`rv = misaka.html(text, extensions=flags).rstrip("\n")`
			`if not rv.endswith("<p>") and not rv.endswith("</p>"):`
			`return "<p>" + rv + "</p>"`
			`return rv`

			`return inner`

use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`class Markup(object):`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`def __init__(self, conf):`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`parser = Markdown(conf.getlist("options"))`
			`sanitizer = Sanitizer(`
			`conf.getlist("allowed-elements"),`
			`conf.getlist("allowed-attributes"))`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`self._render = lambda text: sanitize(sanitizer, parser(text))`
use html5lib's sanitizer, supersedes 3713d5e Python's HTMLParser is smart enough to filter malicious tags but fails to repair invalid, user-inputted HTML. Instead of re-inventing the wheel, Isso now uses html5lib's HTMLSanitizer with a whitelist of all tags generated by Sundown. Disallowed tags are discarded from the output to match the previous unittests. This feature is only available for html5lib 0.99(9) and later. Earlier releases just escape disallowed tags. 11 years ago
refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. 10 years ago			`def render(self, text):`
			`return self._render(text)`