From 6071a857874475069b521ee51aed5b1bf0c67607 Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Tue, 18 Feb 2014 16:51:04 +0100 Subject: [PATCH 1/4] add `Config.getlist` method --- isso/core.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/isso/core.py b/isso/core.py index 4b1e872..bb0c1eb 100644 --- a/isso/core.py +++ b/isso/core.py @@ -44,6 +44,9 @@ class Section: def getint(self, key): return self.conf.getint(self.section, key) + def getlist(self, key): + return self.conf.getlist(self.section, key) + def getiter(self, key): return self.conf.getiter(self.section, key) @@ -62,6 +65,7 @@ class IssoParser(ConfigParser): ... [foo] ... bar = 1h ... baz = 12 + ... spam = a, b, cdef ... bla = ... spam ... ham @@ -71,6 +75,8 @@ class IssoParser(ConfigParser): 3600 >>> parser.getint("foo", "baz") 12 + >>> parser.getlist("foo", "spam") # doctest: +IGNORE_UNICODE + ['a', 'b', 'cdef'] >>> list(parser.getiter("foo", "bla")) # doctest: +IGNORE_UNICODE ['spam', 'ham'] >>> list(parser.getiter("foo", "asd")) # doctest: +IGNORE_UNICODE @@ -92,6 +98,9 @@ class IssoParser(ConfigParser): except AttributeError: return int(IssoParser._total_seconds(delta)) + def getlist(self, section, key): + return list(map(str.strip, self.get(section, key).split(','))) + def getiter(self, section, key): for item in map(str.strip, self.get(section, key).split('\n')): if item: From d93d77c8c7ef9deb022b5cdaff2abcf7a26aa6af Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Tue, 18 Feb 2014 16:52:26 +0100 Subject: [PATCH 2/4] refactor markup and sanitization code This commit introduces a new configuration section [markup] to refine Misaka's Markdown extensions (by default strikethrough, superscript and autolink). Furthermore, you can set custom HTML elements/attributes that are allowed, e.g. to enable images, set [markup] allowed-elements = img allowed-attributes = src The refactorization separates HTML sanitization from Markdown -> HTML and allows to include new markup languages such as BB Code or reStructuredText. --- isso/__init__.py | 6 ++- isso/compat.py | 2 + isso/core.py | 6 ++- isso/utils/html.py | 86 +++++++++++++++++++++++------------------- isso/views/comments.py | 8 ++-- 5 files changed, 64 insertions(+), 44 deletions(-) diff --git a/isso/__init__.py b/isso/__init__.py index c77b07e..6d48af4 100644 --- a/isso/__init__.py +++ b/isso/__init__.py @@ -64,7 +64,7 @@ local_manager = LocalManager([local]) from isso import db, migrate, wsgi, ext, views from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config -from isso.utils import parse, http, JSONRequest, origin +from isso.utils import parse, http, JSONRequest, origin, html from isso.views import comments from isso.ext.notifications import Stdout, SMTP @@ -86,6 +86,7 @@ class Isso(object): self.conf = conf self.db = db.SQLite3(conf.get('general', 'dbpath'), conf) self.signer = URLSafeTimedSerializer(conf.get('general', 'session-key')) + self.markup = html.Markup(conf.section('markup')) super(Isso, self).__init__(conf) @@ -102,6 +103,9 @@ class Isso(object): views.Info(self) comments.API(self) + def render(self, text): + return self.markup.render(text) + def sign(self, obj): return self.signer.dumps(obj) diff --git a/isso/compat.py b/isso/compat.py index 772d561..ac09826 100644 --- a/isso/compat.py +++ b/isso/compat.py @@ -6,6 +6,7 @@ PY2K = sys.version_info[0] == 2 if not PY2K: map, zip, filter = map, zip, filter + from functools import reduce text_type = str string_types = (str, ) @@ -15,6 +16,7 @@ else: from itertools import imap, izip, ifilter map, zip, filter = imap, izip, ifilter + reduce = reduce text_type = unicode string_types = (str, unicode) diff --git a/isso/core.py b/isso/core.py index bb0c1eb..ce2abf2 100644 --- a/isso/core.py +++ b/isso/core.py @@ -132,7 +132,11 @@ class Config: "enabled = true", "ratelimit = 2", "direct-reply = 3", - "reply-to-self = false" + "reply-to-self = false", + "[markup]", + "options = strikethrough, superscript, autolink", + "allowed-elements = ", + "allowed-attributes = " ] @classmethod diff --git a/isso/utils/html.py b/isso/utils/html.py index 34a4dc7..5650bfd 100644 --- a/isso/utils/html.py +++ b/isso/utils/html.py @@ -1,9 +1,12 @@ # -*- encoding: utf-8 -*- import pkg_resources +import operator + +from isso.compat import reduce import html5lib -setattr(html5lib, "version", pkg_resources.get_distribution("html5lib").version) +html5lib_version = pkg_resources.get_distribution("html5lib").version from html5lib.sanitizer import HTMLSanitizer from html5lib.serializer import HTMLSerializer @@ -12,59 +15,66 @@ from html5lib.treewalkers import getTreeWalker import misaka -class MarkdownSanitizer(HTMLSanitizer): +def Sanitizer(elements, attributes): - # attributes found in Sundown's HTML serializer [1] except for tag, - # because images are not generated anyways. - # - # [1] https://github.com/vmg/sundown/blob/master/html/html.c - allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", - "pre", "code", "blockquote", - "del", "ins", "strong", "em", - "h1", "h2", "h3", "h4", "h5", "h6", - "table", "thead", "tbody", "th", "td"] + class Inner(HTMLSanitizer): - # href for and align for - allowed_attributes = ["align", "href"] + # attributes found in Sundown's HTML serializer [1] except for tag, + # because images are not generated anyways. + # + # [1] https://github.com/vmg/sundown/blob/master/html/html.c + allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", + "pre", "code", "blockquote", + "del", "ins", "strong", "em", + "h1", "h2", "h3", "h4", "h5", "h6", + "table", "thead", "tbody", "th", "td"] + elements - # remove disallowed tokens from the output - def disallowed_token(self, token, token_type): - return None + # href for and align for
+ allowed_attributes = ["align", "href"] + attributes + + # remove disallowed tokens from the output + def disallowed_token(self, token, token_type): + return None + + return Inner -def sanitize(document): +def sanitize(tokenizer, document): - parser = html5lib.HTMLParser(tokenizer=MarkdownSanitizer) + parser = html5lib.HTMLParser(tokenizer=tokenizer) domtree = parser.parseFragment(document) - builder = "simpletree" if html5lib.version == "0.95" else "etree" + builder = "simpletree" if html5lib_version == "0.95" else "etree" stream = html5lib.treewalkers.getTreeWalker(builder)(domtree) serializer = HTMLSerializer(quote_attr_values=True, omit_optional_tags=False) return serializer.render(stream) -def markdown(text): - """Convert Markdown to (safe) HTML. +def Markdown(extensions=("strikethrough", "superscript", "autolink")): - >>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE - '

Ohai!

' - >>> markdown("Hi") # doctest: +IGNORE_UNICODE - '

Hi

' - >>> markdown("") # doctest: +IGNORE_UNICODE - "

alert('Onoe')

" - >>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE - '

http://example.org/ and sms:+1234567890

' - """ + flags = reduce(operator.xor, map( + lambda ext: getattr(misaka, 'EXT_' + ext.upper()), extensions), 0) - # ~~strike through~~, sub script: 2^(nd) and http://example.org/ auto-link - exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK + def inner(text): + rv = misaka.html(text, extensions=flags).rstrip("\n") + if not rv.endswith("

") and not rv.endswith("

"): + return "

" + rv + "

" + return rv - # remove HTML tags, skip (for now) and only render "safe" protocols - html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK + return inner - rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n") - if not rv.startswith("

") and not rv.endswith("

"): - rv = "

" + rv + "

" - return sanitize(rv) +class Markup(object): + + def __init__(self, conf): + + parser = Markdown(conf.getlist("options")) + sanitizer = Sanitizer( + conf.getlist("allowed-elements"), + conf.getlist("allowed-attributes")) + + self._render = lambda text: sanitize(sanitizer, parser(text)) + + def render(self, text): + return self._render(text) diff --git a/isso/views/comments.py b/isso/views/comments.py index 1262170..c231ead 100644 --- a/isso/views/comments.py +++ b/isso/views/comments.py @@ -163,7 +163,7 @@ class API(object): value=self.isso.sign([rv["id"], sha1(rv["text"])]), max_age=self.conf.getint('max-age')) - rv["text"] = html.markdown(rv["text"]) + rv["text"] = self.isso.render(rv["text"]) rv["hash"] = pbkdf2(rv['email'] or rv['remote_addr'], self.isso.salt, 1000, 6).decode("utf-8") self.cache.set('hash', (rv['email'] or rv['remote_addr']).encode('utf-8'), rv['hash']) @@ -189,7 +189,7 @@ class API(object): rv.pop(key) if request.args.get('plain', '0') == '0': - rv['text'] = html.markdown(rv['text']) + rv['text'] = self.isso.render(rv['text']) return JSON(rv, 200) @@ -230,7 +230,7 @@ class API(object): value=self.isso.sign([rv["id"], sha1(rv["text"])]), max_age=self.conf.getint('max-age')) - rv["text"] = html.markdown(rv["text"]) + rv["text"] = self.isso.render(rv["text"]) resp = JSON(rv, 200) resp.headers.add("Set-Cookie", cookie(str(rv["id"]))) @@ -336,7 +336,7 @@ class API(object): if request.args.get('plain', '0') == '0': for item in rv: - item['text'] = html.markdown(item['text']) + item['text'] = self.isso.render(item['text']) return JSON(rv, 200) From 8f70a3a7cbb2e02d990fc9bd398e944b826d52e3 Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Tue, 18 Feb 2014 16:58:37 +0100 Subject: [PATCH 3/4] add tests for 81ecc8e --- specs/test_html.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 specs/test_html.py diff --git a/specs/test_html.py b/specs/test_html.py new file mode 100644 index 0000000..f03d4c2 --- /dev/null +++ b/specs/test_html.py @@ -0,0 +1,60 @@ + +try: + import unittest2 as unittest +except ImportError: + import unittest + + +from isso.core import Config +from isso.utils import html + + +class TestHTML(unittest.TestCase): + + def test_markdown(self): + convert = html.Markdown(extensions=()) + examples = [ + ("*Ohai!*", "

Ohai!

"), + ("Hi", "

Hi

"), + ("http://example.org/", '

http://example.org/

')] + + for (input, expected) in examples: + self.assertEqual(convert(input), expected) + + def test_markdown_extensions(self): + convert = html.Markdown(extensions=("strikethrough", "superscript")) + examples = [ + ("~~strike~~ through", "

strike through

"), + ("sup^(script)", "

supscript

")] + + for (input, expected) in examples: + self.assertEqual(convert(input), expected) + + @unittest.skipIf(html.html5lib_version == "0.95", "backport") + def test_sanitizer(self): + sanitizer = html.Sanitizer(elements=[], attributes=[]) + examples = [ + ('Look: ', 'Look: '), + ('Ha', 'Ha'), + ('Ha', 'Ha'), + ('

Test

', '

Test

'), + ('', 'alert("Onoe")')] + + for (input, expected) in examples: + self.assertEqual(html.sanitize(sanitizer, input), expected) + + @unittest.skipIf(html.html5lib_version == "0.95", "backport") + def test_sanitizer_extensions(self): + sanitizer = html.Sanitizer(elements=["img"], attributes=["src"]) + examples = [ + ('', ''), + ('', '')] + + for (input, expected) in examples: + self.assertEqual(html.sanitize(sanitizer, input), expected) + + def test_render(self): + conf = Config.load(None).section("markup") + renderer = html.Markup(conf).render + self.assertEqual(renderer("http://example.org/ and sms:+1234567890"), + '

http://example.org/ and sms:+1234567890

') From c6214e31d77a87c7a5ff3fca0ba93726253b19ae Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Tue, 18 Feb 2014 17:30:37 +0100 Subject: [PATCH 4/4] document new [markup] section --- docs/_static/css/site.scss | 4 ++++ docs/docs/configuration/server.rst | 31 ++++++++++++++++++++++++++++++ docs/isso.example.cfg | 19 ++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/docs/_static/css/site.scss b/docs/_static/css/site.scss index 99a7b23..9c64a10 100644 --- a/docs/_static/css/site.scss +++ b/docs/_static/css/site.scss @@ -350,6 +350,10 @@ main { margin-left: 1.2em; } + dl { + margin-bottom: 0.4em; + } + .admonition { p + p { diff --git a/docs/docs/configuration/server.rst b/docs/docs/configuration/server.rst index de9ded7..13507b7 100644 --- a/docs/docs/configuration/server.rst +++ b/docs/docs/configuration/server.rst @@ -221,6 +221,37 @@ reply-to-self Do not forget to configure the client. +Markup +------ + +Customize markup and sanitized HTML. Currently, only Markdown (via Misaka) is +supported, but new languages are relatively easy to add. + +.. code-block:: ini + + [markup] + options = strikethrough, superscript, autolink + allowed-elements = + allowed-attributes = + +options + `Misaka-specific Markdown extensions `_, all + flags starting with `EXT_` can be used there, separated by comma. + +allowed-elements + Additional HTML tags to allow in the generated output, comma-separated. By + default, only *a*, *blockquote*, *br*, *code*, *del*, *em*, *h1*, *h2*, + *h3*, *h4*, *h5*, *h6*, *hr*, *ins*, *li*, *ol*, *p*, *pre*, *strong*, + *table*, *tbody*, *td*, *th*, *thead* and *ul* are allowed. + +allowed-attributes + Additional HTML attributes (independent from elements) to allow in the + generated output, comma-separated. By default, only *align* and *href* are + allowed. + +To allow images in comments, you just need to add ``allowed-elements = img`` and +``allowed-attributes = src``. + Appendum -------- diff --git a/docs/isso.example.cfg b/docs/isso.example.cfg index b17199d..a7bedf0 100644 --- a/docs/isso.example.cfg +++ b/docs/isso.example.cfg @@ -110,3 +110,22 @@ direct-reply = 3 # comment. After the editing timeframe is gone, commenters can reply to their # own comments anyways. Do not forget to configure the client. reply-to-self = false + + +[markup] +# Customize markup and sanitized HTML. Currently, only Markdown (via Misaka) is +# supported, but new languages are relatively easy to add. + +# Misaka-specific Markdown extensions, all flags starting with EXT_ can be used +# there, separated by comma. +options = strikethrough, superscript, autolink + +# Additional HTML tags to allow in the generated output, comma-separated. By +# default, only a, blockquote, br, code, del, em, h1, h2, h3, h4, h5, h6, hr, +# ins, li, ol, p, pre, strong, table, tbody, td, th, thead and ul are allowed. +allowed-elements = + +# Additional HTML attributes (independent from elements) to allow in the +# generated output, comma-separated. By default, only align and href are +# allowed. +allowed-attributes =