From 0d07515c189fc20f829a8efd2d6fe53f3cebc3d3 Mon Sep 17 00:00:00 2001
From: Martin Zimmermann <info@posativ.org>
Date: Sat, 2 Nov 2013 15:07:42 +0100
Subject: [PATCH] override HTML title parsing with a `data-title="..."`
 attribute

If you don't use an <h1> to mark up your post's title (but an <h2>), it
is no longer possible to reliably detect the site's title.

E.g. you have a single page with only one <h1> and that's the
*real* title of that page. But on the other hand, it is also
possible, that the <h1> tag is just your website's name and the
actual post title is marked up in <h2>.
---
 README.md           | 35 +++++++++++++++++++++++++++++++++++
 isso/utils/parse.py | 19 ++++++++++++++++---
 2 files changed, 51 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index a93a6a9..186d73e 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,41 @@ current comment count.
 This functionality is already included when you embed `embed.min.js`, do
 *not* mix `embed.min.js` and `count.min.js` in a single document.
 
+### Client Configuration
+
+You can configure the client (the JS part) via `data-` attributes:
+
+*   data-title
+
+    When you start a new thread (= first comment on a page), Isso sends
+    a GET request to that page to a) check that it exists and b) parse the
+    site's heading (currently used as subject in emails).
+
+    Isso assumes that the title is inside an `h1` tag near the isso thread:
+
+    ```html
+    <html>
+        <body>
+            <h1>Website Title</h1>
+            <article>
+                <header>
+                    <h1>Post Title</h1>
+                <section id="isso-thread">
+            ...
+    ```
+
+    In this example, the detected title is `Post Title` as expected, but some
+    older sites may only use a single `h1` as their website's main title, and
+    an `h2` for the post title. Unfortunately this is ambiguous and you have
+    to tell Isso what the actual post title is:
+
+    ```html
+    <section data-title="Post Title" id="isso-thread">
+    ```
+
+    Make sure to percent-encode the attribute value (Isso unquotes it, e.g. `No way%21` becomes `No way!`).
+
+
 ### Webserver configuration
 
 *   nginx configuration to run Isso on `/isso`:
diff --git a/isso/utils/parse.py b/isso/utils/parse.py
index 8decd38..a03ba7b 100644
--- a/isso/utils/parse.py
+++ b/isso/utils/parse.py
@@ -7,9 +7,10 @@ import datetime
 from itertools import chain
 
 try:
+    from urllib import unquote
     from urlparse import urlparse
 except ImportError:
-    from urllib.parse import urlparse
+    from urllib.parse import urlparse, unquote
 
 import html5lib
 
@@ -81,7 +82,7 @@ def title(data, default=u"Untitled."):
     which is the nearest H1 node in context to an element with the `isso-thread` id.
 
     >>> title("asdf")  # doctest: +IGNORE_UNICODE
-    u'Untitled.'
+    'Untitled.'
     >>> title('''
     ... <html>
     ... <head>
@@ -101,7 +102,14 @@ def title(data, default=u"Untitled."):
     ...     </article>
     ... </body>
     ... </html>''')  # doctest: +IGNORE_UNICODE
-    u'Can you find me?'
+    'Can you find me?'
+    >>> title('''
+    ... <html>
+    ... <body>
+    ... <h1>I'm the real title!1
+    ... <section data-title="No way%21" id="isso-thread">
+    ... ''')  # doctest: +IGNORE_UNICODE
+    'No way!'
     """
 
     html = html5lib.parse(data, treebuilder="dom")
@@ -137,6 +145,11 @@ def title(data, default=u"Untitled."):
                 for item in gettext(child):
                     yield item
 
+    try:
+        return unquote(el.attributes["data-title"].value)
+    except (KeyError, AttributeError):
+        pass
+
     while el is not None:  # el.parentNode is None in the very end
 
         visited.append(el)