isso/isso/utils/parse.py


from __future__ import print_function, unicode_literals

from itertools import chain


try:
    from urllib import unquote
except ImportError:
    from urllib.parse import unquote

import html5lib

from isso.compat import map, filter, PY2K

if PY2K:  # http://bugs.python.org/issue12984
    from xml.dom.minidom import NamedNodeMap
    NamedNodeMap.__contains__ = lambda self, key: self.has_key(key)  # noqa


def thread(data, default=u"Untitled.", id=None):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the `isso-thread` id.
    """

    html = html5lib.parse(data, treebuilder="dom")

    assert html.lastChild.nodeName == "html"
    html = html.lastChild

    # aka getElementById, but limited to div and section tags
    el = list(filter(lambda i: i.attributes["id"].value == "isso-thread",
                     filter(lambda i: "id" in i.attributes,
                            chain(*map(html.getElementsByTagName, ("div", "section"))))))

    if not el:
        return id, default

    el = el[0]
    visited = []

    def recurse(node):
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                return recurse(child)

    def gettext(rv):
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    try:
        id = unquote(el.attributes["data-isso-id"].value)
    except (KeyError, AttributeError):
        pass

    try:
        return id, unquote(el.attributes["data-title"].value)
    except (KeyError, AttributeError):
        pass

    while el is not None:  # el.parentNode is None in the very end

        visited.append(el)
        rv = recurse(el)

        if rv:
            return id, ''.join(gettext(rv)).strip()

        el = el.parentNode

    return id, default