from __future__ import print_function, unicode_literals

from itertools import chain

try:
    from urllib import unquote
except ImportError:
    from urllib.parse import unquote

import html5lib

from isso.compat import map, filter, PY2K

if PY2K:  # http://bugs.python.org/issue12984
    from xml.dom.minidom import NamedNodeMap
    NamedNodeMap.__contains__ = lambda self, key: self.has_key(key)  # noqa


def _is_thread_anchor(node):
    """Return True if the DOM element *node* has ``id="isso-thread"``."""
    attrs = node.attributes
    return "id" in attrs and attrs["id"].value == "isso-thread"


def _find_h1(node, visited):
    """
    Return the first H1 element found below *node*, or None.

    Children are scanned in document order. A child that is itself an H1 is
    returned immediately; any other element child is searched depth-first
    unless it is listed in *visited* (subtrees already searched by the
    caller on a previous, lower level).
    """
    for child in node.childNodes:
        if child.nodeType != child.ELEMENT_NODE:
            continue
        if child.nodeName.upper() == "H1":
            return child
        if child not in visited:
            found = _find_h1(child, visited)
            # Only stop when this subtree actually contained an H1;
            # otherwise keep looking at the remaining siblings.
            if found is not None:
                return found
    return None


def _iter_text(node):
    """Yield the value of every text node below *node*, in document order."""
    for child in node.childNodes:
        if child.nodeType == child.TEXT_NODE:
            yield child.nodeValue
        if child.nodeType == child.ELEMENT_NODE:
            # no ``yield from``: this module still supports Python 2
            for item in _iter_text(child):
                yield item


def thread(data, default=u"Untitled.", id=None):
    """
    Extract the thread id and title from a web page.

    The page is searched for a ``div`` or ``section`` element whose ``id``
    attribute is ``isso-thread``. The title is taken from, in order:

    1. the element's ``data-title`` attribute (URL-unquoted), or
    2. the text of the closest H1: the element itself is searched first,
       then its parent, grandparent and so on, skipping subtrees that were
       already searched, or
    3. `default`, if neither is available.

    :param data: HTML markup, handed to ``html5lib.parse``.
    :param default: title returned when none can be determined.
    :param id: id returned unless the thread element has a
        ``data-isso-id`` attribute, whose URL-unquoted value then wins.
    :return: a tuple ``(id, title)``.
    """
    document = html5lib.parse(data, treebuilder="dom")

    # Use the document's root element rather than ``lastChild``: the last
    # child of the document is not necessarily the <html> element (e.g. a
    # comment following </html>).
    root = document.documentElement
    if root is None:  # defensive; nothing to search
        return id, default

    # aka getElementById, but limited to div and section tags
    candidates = chain(*map(root.getElementsByTagName, ("div", "section")))
    anchors = list(filter(_is_thread_anchor, candidates))

    if not anchors:
        return id, default

    el = anchors[0]

    try:
        id = unquote(el.attributes["data-isso-id"].value)
    except (KeyError, AttributeError):
        pass

    try:
        return id, unquote(el.attributes["data-title"].value)
    except (KeyError, AttributeError):
        pass

    visited = []

    while el is not None:  # el.parentNode is None in the very end
        visited.append(el)
        heading = _find_h1(el, visited)

        if heading is not None:
            return id, ''.join(_iter_text(heading)).strip()

        el = el.parentNode

    return id, default