You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
164 lines
4.4 KiB
164 lines
4.4 KiB
|
|
from __future__ import print_function
|
|
|
|
import re
|
|
import datetime
|
|
|
|
from itertools import chain
|
|
|
|
try:
|
|
from urllib import unquote
|
|
from urlparse import urlparse
|
|
except ImportError:
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
import html5lib
|
|
|
|
from isso.compat import map, filter, PY2K
|
|
|
|
if PY2K:
    # Python 2's minidom NamedNodeMap lacks ``__contains__``, so the ``in``
    # operator does not work on element attributes there; see
    # http://bugs.python.org/issue12984. Delegate to ``has_key`` instead.
    from xml.dom.minidom import NamedNodeMap

    def _namednodemap_contains(self, key):
        """Return whether the attribute map has an entry for `key`."""
        return self.has_key(key)

    NamedNodeMap.__contains__ = _namednodemap_contains
|
|
|
|
|
|
def timedelta(value):
    """
    Parse :param value: into :class:`datetime.timedelta`, you can use any (logical)
    combination of Nw, Nd, Nh, Nm and Ns, e.g. `1h30m` for 1 hour, 30 minutes or
    `3w` for 3 weeks. Units must appear in descending order (weeks, days, hours,
    minutes, seconds) and may be separated by a single space.

    Raises a ValueError if the input is invalid/unparseable, which includes
    input with unrecognized trailing text and input that adds up to a zero
    duration.

    >>> print(timedelta("3w"))
    21 days, 0:00:00
    >>> print(timedelta("3w 12h 57m"))
    21 days, 12:57:00
    >>> print(timedelta("1h30m37s"))
    1:30:37
    >>> print(timedelta("1asdf3w"))
    Traceback (most recent call last):
        ...
    ValueError: invalid human-readable timedelta
    >>> print(timedelta("3w asdf"))
    Traceback (most recent call last):
        ...
    ValueError: invalid human-readable timedelta
    """

    keys = ["weeks", "days", "hours", "minutes", "seconds"]

    # One optional "<digits><first letter of unit>" group per unit, in order.
    # Raw string so that ``\d`` is not treated as a (invalid) string escape.
    regex = "".join([r"((?P<%s>\d+)%s ?)?" % (k, k[0]) for k in keys])

    # Every group is optional, so an unanchored match succeeds on anything and
    # silently drops whatever it could not parse. Anchor the end (tolerating
    # trailing whitespace) so that leftovers are reported as invalid.
    match = re.match(regex + r"\s*\Z", value)
    if match is None:
        raise ValueError("invalid human-readable timedelta")

    kwargs = {}
    for k, v in match.groupdict(default="0").items():
        kwargs[k] = int(v)

    rv = datetime.timedelta(**kwargs)
    # An empty duration means that no unit was given at all (or only zeros).
    if rv == datetime.timedelta():
        raise ValueError("invalid human-readable timedelta")
    return rv
|
|
|
|
|
|
def host(name):
    """
    Parse :param name: into `httplib`-compatible host:port.

    Returns a tuple ``(hostname, port, secure)`` where `secure` is True for
    https URLs. A missing scheme is treated as http. A missing port defaults
    to 443 for https and 80 otherwise. Userinfo (``user:password@``) is not
    part of the returned hostname, and bracketed IPv6 literals keep their
    brackets.

    >>> host("http://example.tld/")
    ('example.tld', 80, False)
    >>> host("https://example.tld/")
    ('example.tld', 443, True)
    >>> host("example.tld")
    ('example.tld', 80, False)
    >>> host("example.tld:42")
    ('example.tld', 42, False)
    >>> host("https://example.tld:80/")
    ('example.tld', 80, True)
    >>> host("http://[::1]:8080/")
    ('[::1]', 8080, False)
    """

    if not name.startswith(('http://', 'https://')):
        name = 'http://' + name

    rv = urlparse(name)
    secure = rv.scheme == 'https'

    # Drop optional userinfo; everything after the last "@" is host[:port].
    netloc = rv.netloc.rpartition('@')[2]

    if netloc.startswith('['):
        # Bracketed IPv6 literal: the colons inside the brackets belong to
        # the address, so cut at the closing bracket rather than at a colon.
        hostname = netloc.partition(']')[0] + ']'
    else:
        hostname = netloc.split(':', 1)[0]

    port = rv.port
    if port is None and secure:
        port = 443

    return (hostname, port or 80, secure)
|
|
|
|
|
|
def title(data, default=u"Untitled."):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the `isso-thread` id.

    The lookup works as follows: find the first <div> or <section> with
    ``id="isso-thread"``. If it carries a `data-title` attribute, return its
    URL-unquoted value. Otherwise search the element's subtree for an <h1>,
    then its parent's subtree, and so on up to the document. The text of the
    first <h1> found is returned (stripped). If there is no such element or
    no <h1>, :param default: is returned.

    >>> title("asdf")  # doctest: +IGNORE_UNICODE
    'Untitled.'
    >>> title('''
    ... <html>
    ... <head>
    ...     <title>Foo!</title>
    ... </head>
    ... <body>
    ...     <header>
    ...         <h1>generic website title.</h1>
    ...         <h2>subtile title.</h2>
    ...     </header>
    ...     <article>
    ...         <header>
    ...             <h1>Can you find me?</h1>
    ...         </header>
    ...         <section id="isso-thread">
    ...         </section>
    ...     </article>
    ... </body>
    ... </html>''')  # doctest: +IGNORE_UNICODE
    'Can you find me?'
    >>> title('''
    ... <html>
    ... <body>
    ... <h1>I'm the real title!1
    ... <section data-title="No way%21" id="isso-thread">
    ... ''')  # doctest: +IGNORE_UNICODE
    'No way!'
    """

    document = html5lib.parse(data, treebuilder="dom")

    # Do not assume the <html> element is the document's last child: per the
    # HTML parsing algorithm a comment that follows </html> is attached to the
    # document itself. Pick the first element child instead of asserting.
    root = None
    for node in document.childNodes:
        if node.nodeType == node.ELEMENT_NODE:
            root = node
            break

    if root is None:
        return default

    # aka getElementById, but limited to div and section tags (divs first)
    threads = [node
               for tag in ("div", "section")
               for node in root.getElementsByTagName(tag)
               if node.getAttribute("id") == "isso-thread"]

    if not threads:
        return default

    el = threads[0]

    # An explicit data-title always wins over any heading.
    if el.hasAttribute("data-title"):
        return unquote(el.getAttribute("data-title"))

    # Nodes whose subtree has already been searched on the way up.
    visited = []

    def recurse(node):
        """Return the first <h1> below `node` in document order, or None."""
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                found = recurse(child)
                # Keep looking through the remaining siblings when this
                # subtree has no <h1>, instead of giving up after the first.
                if found is not None:
                    return found
        return None

    def gettext(rv):
        """Yield the text of all text nodes below `rv` in document order."""
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    while el is not None:  # el.parentNode is None in the very end

        visited.append(el)
        rv = recurse(el)

        if rv:
            return ''.join(gettext(rv)).strip()

        el = el.parentNode

    return default
|