diff --git a/isso/compat.py b/isso/compat.py index 888afe0..772d561 100644 --- a/isso/compat.py +++ b/isso/compat.py @@ -4,16 +4,17 @@ import sys PY2K = sys.version_info[0] == 2 if not PY2K: - # iterkeys = lambda d: iter(d.keys()) - # iteritems = lambda d: iter(d.items()) + + map, zip, filter = map, zip, filter text_type = str string_types = (str, ) buffer = memoryview else: - # iterkeys = lambda d: d.iterkeys() - # iteritems = lambda d: d.iteritems() + + from itertools import imap, izip, ifilter + map, zip, filter = imap, izip, ifilter text_type = unicode string_types = (str, unicode) diff --git a/isso/core.py b/isso/core.py index f4721a8..e1636e1 100644 --- a/isso/core.py +++ b/isso/core.py @@ -117,6 +117,9 @@ class Mixin(object): def threaded(func): + """ + Decorator to execute each :param func: call in a separate thread. + """ def dec(self, *args, **kwargs): thread.start_new_thread(func, (self, ) + args, kwargs) diff --git a/isso/utils/__init__.py b/isso/utils/__init__.py index cdcf6ad..12216e7 100644 --- a/isso/utils/__init__.py +++ b/isso/utils/__init__.py @@ -15,10 +15,10 @@ def anonymize(remote_addr): Anonymize IPv4 and IPv6 :param remote_addr: to /24 (zero'd) and /48 (zero'd). - >>> anonymize(u'12.34.56.78') - u'12.34.56.0' - >>> anonymize(u'1234:5678:90ab:cdef:fedc:ba09:8765:4321') - u'1234:5678:90ab:0000:0000:0000:0000:0000' + >>> anonymize(u'12.34.56.78') # doctest: +IGNORE_UNICODE + '12.34.56.0' + >>> anonymize(u'1234:5678:90ab:cdef:fedc:ba09:8765:4321') # doctest: +IGNORE_UNICODE + '1234:5678:90ab:0000:0000:0000:0000:0000' """ try: ipv4 = ipaddress.IPv4Address(remote_addr) diff --git a/isso/utils/http.py b/isso/utils/http.py index 15cdb86..1237cc7 100644 --- a/isso/utils/http.py +++ b/isso/utils/http.py @@ -9,74 +9,17 @@ try: except ImportError: import http.client as httplib -import html5lib - from isso.utils import parse -def urlexists(host, path): +def curl(method, host, path, timeout=3): host, port, ssl = parse.host(host) http = httplib.HTTPSConnection if ssl else httplib.HTTPConnection - with closing(http(host, port, timeout=3)) as con: + with closing(http(host, port, timeout=timeout)) as con: try: - con.request('HEAD', path) + con.request(method, path) except (httplib.HTTPException, socket.error): - return False - return con.getresponse().status == 200 - - -def heading(host, path): - """Connect to `host`, GET path and start from #isso-thread to search for - a possible heading (h1). Returns `None` if nothing found.""" - - host, port, ssl = parse.host(host) - http = httplib.HTTPSConnection if ssl else httplib.HTTPConnection - - with closing(http(host, port, timeout=15)) as con: - con.request('GET', path) - html = html5lib.parse(con.getresponse().read(), treebuilder="dom") - - assert html.lastChild.nodeName == "html" - html = html.lastChild - - # aka getElementById - el = list(filter(lambda i: i.attributes["id"].value == "isso-thread", - filter(lambda i: i.attributes.has_key("id"), html.getElementsByTagName("div")))) - - if not el: - return "Untitled" - - el = el[0] - visited = [] - - def recurse(node): - for child in node.childNodes: - if child.nodeType != child.ELEMENT_NODE: - continue - if child.nodeName.upper() == "H1": - return child - if child not in visited: - return recurse(child) - - def gettext(rv): - for child in rv.childNodes: - if child.nodeType == child.TEXT_NODE: - yield child.nodeValue - if child.nodeType == child.ELEMENT_NODE: - for item in gettext(child): - yield item - - while el is not None: # el.parentNode is None in the very end - - visited.append(el) - rv = recurse(el) - - if rv: - return ''.join(gettext(rv)).strip() - - el = el.parentNode - - return "Untitled." - + return None + return con.getresponse() diff --git a/isso/utils/parse.py b/isso/utils/parse.py index 42f79f9..6af6425 100644 --- a/isso/utils/parse.py +++ b/isso/utils/parse.py @@ -4,11 +4,21 @@ from __future__ import print_function import re import datetime +from itertools import chain + try: from urlparse import urlparse except ImportError: from urllib.parse import urlparse +import html5lib + +from isso.compat import map, filter, PY2K + +if PY2K: # http://bugs.python.org/issue12984 + from xml.dom.minidom import NamedNodeMap + NamedNodeMap.__contains__ = lambda self, key: self.has_key(key) + def timedelta(value): """ @@ -63,3 +73,78 @@ def host(name): if rv.scheme == 'https' and rv.port is None: return (rv.netloc, 443, True) return (rv.netloc.rsplit(':')[0], rv.port or 80, rv.scheme == 'https') + + +def title(data, default=u"Untitled."): + """ + Extract