improve doctests and refactor title extraction

2013-10-29 12:01:45 +01:00 · 2013-10-29 12:01:45 +01:00 · cb40c7ca42
commit cb40c7ca42
parent 89b1ca8846
9 changed files with 137 additions and 83 deletions
--- a/isso/compat.py
+++ b/isso/compat.py
@ -4,16 +4,17 @@ import sys
 PY2K = sys.version_info[0] == 2
 if not PY2K:
-    # iterkeys = lambda d: iter(d.keys())
+
-    # iteritems = lambda d: iter(d.items())
+    map, zip, filter = map, zip, filter
    text_type = str
    string_types = (str, )
    buffer = memoryview
 else:
-    # iterkeys = lambda d: d.iterkeys()
+
-    # iteritems = lambda d: d.iteritems()
+    from itertools import imap, izip, ifilter
    map, zip, filter = imap, izip, ifilter
    text_type = unicode
    string_types = (str, unicode)
--- a/isso/core.py
+++ b/isso/core.py
@ -117,6 +117,9 @@ class Mixin(object):
 def threaded(func):
    """
    Decorator to execute each :param func: call in a separate thread.
    """
    def dec(self, *args, **kwargs):
        thread.start_new_thread(func, (self, ) + args, kwargs)
--- a/isso/utils/init.py
+++ b/isso/utils/init.py
@ -15,10 +15,10 @@ def anonymize(remote_addr):
    Anonymize IPv4 and IPv6 :param remote_addr: to /24 (zero'd)
    and /48 (zero'd).
-    >>> anonymize(u'12.34.56.78')
+    >>> anonymize(u'12.34.56.78')  # doctest: +IGNORE_UNICODE
-    u'12.34.56.0'
+    '12.34.56.0'
-    >>> anonymize(u'1234:5678:90ab:cdef:fedc:ba09:8765:4321')
+    >>> anonymize(u'1234:5678:90ab:cdef:fedc:ba09:8765:4321') # doctest: +IGNORE_UNICODE
-    u'1234:5678:90ab:0000:0000:0000:0000:0000'
+    '1234:5678:90ab:0000:0000:0000:0000:0000'
    """
    try:
        ipv4 = ipaddress.IPv4Address(remote_addr)
--- a/isso/utils/http.py
+++ b/isso/utils/http.py
@ -9,74 +9,17 @@ try:
 except ImportError:
    import http.client as httplib
 import html5lib
 from isso.utils import parse
-def urlexists(host, path):
+def curl(method, host, path, timeout=3):
    host, port, ssl = parse.host(host)
    http = httplib.HTTPSConnection if ssl else httplib.HTTPConnection
-    with closing(http(host, port, timeout=3)) as con:
+    with closing(http(host, port, timeout=timeout)) as con:
        try:
-            con.request('HEAD', path)
+            con.request(method, path)
        except (httplib.HTTPException, socket.error):
-            return False
+            return None
-        return con.getresponse().status == 200
+        return con.getresponse()
 def heading(host, path):
    """Connect to `host`, GET `path` and start from #isso-thread to search
    for a possible heading (h1).

    Starting at the `div` with id `isso-thread`, each ancestor's subtree is
    searched in document order (subtrees that were already searched are
    skipped) until an H1 element is found.

    :param host: host specification, split by `parse.host` into host, port
                 and an SSL flag.
    :param path: request path passed to the GET request.
    :return: the stripped text content of the H1 found, or "Untitled." if
             the page has no `isso-thread` div or no H1 can be found.
    """
    host, port, ssl = parse.host(host)
    http = httplib.HTTPSConnection if ssl else httplib.HTTPConnection
    with closing(http(host, port, timeout=15)) as con:
        con.request('GET', path)
        html = html5lib.parse(con.getresponse().read(), treebuilder="dom")
    assert html.lastChild.nodeName == "html"
    html = html.lastChild
    # aka getElementById; getAttribute returns "" for a missing attribute,
    # so no separate "has this attribute" check is needed
    el = [div for div in html.getElementsByTagName("div")
          if div.getAttribute("id") == "isso-thread"]
    if not el:
        return "Untitled."
    el = el[0]
    visited = []
    def recurse(node):
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                # keep scanning the remaining siblings when this subtree
                # does not contain an H1
                found = recurse(child)
                if found is not None:
                    return found
        return None
    def gettext(rv):
        # yield every text node below `rv`, depth first
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item
    while el is not None:  # el.parentNode is None in the very end
        visited.append(el)
        rv = recurse(el)
        if rv is not None:
            return ''.join(gettext(rv)).strip()
        el = el.parentNode
    return "Untitled."
--- a/isso/utils/parse.py
+++ b/isso/utils/parse.py
@ -4,11 +4,21 @@ from __future__ import print_function
 import re
 import datetime
 from itertools import chain
 try:
    from urlparse import urlparse
 except ImportError:
    from urllib.parse import urlparse
 import html5lib
 from isso.compat import map, filter, PY2K
 if PY2K:  # http://bugs.python.org/issue12984
    from xml.dom.minidom import NamedNodeMap
    NamedNodeMap.__contains__ = lambda self, key: self.has_key(key)
 def timedelta(value):
    """
@ -63,3 +73,78 @@ def host(name):
    if rv.scheme == 'https' and rv.port is None:
        return (rv.netloc, 443, True)
    return (rv.netloc.rsplit(':')[0], rv.port or 80, rv.scheme == 'https')
 def title(data, default=u"Untitled."):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the `isso-thread` id.

    Starting at the `div` or `section` with id `isso-thread`, each ancestor's
    subtree is searched in document order (subtrees that were already
    searched are skipped) until an H1 element is found.

    :param data: HTML markup handed to `html5lib.parse`.
    :param default: value returned when there is no `isso-thread` element or
                    no H1 can be found.
    :return: the stripped text content of the H1 found, or `default`.

    >>> title("asdf")  # doctest: +IGNORE_UNICODE
    u'Untitled.'
    >>> title('''
    ... <html>
    ... <head>
    ...     <title>Foo!</title>
    ... </head>
    ... <body>
    ...     <header>
    ...         <h1>generic website title.</h1>
    ...         <h2>subtile title.</h2>
    ...     </header>
    ...     <article>
    ...         <header>
    ...             <h1>Can you find me?</h1>
    ...         </header>
    ...         <section id="isso-thread">
    ...         </section>
    ...     </article>
    ... </body>
    ... </html>''')  # doctest: +IGNORE_UNICODE
    u'Can you find me?'
    >>> title('''
    ... <html>
    ... <body>
    ...     <article>
    ...         <p>An introduction without any heading.</p>
    ...         <h1>Found after a sibling.</h1>
    ...         <div id="isso-thread"></div>
    ...     </article>
    ... </body>
    ... </html>''')  # doctest: +IGNORE_UNICODE
    u'Found after a sibling.'
    """
    html = html5lib.parse(data, treebuilder="dom")
    assert html.lastChild.nodeName == "html"
    html = html.lastChild
    # aka getElementById; all divs are considered before all sections.
    # getAttribute returns "" for a missing attribute, so no separate
    # "has this attribute" check is needed
    el = [candidate
          for tag in ("div", "section")
          for candidate in html.getElementsByTagName(tag)
          if candidate.getAttribute("id") == "isso-thread"]
    if not el:
        return default
    el = el[0]
    visited = []
    def recurse(node):
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                # keep scanning the remaining siblings when this subtree
                # does not contain an H1
                found = recurse(child)
                if found is not None:
                    return found
        return None
    def gettext(rv):
        # yield every text node below `rv`, depth first
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item
    while el is not None:  # el.parentNode is None in the very end
        visited.append(el)
        rv = recurse(el)
        if rv is not None:
            return ''.join(gettext(rv)).strip()
        el = el.parentNode
    return default
--- a/isso/views/comment.py
+++ b/isso/views/comment.py
@ -6,7 +6,6 @@ import time
 import hashlib
 import logging
 import sqlite3
 import logging
 from itsdangerous import SignatureExpired, BadSignature
@ -16,7 +15,7 @@ from werkzeug.exceptions import abort, BadRequest
 from isso.compat import text_type as str
 from isso import utils, notify, db
-from isso.utils import http
+from isso.utils import http, parse
 from isso.crypto import pbkdf2
 logger = logging.getLogger("isso")
@ -50,9 +49,6 @@ class requires:
@requires(str, 'uri')
 def new(app, environ, request, uri):
    if uri not in app.db.threads and not http.urlexists(app.conf.get('general', 'host'), uri):
        return Response('URI does not exist', 404)
    try:
        data = json.loads(request.get_data().decode('utf-8'))
    except ValueError:
@ -74,11 +70,23 @@ def new(app, environ, request, uri):
    data['mode'] = (app.conf.getboolean('moderation', 'enabled') and 2) or 1
    data['remote_addr'] = utils.anonymize(str(request.remote_addr))
    # extract site's <h1> title
    if uri not in app.db.threads:
        for host in app.conf.getiter('general', 'host'):
            resp = http.curl('HEAD', host, uri)
            if resp and resp.status == 200:
                title = parse.title(resp.read())
                break
        else:
            return Response('URI does not exist', 404)
    else:
        title = app.db.threads[uri].title
    with app.lock:
        if uri not in app.db.threads:
-            app.db.threads.new(uri, http.heading(app.conf.get('general', 'host'), uri))
+            app.db.threads.new(uri, title)
-            logger.info('new thread: %s -> %s', uri, app.db.threads[uri].title)
+
-    title = app.db.threads[uri].title
+    logger.info('new thread: %s -> %s', uri, title)
    try:
        with app.lock:
--- a/specs/test_comment.py
+++ b/specs/test_comment.py
@ -18,8 +18,14 @@ from isso import Isso, notify, views, core
 from isso.utils import http
 from isso.views import comment
-http.heading = lambda *args: "Untitled."
+class Dummy:
-http.urlexists = lambda *args: True
+
    status = 200
    def read(self):
        return ''
 http.curl = lambda method, host, path: Dummy()
 loads = lambda data: json.loads(data.decode('utf-8'))
--- a/specs/test_vote.py
+++ b/specs/test_vote.py
@ -10,9 +10,16 @@ from werkzeug.test import Client
 from werkzeug.wrappers import Response
 from isso import Isso, notify, utils, core
 from isso.utils import http
-utils.heading = lambda *args: "Untitled."
+class Dummy:
-utils.urlexists = lambda *args: True
+
    status = 200
    def read(self):
        return ''
 http.curl = lambda method, host, path: Dummy()
 loads = lambda data: json.loads(data.decode('utf-8'))
--- a/tox.ini
+++ b/tox.ini
@ -15,6 +15,7 @@ deps =
 deps =
    nose
    ipaddress
    doctest-ignore-unicode
 commands=
-    nosetests --with-doctest isso/
+    nosetests --with-doctest --with-doctest-ignore-unicode isso/
    nosetests specs/