improve doctests and refactor title extraction
This commit is contained in:
parent
89b1ca8846
commit
cb40c7ca42
@ -4,16 +4,17 @@ import sys
|
|||||||
PY2K = sys.version_info[0] == 2
|
PY2K = sys.version_info[0] == 2
|
||||||
|
|
||||||
if not PY2K:
|
if not PY2K:
|
||||||
# iterkeys = lambda d: iter(d.keys())
|
|
||||||
# iteritems = lambda d: iter(d.items())
|
map, zip, filter = map, zip, filter
|
||||||
|
|
||||||
text_type = str
|
text_type = str
|
||||||
string_types = (str, )
|
string_types = (str, )
|
||||||
|
|
||||||
buffer = memoryview
|
buffer = memoryview
|
||||||
else:
|
else:
|
||||||
# iterkeys = lambda d: d.iterkeys()
|
|
||||||
# iteritems = lambda d: d.iteritems()
|
from itertools import imap, izip, ifilter
|
||||||
|
map, zip, filter = imap, izip, ifilter
|
||||||
|
|
||||||
text_type = unicode
|
text_type = unicode
|
||||||
string_types = (str, unicode)
|
string_types = (str, unicode)
|
||||||
|
@ -117,6 +117,9 @@ class Mixin(object):
|
|||||||
|
|
||||||
|
|
||||||
def threaded(func):
|
def threaded(func):
|
||||||
|
"""
|
||||||
|
Decorator to execute each :param func: call in a separate thread.
|
||||||
|
"""
|
||||||
|
|
||||||
def dec(self, *args, **kwargs):
|
def dec(self, *args, **kwargs):
|
||||||
thread.start_new_thread(func, (self, ) + args, kwargs)
|
thread.start_new_thread(func, (self, ) + args, kwargs)
|
||||||
|
@ -15,10 +15,10 @@ def anonymize(remote_addr):
|
|||||||
Anonymize IPv4 and IPv6 :param remote_addr: to /24 (zero'd)
|
Anonymize IPv4 and IPv6 :param remote_addr: to /24 (zero'd)
|
||||||
and /48 (zero'd).
|
and /48 (zero'd).
|
||||||
|
|
||||||
>>> anonymize(u'12.34.56.78')
|
>>> anonymize(u'12.34.56.78') # doctest: +IGNORE_UNICODE
|
||||||
u'12.34.56.0'
|
'12.34.56.0'
|
||||||
>>> anonymize(u'1234:5678:90ab:cdef:fedc:ba09:8765:4321')
|
>>> anonymize(u'1234:5678:90ab:cdef:fedc:ba09:8765:4321') # doctest: +IGNORE_UNICODE
|
||||||
u'1234:5678:90ab:0000:0000:0000:0000:0000'
|
'1234:5678:90ab:0000:0000:0000:0000:0000'
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
ipv4 = ipaddress.IPv4Address(remote_addr)
|
ipv4 = ipaddress.IPv4Address(remote_addr)
|
||||||
|
@ -9,74 +9,17 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
import http.client as httplib
|
import http.client as httplib
|
||||||
|
|
||||||
import html5lib
|
|
||||||
|
|
||||||
from isso.utils import parse
|
from isso.utils import parse
|
||||||
|
|
||||||
|
|
||||||
def urlexists(host, path):
|
def curl(method, host, path, timeout=3):
|
||||||
|
|
||||||
host, port, ssl = parse.host(host)
|
host, port, ssl = parse.host(host)
|
||||||
http = httplib.HTTPSConnection if ssl else httplib.HTTPConnection
|
http = httplib.HTTPSConnection if ssl else httplib.HTTPConnection
|
||||||
|
|
||||||
with closing(http(host, port, timeout=3)) as con:
|
with closing(http(host, port, timeout=timeout)) as con:
|
||||||
try:
|
try:
|
||||||
con.request('HEAD', path)
|
con.request(method, path)
|
||||||
except (httplib.HTTPException, socket.error):
|
except (httplib.HTTPException, socket.error):
|
||||||
return False
|
return None
|
||||||
return con.getresponse().status == 200
|
return con.getresponse()
|
||||||
|
|
||||||
|
|
||||||
def heading(host, path):
|
|
||||||
"""Connect to `host`, GET path and start from #isso-thread to search for
|
|
||||||
a possible heading (h1). Returns `None` if nothing found."""
|
|
||||||
|
|
||||||
host, port, ssl = parse.host(host)
|
|
||||||
http = httplib.HTTPSConnection if ssl else httplib.HTTPConnection
|
|
||||||
|
|
||||||
with closing(http(host, port, timeout=15)) as con:
|
|
||||||
con.request('GET', path)
|
|
||||||
html = html5lib.parse(con.getresponse().read(), treebuilder="dom")
|
|
||||||
|
|
||||||
assert html.lastChild.nodeName == "html"
|
|
||||||
html = html.lastChild
|
|
||||||
|
|
||||||
# aka getElementById
|
|
||||||
el = list(filter(lambda i: i.attributes["id"].value == "isso-thread",
|
|
||||||
filter(lambda i: i.attributes.has_key("id"), html.getElementsByTagName("div"))))
|
|
||||||
|
|
||||||
if not el:
|
|
||||||
return "Untitled"
|
|
||||||
|
|
||||||
el = el[0]
|
|
||||||
visited = []
|
|
||||||
|
|
||||||
def recurse(node):
|
|
||||||
for child in node.childNodes:
|
|
||||||
if child.nodeType != child.ELEMENT_NODE:
|
|
||||||
continue
|
|
||||||
if child.nodeName.upper() == "H1":
|
|
||||||
return child
|
|
||||||
if child not in visited:
|
|
||||||
return recurse(child)
|
|
||||||
|
|
||||||
def gettext(rv):
|
|
||||||
for child in rv.childNodes:
|
|
||||||
if child.nodeType == child.TEXT_NODE:
|
|
||||||
yield child.nodeValue
|
|
||||||
if child.nodeType == child.ELEMENT_NODE:
|
|
||||||
for item in gettext(child):
|
|
||||||
yield item
|
|
||||||
|
|
||||||
while el is not None: # el.parentNode is None in the very end
|
|
||||||
|
|
||||||
visited.append(el)
|
|
||||||
rv = recurse(el)
|
|
||||||
|
|
||||||
if rv:
|
|
||||||
return ''.join(gettext(rv)).strip()
|
|
||||||
|
|
||||||
el = el.parentNode
|
|
||||||
|
|
||||||
return "Untitled."
|
|
||||||
|
|
||||||
|
@ -4,11 +4,21 @@ from __future__ import print_function
|
|||||||
import re
|
import re
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
|
from itertools import chain
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import html5lib
|
||||||
|
|
||||||
|
from isso.compat import map, filter, PY2K
|
||||||
|
|
||||||
|
if PY2K: # http://bugs.python.org/issue12984
|
||||||
|
from xml.dom.minidom import NamedNodeMap
|
||||||
|
NamedNodeMap.__contains__ = lambda self, key: self.has_key(key)
|
||||||
|
|
||||||
|
|
||||||
def timedelta(value):
|
def timedelta(value):
|
||||||
"""
|
"""
|
||||||
@ -63,3 +73,78 @@ def host(name):
|
|||||||
if rv.scheme == 'https' and rv.port is None:
|
if rv.scheme == 'https' and rv.port is None:
|
||||||
return (rv.netloc, 443, True)
|
return (rv.netloc, 443, True)
|
||||||
return (rv.netloc.rsplit(':')[0], rv.port or 80, rv.scheme == 'https')
|
return (rv.netloc.rsplit(':')[0], rv.port or 80, rv.scheme == 'https')
|
||||||
|
|
||||||
|
|
||||||
|
def title(data, default=u"Untitled."):
|
||||||
|
"""
|
||||||
|
Extract <h1> title from web page. The title is *probably* the text node,
|
||||||
|
which is the nearest H1 node in context to an element with the `isso-thread` id.
|
||||||
|
|
||||||
|
>>> title("asdf") # doctest: +IGNORE_UNICODE
|
||||||
|
u'Untitled.'
|
||||||
|
>>> title('''
|
||||||
|
... <html>
|
||||||
|
... <head>
|
||||||
|
... <title>Foo!</title>
|
||||||
|
... </head>
|
||||||
|
... <body>
|
||||||
|
... <header>
|
||||||
|
... <h1>generic website title.</h1>
|
||||||
|
... <h2>subtile title.</h2>
|
||||||
|
... </header>
|
||||||
|
... <article>
|
||||||
|
... <header>
|
||||||
|
... <h1>Can you find me?</h1>
|
||||||
|
... </header>
|
||||||
|
... <section id="isso-thread">
|
||||||
|
... </section>
|
||||||
|
... </article>
|
||||||
|
... </body>
|
||||||
|
... </html>''') # doctest: +IGNORE_UNICODE
|
||||||
|
u'Can you find me?'
|
||||||
|
"""
|
||||||
|
|
||||||
|
html = html5lib.parse(data, treebuilder="dom")
|
||||||
|
|
||||||
|
assert html.lastChild.nodeName == "html"
|
||||||
|
html = html.lastChild
|
||||||
|
|
||||||
|
# aka getElementById
|
||||||
|
el = list(filter(lambda i: i.attributes["id"].value == "isso-thread",
|
||||||
|
filter(lambda i: "id" in i.attributes,
|
||||||
|
chain(*map(html.getElementsByTagName, ("div", "section"))))))
|
||||||
|
|
||||||
|
if not el:
|
||||||
|
return default
|
||||||
|
|
||||||
|
el = el[0]
|
||||||
|
visited = []
|
||||||
|
|
||||||
|
def recurse(node):
|
||||||
|
for child in node.childNodes:
|
||||||
|
if child.nodeType != child.ELEMENT_NODE:
|
||||||
|
continue
|
||||||
|
if child.nodeName.upper() == "H1":
|
||||||
|
return child
|
||||||
|
if child not in visited:
|
||||||
|
return recurse(child)
|
||||||
|
|
||||||
|
def gettext(rv):
|
||||||
|
for child in rv.childNodes:
|
||||||
|
if child.nodeType == child.TEXT_NODE:
|
||||||
|
yield child.nodeValue
|
||||||
|
if child.nodeType == child.ELEMENT_NODE:
|
||||||
|
for item in gettext(child):
|
||||||
|
yield item
|
||||||
|
|
||||||
|
while el is not None: # el.parentNode is None in the very end
|
||||||
|
|
||||||
|
visited.append(el)
|
||||||
|
rv = recurse(el)
|
||||||
|
|
||||||
|
if rv:
|
||||||
|
return ''.join(gettext(rv)).strip()
|
||||||
|
|
||||||
|
el = el.parentNode
|
||||||
|
|
||||||
|
return default
|
||||||
|
@ -6,7 +6,6 @@ import time
|
|||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import logging
|
|
||||||
|
|
||||||
from itsdangerous import SignatureExpired, BadSignature
|
from itsdangerous import SignatureExpired, BadSignature
|
||||||
|
|
||||||
@ -16,7 +15,7 @@ from werkzeug.exceptions import abort, BadRequest
|
|||||||
from isso.compat import text_type as str
|
from isso.compat import text_type as str
|
||||||
|
|
||||||
from isso import utils, notify, db
|
from isso import utils, notify, db
|
||||||
from isso.utils import http
|
from isso.utils import http, parse
|
||||||
from isso.crypto import pbkdf2
|
from isso.crypto import pbkdf2
|
||||||
|
|
||||||
logger = logging.getLogger("isso")
|
logger = logging.getLogger("isso")
|
||||||
@ -50,9 +49,6 @@ class requires:
|
|||||||
@requires(str, 'uri')
|
@requires(str, 'uri')
|
||||||
def new(app, environ, request, uri):
|
def new(app, environ, request, uri):
|
||||||
|
|
||||||
if uri not in app.db.threads and not http.urlexists(app.conf.get('general', 'host'), uri):
|
|
||||||
return Response('URI does not exist', 404)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(request.get_data().decode('utf-8'))
|
data = json.loads(request.get_data().decode('utf-8'))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
@ -74,11 +70,23 @@ def new(app, environ, request, uri):
|
|||||||
data['mode'] = (app.conf.getboolean('moderation', 'enabled') and 2) or 1
|
data['mode'] = (app.conf.getboolean('moderation', 'enabled') and 2) or 1
|
||||||
data['remote_addr'] = utils.anonymize(str(request.remote_addr))
|
data['remote_addr'] = utils.anonymize(str(request.remote_addr))
|
||||||
|
|
||||||
|
# extract site's <h1> title
|
||||||
|
if uri not in app.db.threads:
|
||||||
|
for host in app.conf.getiter('general', 'host'):
|
||||||
|
resp = http.curl('HEAD', host, uri)
|
||||||
|
if resp and resp.status == 200:
|
||||||
|
title = parse.title(resp.read())
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
return Response('URI does not exist', 404)
|
||||||
|
else:
|
||||||
|
title = app.db.threads[uri].title
|
||||||
|
|
||||||
with app.lock:
|
with app.lock:
|
||||||
if uri not in app.db.threads:
|
if uri not in app.db.threads:
|
||||||
app.db.threads.new(uri, http.heading(app.conf.get('general', 'host'), uri))
|
app.db.threads.new(uri, title)
|
||||||
logger.info('new thread: %s -> %s', uri, app.db.threads[uri].title)
|
|
||||||
title = app.db.threads[uri].title
|
logger.info('new thread: %s -> %s', uri, title)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with app.lock:
|
with app.lock:
|
||||||
|
@ -18,8 +18,14 @@ from isso import Isso, notify, views, core
|
|||||||
from isso.utils import http
|
from isso.utils import http
|
||||||
from isso.views import comment
|
from isso.views import comment
|
||||||
|
|
||||||
http.heading = lambda *args: "Untitled."
|
class Dummy:
|
||||||
http.urlexists = lambda *args: True
|
|
||||||
|
status = 200
|
||||||
|
|
||||||
|
def read(self):
|
||||||
|
return ''
|
||||||
|
|
||||||
|
http.curl = lambda method, host, path: Dummy()
|
||||||
|
|
||||||
loads = lambda data: json.loads(data.decode('utf-8'))
|
loads = lambda data: json.loads(data.decode('utf-8'))
|
||||||
|
|
||||||
|
@ -10,9 +10,16 @@ from werkzeug.test import Client
|
|||||||
from werkzeug.wrappers import Response
|
from werkzeug.wrappers import Response
|
||||||
|
|
||||||
from isso import Isso, notify, utils, core
|
from isso import Isso, notify, utils, core
|
||||||
|
from isso.utils import http
|
||||||
|
|
||||||
utils.heading = lambda *args: "Untitled."
|
class Dummy:
|
||||||
utils.urlexists = lambda *args: True
|
|
||||||
|
status = 200
|
||||||
|
|
||||||
|
def read(self):
|
||||||
|
return ''
|
||||||
|
|
||||||
|
http.curl = lambda method, host, path: Dummy()
|
||||||
|
|
||||||
loads = lambda data: json.loads(data.decode('utf-8'))
|
loads = lambda data: json.loads(data.decode('utf-8'))
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user