You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
161 lines
4.4 KiB
161 lines
4.4 KiB
# -*- encoding: utf-8 -*-
|
|
|
|
from __future__ import division
|
|
|
|
import socket
|
|
import httplib
|
|
|
|
import random
|
|
import hashlib
|
|
|
|
from string import ascii_letters, digits
|
|
from urlparse import urlparse
|
|
from contextlib import closing
|
|
|
|
import html5lib
|
|
import ipaddress
|
|
|
|
|
|
def normalize(host):
    """Turn `host` into a ``(netloc, port)`` tuple.

    A value without an explicit ``http://`` or ``https://`` prefix is
    treated as HTTPS.  HTTPS always yields port 443 together with the
    unmodified network location; plain HTTP yields the part of the network
    location before the first colon and the explicit port, or 80 when the
    URL names none.
    """
    url = host if host.startswith(('http://', 'https://')) else 'https://' + host
    parts = urlparse(url)

    if parts.scheme != 'https':
        hostname = parts.netloc.rsplit(':')[0]
        return (hostname, parts.port or 80)

    # Port 443 is what the callers in this module use to pick HTTPS.
    return (parts.netloc, 443)
|
|
|
|
|
|
def urlexists(host, path):
    """Return True if a HEAD request for `path` on `host` is answered with 200.

    `host` is passed through `normalize`; a resulting port of 443 selects an
    HTTPS connection, anything else plain HTTP.  Any HTTP protocol error or
    socket error (including the 3 second timeout) -- whether raised while
    sending the request or while reading the response -- yields False.
    """
    host, port = normalize(host)
    http = httplib.HTTPSConnection if port == 443 else httplib.HTTPConnection

    with closing(http(host, port, timeout=3)) as con:
        try:
            con.request('HEAD', path)
            # getresponse() can fail just like request() (timeout, bad status
            # line, connection reset), so it must be guarded as well.
            return con.getresponse().status == 200
        except (httplib.HTTPException, socket.error):
            return False
|
|
|
|
|
|
def heading(host, path):
    """Connect to `host`, GET `path` and start from #isso-thread to search for
    a possible heading (h1). Returns `None` if nothing found.

    The search starts inside the ``<div id="isso-thread">`` element and, if
    that subtree holds no ``<h1>``, widens to the parent, grand-parent and so
    on up to the document root.  Subtrees that were already searched are
    skipped.  The text content of the first heading found is returned with
    surrounding whitespace stripped.
    """

    host, port = normalize(host)
    http = httplib.HTTPSConnection if port == 443 else httplib.HTTPConnection

    with closing(http(host, port, timeout=15)) as con:
        con.request('GET', path)
        html = html5lib.parse(con.getresponse().read(), treebuilder="dom")

    assert html.lastChild.nodeName == "html"
    html = html.lastChild

    # aka getElementById; getAttribute() returns "" for a missing attribute,
    # so divs without an id simply do not match.
    el = next((div for div in html.getElementsByTagName("div")
               if div.getAttribute("id") == "isso-thread"), None)

    if el is None:
        return None

    visited = []

    def recurse(node):
        """Depth-first search below `node` for the first <h1> element."""
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                found = recurse(child)
                # Only stop when something was found; otherwise keep
                # looking at the remaining siblings.
                if found is not None:
                    return found
        return None

    def gettext(rv):
        """Yield every text fragment below `rv` in document order."""
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    while el is not None:  # el.parentNode is None in the very end

        visited.append(el)
        rv = recurse(el)

        if rv is not None:
            return ''.join(gettext(rv)).strip()

        el = el.parentNode

    return None
|
|
|
|
|
|
def anonymize(remote_addr):
    """Anonymize an IPv4 or IPv6 address by zeroing its host part.

    IPv4 addresses are truncated to /24 (last octet becomes ``0``).
    IPv6 addresses are truncated to /48: the first three groups are kept and
    the remaining five are zeroed, giving a full eight-group address.
    IPv4-mapped IPv6 addresses (``::ffff:a.b.c.d``) are treated as IPv4.

    :param remote_addr: textual IP address
    :return: the anonymized address as a string
    :raises ipaddress.AddressValueError: if `remote_addr` is neither a valid
        IPv4 nor a valid IPv6 address
    """
    try:
        ipv4 = ipaddress.IPv4Address(remote_addr)
    except ipaddress.AddressValueError:
        ipv6 = ipaddress.IPv6Address(remote_addr)
        if ipv6.ipv4_mapped is None:
            # exploded form always has 8 groups: keep 3, zero the other 5
            return ipv6.exploded.rsplit(':', 5)[0] + ':' + ':'.join(['0000'] * 5)
        ipv4 = ipv6.ipv4_mapped

    return ipv4.exploded.rsplit('.', 1)[0] + '.0'
|
|
|
|
|
|
def salt(value, s=u'\x082@t9*\x17\xad\xc1\x1c\xa5\x98'):
    """Return the hex SHA-1 digest of `value` with the salt `s` appended.

    The concatenated text is encoded as UTF-8 before hashing.
    """
    digest = hashlib.sha1()
    digest.update((value + s).encode('utf-8'))
    return digest.hexdigest()
|
|
|
|
|
|
def mksecret(length):
    """Return a random string of `length` ASCII letters and digits.

    Characters are drawn through `random.SystemRandom`, i.e. from the
    operating system's entropy source, because the module-level generator
    in `random` is a deterministic PRNG and unsuitable for secrets.
    """
    rng = random.SystemRandom()
    alphabet = ascii_letters + digits
    return ''.join(rng.choice(alphabet) for _ in range(length))
|
|
|
|
|
|
class Bloomfilter(object):
    """A space-efficient probabilistic data structure. False-positive rate:

        * 1e-05 for  <80 elements
        * 1e-04 for <105 elements
        * 1e-03 for <142 elements

    Uses a 256 byte array (2048 bits) and 11 hash functions. 256 byte because
    of space efficiency (array is saved for each comment) and 11 hash functions
    because of best overall false-positive rate in that range.

    -- via Raymond Hettinger
       http://code.activestate.com/recipes/577684-bloom-filter/
    """

    def __init__(self, array=None, elements=0, iterable=()):
        """Create a filter.

        :param array: existing bit array to wrap (used as-is, not copied);
            a fresh zeroed 256 byte array is created when omitted
        :param elements: number of elements already recorded in `array`
        :param iterable: keys to add right away
        """
        # A `bytearray(256)` default in the signature would be created once
        # and shared by every filter constructed without an explicit array.
        self.array = bytearray(256) if array is None else array
        self.elements = elements
        self.k = 11
        self.m = len(self.array) * 8

        for item in iterable:
            self.add(item)

    def get_probes(self, key):
        """Yield the `k` bit positions that `key` maps to.

        The positions are successive slices of the SHA-256 digest of the
        UTF-8 encoded key.  The mask ``m - 1`` only maps onto valid bit
        positions when the bit count `m` is a power of two.
        """
        h = int(hashlib.sha256(key.encode('utf-8')).hexdigest(), 16)
        mask = self.m - 1
        for _ in range(self.k):
            yield h & mask
            h >>= self.k

    def add(self, key):
        """Set the bits for `key` and increase the element counter."""
        for i in self.get_probes(key):
            self.array[i // 8] |= 1 << (i % 8)
        self.elements += 1

    @property
    def density(self):
        """Fraction of bits in the array that are set (0.0 to 1.0)."""
        ones = sum(bin(byte).count('1') for byte in self.array)
        return ones / float(len(self.array) * 8)

    def __contains__(self, key):
        """Return True if `key` may have been added, False if definitely not."""
        return all(self.array[i // 8] & (1 << (i % 8)) for i in self.get_probes(key))

    def __len__(self):
        """Return the number of elements recorded so far."""
        return self.elements
|