368 lines
11 KiB
Python
368 lines
11 KiB
Python
# -*- encoding: utf-8 -*-
|
|
|
|
from __future__ import division, print_function, unicode_literals
|
|
|
|
import functools
|
|
import io
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
import textwrap
|
|
|
|
from time import mktime, strptime, time
|
|
from collections import defaultdict
|
|
|
|
from isso.utils import anonymize
|
|
from isso.compat import string_types
|
|
|
|
try:
|
|
input = raw_input
|
|
except NameError:
|
|
pass
|
|
|
|
try:
|
|
from urlparse import urlparse
|
|
except ImportError:
|
|
from urllib.parse import urlparse
|
|
|
|
from xml.etree import ElementTree
|
|
|
|
logger = logging.getLogger("isso")
|
|
|
|
|
|
def strip(val):
|
|
if isinstance(val, string_types):
|
|
return val.strip()
|
|
return val
|
|
|
|
|
|
class Progress(object):
|
|
|
|
def __init__(self, end):
|
|
self.end = end or 1
|
|
|
|
self.istty = sys.stdout.isatty()
|
|
self.last = 0
|
|
|
|
def update(self, i, message):
|
|
|
|
if not self.istty or message is None:
|
|
return
|
|
|
|
cols = int((os.popen('stty size', 'r').read()).split()[1])
|
|
message = message[:cols - 7]
|
|
|
|
if time() - self.last > 0.2:
|
|
sys.stdout.write("\r{0}".format(" " * cols))
|
|
sys.stdout.write("\r[{0:.0%}] {1}".format(i / self.end, message))
|
|
sys.stdout.flush()
|
|
self.last = time()
|
|
|
|
def finish(self, message):
|
|
self.last = 0
|
|
self.update(self.end, message + "\n")
|
|
|
|
|
|
class Disqus(object):
|
|
|
|
ns = '{http://disqus.com}'
|
|
internals = '{http://disqus.com/disqus-internals}'
|
|
|
|
def __init__(self, db, xmlfile, empty_id=False):
|
|
self.threads = set([])
|
|
self.comments = set([])
|
|
|
|
self.db = db
|
|
self.xmlfile = xmlfile
|
|
self.empty_id = empty_id
|
|
|
|
def insert(self, thread, posts):
|
|
|
|
path = urlparse(thread.find('%slink' % Disqus.ns).text).path
|
|
remap = dict()
|
|
|
|
if path not in self.db.threads:
|
|
self.db.threads.new(path, thread.find(
|
|
Disqus.ns + 'title').text.strip())
|
|
|
|
for item in sorted(posts, key=lambda k: k['created']):
|
|
|
|
dsq_id = item.pop('dsq:id')
|
|
item['parent'] = remap.get(item.pop('dsq:parent', None))
|
|
rv = self.db.comments.add(path, item)
|
|
remap[dsq_id] = rv["id"]
|
|
|
|
self.comments.update(set(remap.keys()))
|
|
|
|
def migrate(self):
|
|
|
|
tree = ElementTree.parse(self.xmlfile)
|
|
res = defaultdict(list)
|
|
|
|
for post in tree.findall(Disqus.ns + 'post'):
|
|
email = post.find('{0}author/{0}email'.format(Disqus.ns))
|
|
ip = post.find(Disqus.ns + 'ipAddress')
|
|
|
|
item = {
|
|
'dsq:id': post.attrib.get(Disqus.internals + 'id'),
|
|
'text': post.find(Disqus.ns + 'message').text,
|
|
'author': post.find('{0}author/{0}name'.format(Disqus.ns)).text,
|
|
'email': email.text if email is not None else '',
|
|
'created': mktime(strptime(
|
|
post.find(Disqus.ns + 'createdAt').text, '%Y-%m-%dT%H:%M:%SZ')),
|
|
'remote_addr': anonymize(ip.text if ip is not None else '0.0.0.0'),
|
|
'mode': 1 if post.find(Disqus.ns + "isDeleted").text == "false" else 4
|
|
}
|
|
|
|
if post.find(Disqus.ns + 'parent') is not None:
|
|
item['dsq:parent'] = post.find(
|
|
Disqus.ns + 'parent').attrib.get(Disqus.internals + 'id')
|
|
|
|
res[post.find('%sthread' % Disqus.ns).attrib.get(
|
|
Disqus.internals + 'id')].append(item)
|
|
|
|
progress = Progress(len(tree.findall(Disqus.ns + 'thread')))
|
|
for i, thread in enumerate(tree.findall(Disqus.ns + 'thread')):
|
|
progress.update(i, thread.find(Disqus.ns + 'id').text)
|
|
|
|
# skip (possibly?) duplicate, but empty thread elements
|
|
if thread.find(Disqus.ns + 'id').text is None and not self.empty_id:
|
|
continue
|
|
|
|
id = thread.attrib.get(Disqus.internals + 'id')
|
|
if id in res:
|
|
self.threads.add(id)
|
|
self.insert(thread, res[id])
|
|
|
|
# in case a comment has been deleted (and no further childs)
|
|
self.db.comments._remove_stale()
|
|
|
|
progress.finish("{0} threads, {1} comments".format(
|
|
len(self.threads), len(self.comments)))
|
|
|
|
orphans = set(map(lambda e: e.attrib.get(Disqus.internals + "id"),
|
|
tree.findall(Disqus.ns + "post"))) - self.comments
|
|
if orphans and not self.threads:
|
|
print("Isso couldn't import any thread, try again with --empty-id")
|
|
elif orphans:
|
|
print("Found %i orphans:" % len(orphans))
|
|
for post in tree.findall(Disqus.ns + "post"):
|
|
if post.attrib.get(Disqus.internals + "id") not in orphans:
|
|
continue
|
|
|
|
email = post.find("{0}author/{0}email".format(Disqus.ns))
|
|
print(" * {0} by {1} <{2}>".format(
|
|
post.attrib.get(Disqus.internals + "id"),
|
|
post.find("{0}author/{0}name".format(Disqus.ns)).text,
|
|
email.text if email is not None else ""))
|
|
print(textwrap.fill(post.find(Disqus.ns + "message").text,
|
|
initial_indent=" ", subsequent_indent=" "))
|
|
print("")
|
|
|
|
|
|
class WordPress(object):
|
|
|
|
ns = "{http://wordpress.org/export/1.0/}"
|
|
|
|
def __init__(self, db, xmlfile):
|
|
self.db = db
|
|
self.xmlfile = xmlfile
|
|
self.count = 0
|
|
|
|
for line in io.open(xmlfile, encoding="utf-8"):
|
|
m = WordPress.detect(line)
|
|
if m:
|
|
self.ns = WordPress.ns.replace("1.0", m.group(1))
|
|
break
|
|
else:
|
|
logger.warn("No WXR namespace found, assuming 1.0")
|
|
|
|
def insert(self, thread):
|
|
|
|
url = urlparse(thread.find("link").text)
|
|
path = url.path
|
|
|
|
if url.query:
|
|
path += "?" + url.query
|
|
|
|
self.db.threads.new(path, thread.find("title").text.strip())
|
|
|
|
comments = list(map(self.Comment, thread.findall(self.ns + "comment")))
|
|
comments.sort(key=lambda k: k["id"])
|
|
|
|
remap = {}
|
|
ids = set(c["id"] for c in comments)
|
|
|
|
self.count += len(ids)
|
|
|
|
while comments:
|
|
for i, item in enumerate(comments):
|
|
if item["parent"] in ids:
|
|
continue
|
|
|
|
item["parent"] = remap.get(item["parent"], None)
|
|
rv = self.db.comments.add(path, item)
|
|
remap[item["id"]] = rv["id"]
|
|
|
|
ids.remove(item["id"])
|
|
comments.pop(i)
|
|
|
|
break
|
|
else:
|
|
# should never happen, but... it's WordPress.
|
|
return
|
|
|
|
def migrate(self):
|
|
|
|
tree = ElementTree.parse(self.xmlfile)
|
|
|
|
skip = 0
|
|
items = tree.findall("channel/item")
|
|
|
|
progress = Progress(len(items))
|
|
for i, thread in enumerate(items):
|
|
if thread.find("title").text is None or thread.find(self.ns + "comment") is None:
|
|
skip += 1
|
|
continue
|
|
|
|
progress.update(i, thread.find("title").text)
|
|
self.insert(thread)
|
|
|
|
progress.finish("{0} threads, {1} comments".format(
|
|
len(items) - skip, self.count))
|
|
|
|
def Comment(self, el):
|
|
return {
|
|
"text": strip(el.find(self.ns + "comment_content").text),
|
|
"author": strip(el.find(self.ns + "comment_author").text),
|
|
"email": strip(el.find(self.ns + "comment_author_email").text),
|
|
"website": strip(el.find(self.ns + "comment_author_url").text),
|
|
"remote_addr": anonymize(
|
|
strip(el.find(self.ns + "comment_author_IP").text)),
|
|
"created": mktime(strptime(
|
|
strip(el.find(self.ns + "comment_date_gmt").text),
|
|
"%Y-%m-%d %H:%M:%S")),
|
|
"mode": 1 if el.find(self.ns + "comment_approved").text == "1" else 2,
|
|
"id": int(el.find(self.ns + "comment_id").text),
|
|
"parent": int(el.find(self.ns + "comment_parent").text) or None
|
|
}
|
|
|
|
@classmethod
|
|
def detect(cls, peek):
|
|
return re.compile("http://wordpress.org/export/(1\\.\\d)/").search(peek)
|
|
|
|
|
|
class Generic(object):
|
|
"""A generic importer.
|
|
|
|
The source format is a json with the following format:
|
|
|
|
A list of threads, each item being a dict with the following data:
|
|
|
|
- id: a text representing the unique thread id
|
|
- title: the title of the thread
|
|
- comments: the list of comments
|
|
|
|
Each item in that list of comments is a dict with the following data:
|
|
|
|
- id: an integer with the unique id of the comment inside the thread (it can be repeated
|
|
among different threads); this will be used to order the comment inside the thread
|
|
- author: the author's name
|
|
- email: the author's email
|
|
- website: the author's website
|
|
- remote_addr: the author's IP
|
|
- created: a timestamp, in the format "%Y-%m-%d %H:%M:%S"
|
|
"""
|
|
|
|
def __init__(self, db, json_file):
|
|
self.db = db
|
|
self.json_file = json_file
|
|
self.count = 0
|
|
|
|
def insert(self, thread):
|
|
"""Process a thread and insert its comments in the DB."""
|
|
thread_id = thread['id']
|
|
title = thread['title']
|
|
self.db.threads.new(thread_id, title)
|
|
|
|
comments = list(map(self._build_comment, thread['comments']))
|
|
comments.sort(key=lambda comment: comment['id'])
|
|
self.count += len(comments)
|
|
for comment in comments:
|
|
self.db.comments.add(thread_id, comment)
|
|
|
|
def migrate(self):
|
|
"""Process the input file and fill the DB."""
|
|
with io.open(self.json_file, 'rt', encoding='utf8') as fh:
|
|
threads = json.load(fh)
|
|
progress = Progress(len(threads))
|
|
|
|
for i, thread in enumerate(threads):
|
|
progress.update(i, str(i))
|
|
self.insert(thread)
|
|
|
|
progress.finish("{0} threads, {1} comments".format(len(threads), self.count))
|
|
|
|
def _build_comment(self, raw_comment):
|
|
return {
|
|
"text": raw_comment['text'],
|
|
"author": raw_comment['author'],
|
|
"email": raw_comment['email'],
|
|
"website": raw_comment['website'],
|
|
"created": mktime(strptime(raw_comment['created'], "%Y-%m-%d %H:%M:%S")),
|
|
"mode": 1,
|
|
"id": int(raw_comment['id']),
|
|
"parent": None,
|
|
"remote_addr": raw_comment["remote_addr"],
|
|
}
|
|
|
|
@classmethod
|
|
def detect(cls, peek):
|
|
"""Return if peek looks like the beginning of a JSON file.
|
|
|
|
Note that we can not check the JSON properly as we only receive here
|
|
the original file truncated.
|
|
"""
|
|
return peek.startswith("[{")
|
|
|
|
|
|
def autodetect(peek):
|
|
|
|
if 'xmlns="http://disqus.com' in peek:
|
|
return Disqus
|
|
|
|
m = WordPress.detect(peek)
|
|
if m:
|
|
return WordPress
|
|
|
|
if Generic.detect(peek):
|
|
return Generic
|
|
|
|
return None
|
|
|
|
|
|
def dispatch(type, db, dump, empty_id=False):
|
|
if db.execute("SELECT * FROM comments").fetchone():
|
|
if input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"):
|
|
raise SystemExit("Abort.")
|
|
|
|
if type == "disqus":
|
|
cls = Disqus
|
|
elif type == "wordpress":
|
|
cls = WordPress
|
|
elif type == "generic":
|
|
cls = Generic
|
|
else:
|
|
with io.open(dump, encoding="utf-8") as fp:
|
|
cls = autodetect(fp.read(io.DEFAULT_BUFFER_SIZE))
|
|
|
|
if cls is None:
|
|
raise SystemExit("Unknown format, abort.")
|
|
|
|
if cls is Disqus:
|
|
cls = functools.partial(cls, empty_id=empty_id)
|
|
|
|
cls(db, dump).migrate()
|