refactor disqus migration code into a class

This commit is contained in:
Martin Zimmermann 2014-04-30 15:07:11 +02:00
parent cb36107eda
commit 910da2a6c0
3 changed files with 91 additions and 75 deletions

View File

@ -214,11 +214,17 @@ def main():
conf = Config.load(args.conf) conf = Config.load(args.conf)
if args.command == "import": if args.command == "import":
xxx = tempfile.NamedTemporaryFile()
dbpath = conf.get("general", "dbpath") if not args.dryrun else xxx.name
conf.set("guard", "enabled", "off") conf.set("guard", "enabled", "off")
migrate.disqus(db.SQLite3(dbpath, conf), args.dump)
if args.dryrun:
xxx = tempfile.NamedTemporaryFile()
dbpath = xxx.name
else:
dbpath = conf.get("general", "dbpath")
mydb = db.SQLite3(dbpath, conf)
migrate.dispatch(mydb, args.dump)
sys.exit(0) sys.exit(0)
if not any(conf.getiter("general", "host")): if not any(conf.getiter("general", "host")):

View File

@ -21,94 +21,104 @@ except ImportError:
from xml.etree import ElementTree from xml.etree import ElementTree
ns = '{http://disqus.com}'
dsq = '{http://disqus.com/disqus-internals}'
threads = set([]) class Disqus(object):
comments = set([])
ns = '{http://disqus.com}'
internals = '{http://disqus.com/disqus-internals}'
def insert(db, thread, posts): def __init__(self, db, xmlfile):
self.threads = set([])
self.comments = set([])
path = urlparse(thread.find('%slink' % ns).text).path self.db = db
remap = dict() self.xmlfile = xmlfile
if path not in db.threads: def insert(self, thread, posts):
db.threads.new(path, thread.find('%stitle' % ns).text.strip())
for item in sorted(posts, key=lambda k: k['created']): path = urlparse(thread.find('%slink' % Disqus.ns).text).path
remap = dict()
dsq_id = item.pop('dsq:id') if path not in self.db.threads:
item['parent'] = remap.get(item.pop('dsq:parent', None)) self.db.threads.new(path, thread.find(Disqus.ns + 'title').text.strip())
rv = db.comments.add(path, item)
remap[dsq_id] = rv["id"]
comments.update(set(remap.keys())) for item in sorted(posts, key=lambda k: k['created']):
dsq_id = item.pop('dsq:id')
item['parent'] = remap.get(item.pop('dsq:parent', None))
rv = self.db.comments.add(path, item)
remap[dsq_id] = rv["id"]
def disqus(db, xmlfile): self.comments.update(set(remap.keys()))
if db.execute("SELECT * FROM comments").fetchone(): def migrate(self):
if input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"):
raise SystemExit("Abort.")
tree = ElementTree.parse(xmlfile) tree = ElementTree.parse(self.xmlfile)
res = defaultdict(list) res = defaultdict(list)
for post in tree.findall('%spost' % ns): for post in tree.findall('%spost' % Disqus.ns):
item = { item = {
'dsq:id': post.attrib.get(dsq + 'id'), 'dsq:id': post.attrib.get(Disqus.internals + 'id'),
'text': post.find('%smessage' % ns).text, 'text': post.find(Disqus.ns + 'message').text,
'author': post.find('%sauthor/%sname' % (ns, ns)).text, 'author': post.find('{0}author/{0}name'.format(Disqus.ns)).text,
'email': post.find('%sauthor/%semail' % (ns, ns)).text, 'email': post.find('{0}author/{0}email'.format(Disqus.ns)).text,
'created': mktime(strptime( 'created': mktime(strptime(
post.find('%screatedAt' % ns).text, '%Y-%m-%dT%H:%M:%SZ')), post.find(Disqus.ns + 'createdAt').text, '%Y-%m-%dT%H:%M:%SZ')),
'remote_addr': '127.0.0.0', 'remote_addr': '127.0.0.0',
'mode': 1 if post.find("%sisDeleted" % ns).text == "false" else 4 'mode': 1 if post.find(Disqus.ns + "isDeleted").text == "false" else 4
} }
if post.find(ns + 'parent') is not None: if post.find(Disqus.ns + 'parent') is not None:
item['dsq:parent'] = post.find(ns + 'parent').attrib.get(dsq + 'id') item['dsq:parent'] = post.find(Disqus.ns + 'parent').attrib.get(Disqus.internals + 'id')
res[post.find('%sthread' % ns).attrib.get(dsq + 'id')].append(item) res[post.find('%sthread' % Disqus.ns).attrib.get(Disqus.internals + 'id')].append(item)
num = len(tree.findall('%sthread' % ns)) num = len(tree.findall(Disqus.ns + 'thread'))
cols = int((os.popen('stty size', 'r').read() or "25 80").split()[1]) cols = int((os.popen('stty size', 'r').read() or "25 80").split()[1])
for i, thread in enumerate(tree.findall('%sthread' % ns)): for i, thread in enumerate(tree.findall(Disqus.ns + 'thread')):
if int(round((i+1)/num, 2) * 100) % 13 == 0: if int(round((i+1)/num, 2) * 100) % 13 == 0:
sys.stdout.write("\r%s" % (" "*cols))
sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find(Disqus.ns + 'id').text))
sys.stdout.flush()
sys.stdout.write("\r%s" % (" "*cols)) # skip (possibly?) duplicate, but empty thread elements
sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text)) if thread.find(Disqus.ns + 'id').text is None:
sys.stdout.flush()
# skip (possibly?) duplicate, but empty thread elements
if thread.find('%sid' % ns).text is None:
continue
id = thread.attrib.get(dsq + 'id')
if id in res:
threads.add(id)
insert(db, thread, res[id])
# in case a comment has been deleted (and has no further children)
db.comments._remove_stale()
sys.stdout.write("\r%s" % (" "*cols))
sys.stdout.write("\r[100%%] %i threads, %i comments\n" % (len(threads), len(comments)))
orphans = set(map(lambda e: e.attrib.get(dsq + "id"), tree.findall("%spost" % ns))) - comments
if orphans:
print("Found %i orphans:" % len(orphans))
for post in tree.findall("%spost" % ns):
if post.attrib.get(dsq + "id") not in orphans:
continue continue
print(" * %s by %s <%s>" % (post.attrib.get(dsq + "id"), id = thread.attrib.get(Disqus.internals + 'id')
post.find("%sauthor/%sname" % (ns, ns)).text, if id in res:
post.find("%sauthor/%semail" % (ns, ns)).text)) self.threads.add(id)
print(textwrap.fill(post.find("%smessage" % ns).text, self.insert(thread, res[id])
initial_indent=" ", subsequent_indent=" "))
print("") # in case a comment has been deleted (and has no further children)
self.db.comments._remove_stale()
sys.stdout.write("\r%s" % (" "*cols))
sys.stdout.write("\r[100%] {0} threads, {1} comments\n".format(
len(self.threads), len(self.comments)))
orphans = set(map(lambda e: e.attrib.get(Disqus.internals + "id"), tree.findall(Disqus.ns + "post"))) - self.comments
if orphans:
print("Found %i orphans:" % len(orphans))
for post in tree.findall(Disqus.ns + "post"):
if post.attrib.get(Disqus.internals + "id") not in orphans:
continue
print(" * {0} by {1} <{2}>".format(
post.attrib.get(Disqus.internals + "id"),
post.find("{0}author/{0}name".format(Disqus.ns)).text,
post.find("{0}author/{0}email".format(Disqus.ns)).text))
print(textwrap.fill(post.find(Disqus.ns + "message").text,
initial_indent=" ", subsequent_indent=" "))
print("")
def dispatch(db, dump):
    """Import the Disqus export `dump` into `db`, confirming first if needed.

    If the ``comments`` table of `db` already holds at least one row, the
    user is asked on the console whether to continue; any answer other than
    ``y`` or ``Y`` aborts via ``SystemExit("Abort.")``. Otherwise (or after
    confirmation) a ``Disqus`` migrator is built from `db` and `dump` and
    its ``migrate()`` method is run.

    :param db: database object exposing ``execute()``; passed on to ``Disqus``
    :param dump: the Disqus XML export, passed on to ``Disqus`` unchanged
    :raises SystemExit: when the database is not empty and the user declines
    """
    existing_row = db.execute("SELECT * FROM comments").fetchone()

    if existing_row:
        # Importing on top of existing comments is allowed, but only after
        # an explicit confirmation from the user.
        answer = input("Isso DB is not empty! Continue? [y/N]: ")
        if answer not in ("y", "Y"):
            raise SystemExit("Abort.")

    migrator = Disqus(db, dump)
    migrator.migrate()

View File

@ -6,7 +6,7 @@ from os.path import join, dirname
from isso.core import Config from isso.core import Config
from isso.db import SQLite3 from isso.db import SQLite3
from isso.migrate import disqus from isso.migrate import Disqus
def test_disqus(): def test_disqus():
@ -15,7 +15,7 @@ def test_disqus():
xxx = tempfile.NamedTemporaryFile() xxx = tempfile.NamedTemporaryFile()
db = SQLite3(xxx.name, Config.load(None)) db = SQLite3(xxx.name, Config.load(None))
disqus(db, xml) Disqus(db, xml).migrate()
assert db.threads["/"]["title"] == "Hello, World!" assert db.threads["/"]["title"] == "Hello, World!"
assert db.threads["/"]["id"] == 1 assert db.threads["/"]["id"] == 1