Merge branch 'feature/migration'

This commit is contained in:
Martin Zimmermann 2013-11-11 12:10:42 +01:00
commit dfed955f23
2 changed files with 45 additions and 13 deletions

View File

@ -42,6 +42,7 @@ import sys
import os import os
import socket import socket
import logging import logging
import tempfile
from os.path import dirname, join from os.path import dirname, join
from argparse import ArgumentParser from argparse import ArgumentParser
@ -185,6 +186,8 @@ def main():
imprt = subparser.add_parser('import', help="import Disqus XML export") imprt = subparser.add_parser('import', help="import Disqus XML export")
imprt.add_argument("dump", metavar="FILE") imprt.add_argument("dump", metavar="FILE")
imprt.add_argument("-n", "--dry-run", dest="dryrun", action="store_true",
help="perform a trial run with no changes made")
serve = subparser.add_parser("run", help="run server") serve = subparser.add_parser("run", help="run server")
@ -192,8 +195,11 @@ def main():
conf = Config.load(args.conf) conf = Config.load(args.conf)
if args.command == "import": if args.command == "import":
xxx = tempfile.NamedTemporaryFile()
dbpath = conf.get("general", "dbpath") if not args.dryrun else xxx.name
conf.set("guard", "enabled", "off") conf.set("guard", "enabled", "off")
migrate.disqus(db.SQLite3(conf.get('general', 'dbpath'), conf), args.dump) migrate.disqus(db.SQLite3(dbpath, conf), args.dump)
sys.exit(0) sys.exit(0)
if conf.get("server", "listen").startswith("http://"): if conf.get("server", "listen").startswith("http://"):

View File

@ -1,18 +1,19 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
#
# TODO
#
# - export does not include the commenters' websites
# - the Disqus export includes already-deleted comments
from __future__ import division from __future__ import division
import sys import sys
import os import os
import textwrap
from time import mktime, strptime from time import mktime, strptime
from collections import defaultdict from collections import defaultdict
try:
input = raw_input
except NameError:
pass
try: try:
from urlparse import urlparse from urlparse import urlparse
except ImportError: except ImportError:
@ -20,12 +21,14 @@ except ImportError:
from xml.etree import ElementTree from xml.etree import ElementTree
ns = '{http://disqus.com}' ns = '{http://disqus.com}'
dsq = '{http://disqus.com/disqus-internals}' dsq = '{http://disqus.com/disqus-internals}'
threads = set([])
comments = set([])
def insert(db, thread, comments):
def insert(db, thread, posts):
path = urlparse(thread.find('%sid' % ns).text).path path = urlparse(thread.find('%sid' % ns).text).path
remap = dict() remap = dict()
@ -33,16 +36,22 @@ def insert(db, thread, comments):
if path not in db.threads: if path not in db.threads:
db.threads.new(path, thread.find('%stitle' % ns).text.strip()) db.threads.new(path, thread.find('%stitle' % ns).text.strip())
for item in sorted(comments, key=lambda k: k['created']): for item in sorted(posts, key=lambda k: k['created']):
dsq_id = item.pop('dsq:id') dsq_id = item.pop('dsq:id')
item['parent'] = remap.get(item.pop('dsq:parent', None)) item['parent'] = remap.get(item.pop('dsq:parent', None))
rv = db.comments.add(path, item) rv = db.comments.add(path, item)
remap[dsq_id] = rv["id"] remap[dsq_id] = rv["id"]
comments.update(set(remap.keys()))
def disqus(db, xmlfile): def disqus(db, xmlfile):
if db.execute("SELECT * FROM comments").fetchone():
if input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"):
raise SystemExit("Abort.")
tree = ElementTree.parse(xmlfile) tree = ElementTree.parse(xmlfile)
res = defaultdict(list) res = defaultdict(list)
@ -56,7 +65,7 @@ def disqus(db, xmlfile):
'created': mktime(strptime( 'created': mktime(strptime(
post.find('%screatedAt' % ns).text, '%Y-%m-%dT%H:%M:%SZ')), post.find('%screatedAt' % ns).text, '%Y-%m-%dT%H:%M:%SZ')),
'remote_addr': '127.0.0.0', 'remote_addr': '127.0.0.0',
'mode': 1 'mode': 1 if post.find("%sisDeleted" % ns).text == "false" else 4
} }
if post.find(ns + 'parent') is not None: if post.find(ns + 'parent') is not None:
@ -67,7 +76,6 @@ def disqus(db, xmlfile):
num = len(tree.findall('%sthread' % ns)) num = len(tree.findall('%sthread' % ns))
cols = int(os.popen('stty size', 'r').read().split()[1]) cols = int(os.popen('stty size', 'r').read().split()[1])
threads = 0
for i, thread in enumerate(tree.findall('%sthread' % ns)): for i, thread in enumerate(tree.findall('%sthread' % ns)):
if int(round((i+1)/num, 2) * 100) % 13 == 0: if int(round((i+1)/num, 2) * 100) % 13 == 0:
@ -76,13 +84,31 @@ def disqus(db, xmlfile):
sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text)) sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text))
sys.stdout.flush() sys.stdout.flush()
# skip empty (possibly duplicate) thread elements
if thread.find('%sid' % ns).text is None: if thread.find('%sid' % ns).text is None:
continue continue
id = thread.attrib.get(dsq + 'id') id = thread.attrib.get(dsq + 'id')
if id in res: if id in res:
threads += 1 threads.add(id)
insert(db, thread, res[id]) insert(db, thread, res[id])
# in case a comment has been deleted (and has no further children)
db.comments._remove_stale()
sys.stdout.write("\r%s" % (" "*cols)) sys.stdout.write("\r%s" % (" "*cols))
sys.stdout.write("\r[100%%] %i threads, %i comments" % (threads, len(tree.findall('%spost' % ns)))) sys.stdout.write("\r[100%%] %i threads, %i comments\n" % (len(threads), len(comments)))
orphans = set(map(lambda e: e.attrib.get(dsq + "id"), tree.findall("%spost" % ns))) - comments
if orphans:
print("Found %i orphans:" % len(orphans))
for post in tree.findall("%spost" % ns):
if post.attrib.get(dsq + "id") not in orphans:
continue
print(" * %s by %s <%s>" % (post.attrib.get(dsq + "id"),
post.find("%sauthor/%sname" % (ns, ns)).text,
post.find("%sauthor/%semail" % (ns, ns)).text))
print(textwrap.fill(post.find("%smessage" % ns).text,
initial_indent=" ", subsequent_indent=" "))
print("")