show orphan comments after Disqus import (if any)

An orphan comment is a comment exported by Disqus whose thread id
does not exist in the export (the thread was probably deleted or moved).
Orphans usually date from the earlier days (or from a WordPress migration).

It is not possible to recover the thread without manual intervention
(i.e. manual SQLite insertions).
This commit is contained in:
Martin Zimmermann 2013-11-11 11:34:13 +01:00
parent 29170ac258
commit 8c0af3b10f

View File

@ -9,6 +9,7 @@ from __future__ import division
import sys import sys
import os import os
import textwrap
from time import mktime, strptime from time import mktime, strptime
from collections import defaultdict from collections import defaultdict
@ -20,12 +21,14 @@ except ImportError:
from xml.etree import ElementTree from xml.etree import ElementTree
ns = '{http://disqus.com}' ns = '{http://disqus.com}'
dsq = '{http://disqus.com/disqus-internals}' dsq = '{http://disqus.com/disqus-internals}'
threads = set([])
comments = set([])
def insert(db, thread, comments):
def insert(db, thread, posts):
path = urlparse(thread.find('%sid' % ns).text).path path = urlparse(thread.find('%sid' % ns).text).path
remap = dict() remap = dict()
@ -33,13 +36,15 @@ def insert(db, thread, comments):
if path not in db.threads: if path not in db.threads:
db.threads.new(path, thread.find('%stitle' % ns).text.strip()) db.threads.new(path, thread.find('%stitle' % ns).text.strip())
for item in sorted(comments, key=lambda k: k['created']): for item in sorted(posts, key=lambda k: k['created']):
dsq_id = item.pop('dsq:id') dsq_id = item.pop('dsq:id')
item['parent'] = remap.get(item.pop('dsq:parent', None)) item['parent'] = remap.get(item.pop('dsq:parent', None))
rv = db.comments.add(path, item) rv = db.comments.add(path, item)
remap[dsq_id] = rv["id"] remap[dsq_id] = rv["id"]
comments.update(set(remap.keys()))
def disqus(db, xmlfile): def disqus(db, xmlfile):
@ -67,7 +72,6 @@ def disqus(db, xmlfile):
num = len(tree.findall('%sthread' % ns)) num = len(tree.findall('%sthread' % ns))
cols = int(os.popen('stty size', 'r').read().split()[1]) cols = int(os.popen('stty size', 'r').read().split()[1])
threads = 0
for i, thread in enumerate(tree.findall('%sthread' % ns)): for i, thread in enumerate(tree.findall('%sthread' % ns)):
if int(round((i+1)/num, 2) * 100) % 13 == 0: if int(round((i+1)/num, 2) * 100) % 13 == 0:
@ -76,13 +80,28 @@ def disqus(db, xmlfile):
sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text)) sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text))
sys.stdout.flush() sys.stdout.flush()
# skip (possibly?) duplicate, but empty thread elements
if thread.find('%sid' % ns).text is None: if thread.find('%sid' % ns).text is None:
continue continue
id = thread.attrib.get(dsq + 'id') id = thread.attrib.get(dsq + 'id')
if id in res: if id in res:
threads += 1 threads.add(id)
insert(db, thread, res[id]) insert(db, thread, res[id])
sys.stdout.write("\r%s" % (" "*cols)) sys.stdout.write("\r%s" % (" "*cols))
sys.stdout.write("\r[100%%] %i threads, %i comments" % (threads, len(tree.findall('%spost' % ns)))) sys.stdout.write("\r[100%%] %i threads, %i comments\n" % (len(threads), len(comments)))
orphans = set(map(lambda e: e.attrib.get(dsq + "id"), tree.findall("%spost" % ns))) - comments
if orphans:
print("Found %i orphans:" % len(orphans))
for post in tree.findall("%spost" % ns):
if post.attrib.get(dsq + "id") not in orphans:
continue
print(" * %s by %s <%s>" % (post.attrib.get(dsq + "id"),
post.find("%sauthor/%sname" % (ns, ns)).text,
post.find("%sauthor/%semail" % (ns, ns)).text))
print(textwrap.fill(post.find("%smessage" % ns).text,
initial_indent=" ", subsequent_indent=" "))
print("")