# -*- encoding: utf-8 -*- from __future__ import division, print_function, unicode_literals import sys import os import io import re import textwrap from time import mktime, strptime, time from collections import defaultdict from isso.utils import anonymize from isso.compat import string_types try: input = raw_input except NameError: pass try: from urlparse import urlparse except ImportError: from urllib.parse import urlparse from xml.etree import ElementTree def strip(val): if isinstance(val, string_types): return val.strip() return val class Progress(object): def __init__(self, end): self.end = end or 1 self.istty = sys.stdout.isatty() self.last = 0 def update(self, i, message): if not self.istty or message is None: return cols = int((os.popen('stty size', 'r').read()).split()[1]) message = message[:cols - 7] if time() - self.last > 0.2: sys.stdout.write("\r{0}".format(" " * cols)) sys.stdout.write("\r[{0:.0%}] {1}".format(i/self.end, message)) sys.stdout.flush() self.last = time() def finish(self, message): self.last = 0 self.update(self.end, message + "\n") class Disqus(object): ns = '{http://disqus.com}' internals = '{http://disqus.com/disqus-internals}' def __init__(self, db, xmlfile, empty_id=False): self.threads = set([]) self.comments = set([]) self.db = db self.xmlfile = xmlfile self.empty_id = empty_id def insert(self, thread, posts): path = urlparse(thread.find('%slink' % Disqus.ns).text).path remap = dict() if path not in self.db.threads: self.db.threads.new(path, thread.find(Disqus.ns + 'title').text.strip()) for item in sorted(posts, key=lambda k: k['created']): dsq_id = item.pop('dsq:id') item['parent'] = remap.get(item.pop('dsq:parent', None)) rv = self.db.comments.add(path, item) remap[dsq_id] = rv["id"] self.comments.update(set(remap.keys())) def migrate(self): tree = ElementTree.parse(self.xmlfile) res = defaultdict(list) for post in tree.findall(Disqus.ns + 'post'): item = { 'dsq:id': post.attrib.get(Disqus.internals + 'id'), 'text': post.find(Disqus.ns + 'message').text, 'author': post.find('{0}author/{0}name'.format(Disqus.ns)).text, 'email': post.find('{0}author/{0}email'.format(Disqus.ns)).text, 'created': mktime(strptime( post.find(Disqus.ns + 'createdAt').text, '%Y-%m-%dT%H:%M:%SZ')), 'remote_addr': anonymize(post.find(Disqus.ns + 'ipAddress').text), 'mode': 1 if post.find(Disqus.ns + "isDeleted").text == "false" else 4 } if post.find(Disqus.ns + 'parent') is not None: item['dsq:parent'] = post.find(Disqus.ns + 'parent').attrib.get(Disqus.internals + 'id') res[post.find('%sthread' % Disqus.ns).attrib.get(Disqus.internals + 'id')].append(item) progress = Progress(len(tree.findall(Disqus.ns + 'thread'))) for i, thread in enumerate(tree.findall(Disqus.ns + 'thread')): progress.update(i, thread.find(Disqus.ns + 'id').text) # skip (possibly?) duplicate, but empty thread elements if thread.find(Disqus.ns + 'id').text is None and not self.empty_id: continue id = thread.attrib.get(Disqus.internals + 'id') if id in res: self.threads.add(id) self.insert(thread, res[id]) # in case a comment has been deleted (and no further childs) self.db.comments._remove_stale() progress.finish("{0} threads, {1} comments".format( len(self.threads), len(self.comments))) orphans = set(map(lambda e: e.attrib.get(Disqus.internals + "id"), tree.findall(Disqus.ns + "post"))) - self.comments if orphans and not self.threads: print("Isso couldn't import any thread, try again with --empty-id") elif orphans: print("Found %i orphans:" % len(orphans)) for post in tree.findall(Disqus.ns + "post"): if post.attrib.get(Disqus.internals + "id") not in orphans: continue print(" * {0} by {1} <{2}>".format( post.attrib.get(Disqus.internals + "id"), post.find("{0}author/{0}name".format(Disqus.ns)).text, post.find("{0}author/{0}email".format(Disqus.ns)).text)) print(textwrap.fill(post.find(Disqus.ns + "message").text, initial_indent=" ", subsequent_indent=" ")) print("") @classmethod def detect(cls, peek): if 'xmlns="http://disqus.com' in peek: return "http://disqus.com" return None class WordPress(object): ns = "{http://wordpress.org/export/1.0/}" def __init__(self, db, xmlfile): self.db = db self.xmlfile = xmlfile self.count = 0 with io.open(xmlfile, encoding="utf-8") as fp: ns = WordPress.detect(fp.read(io.DEFAULT_BUFFER_SIZE)) if ns: self.ns = "{" + ns + "}" def insert(self, thread): url = urlparse(thread.find("link").text) path = url.path if url.query: path += "?" + url.query self.db.threads.new(path, thread.find("title").text.strip()) comments = list(map(self.Comment, thread.findall(self.ns + "comment"))) comments.sort(key=lambda k: k["id"]) remap = {} ids = set(c["id"] for c in comments) self.count += len(ids) while comments: for i, item in enumerate(comments): if item["parent"] in ids: continue item["parent"] = remap.get(item["parent"], None) rv = self.db.comments.add(path, item) remap[item["id"]] = rv["id"] ids.remove(item["id"]) comments.pop(i) break else: # should never happen, but... it's WordPress. return def migrate(self): tree = ElementTree.parse(self.xmlfile) skip = 0 items = tree.findall("channel/item") progress = Progress(len(items)) for i, thread in enumerate(items): if thread.find("title").text is None or thread.find(self.ns + "comment") is None: skip += 1 continue progress.update(i, thread.find("title").text) self.insert(thread) progress.finish("{0} threads, {1} comments".format( len(items) - skip, self.count)) def Comment(self, el): return { "text": strip(el.find(self.ns + "comment_content").text), "author": strip(el.find(self.ns + "comment_author").text), "email": strip(el.find(self.ns + "comment_author_email").text), "website": strip(el.find(self.ns + "comment_author_url").text), "remote_addr": anonymize( strip(el.find(self.ns + "comment_author_IP").text)), "created": mktime(strptime( strip(el.find(self.ns + "comment_date_gmt").text), "%Y-%m-%d %H:%M:%S")), "mode": 1 if el.find(self.ns + "comment_approved").text == "1" else 2, "id": int(el.find(self.ns + "comment_id").text), "parent": int(el.find(self.ns + "comment_parent").text) or None } @classmethod def detect(cls, peek): m = re.search("http://wordpress.org/export/1\.\d/", peek) if m: return m.group(0) return None def dispatch(type, db, dump, empty_id=False): if db.execute("SELECT * FROM comments").fetchone(): if input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"): raise SystemExit("Abort.") if type is None: with io.open(dump, encoding="utf-8") as fp: peek = fp.read(io.DEFAULT_BUFFER_SIZE) if WordPress.detect(peek): type = "wordpress" if Disqus.detect(peek): type = "disqus" if type == "wordpress": WordPress(db, dump).migrate() elif type == "disqus": Disqus(db, dump, empty_id).migrate() else: raise SystemExit("Unknown format, abort.")