From 770dbf48afee81b067443dd262276fd81dfa10a5 Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Thu, 12 Dec 2013 12:24:37 +0100 Subject: [PATCH] wynaut import --- isso/__init__.py | 15 +--- isso/migrate.py | 114 ------------------------------- setup.py | 6 +- specs/test_migration.py | 7 +- wynaut/__init__.py | 52 ++++++++++++++ wynaut/imprt/__init__.py | 143 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 204 insertions(+), 133 deletions(-) delete mode 100644 isso/migrate.py create mode 100644 wynaut/__init__.py create mode 100644 wynaut/imprt/__init__.py diff --git a/isso/__init__.py b/isso/__init__.py index c77b07e..cf7aecd 100644 --- a/isso/__init__.py +++ b/isso/__init__.py @@ -62,7 +62,7 @@ from werkzeug.contrib.profiler import ProfilerMiddleware local = Local() local_manager = LocalManager([local]) -from isso import db, migrate, wsgi, ext, views +from isso import db, wsgi, ext, views from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config from isso.utils import parse, http, JSONRequest, origin from isso.views import comments @@ -194,24 +194,11 @@ def main(): parser.add_argument("-c", dest="conf", default="/etc/isso.conf", metavar="/etc/isso.conf", help="set configuration file") - imprt = subparser.add_parser('import', help="import Disqus XML export") - imprt.add_argument("dump", metavar="FILE") - imprt.add_argument("-n", "--dry-run", dest="dryrun", action="store_true", - help="perform a trial run with no changes made") - serve = subparser.add_parser("run", help="run server") args = parser.parse_args() conf = Config.load(args.conf) - if args.command == "import": - xxx = tempfile.NamedTemporaryFile() - dbpath = conf.get("general", "dbpath") if not args.dryrun else xxx.name - - conf.set("guard", "enabled", "off") - migrate.disqus(db.SQLite3(dbpath, conf), args.dump) - sys.exit(0) - if conf.get("server", "listen").startswith("http://"): host, port, _ = parse.host(conf.get("server", "listen")) try: diff --git a/isso/migrate.py 
b/isso/migrate.py deleted file mode 100644 index 93d645f..0000000 --- a/isso/migrate.py +++ /dev/null @@ -1,114 +0,0 @@ -# -*- encoding: utf-8 -*- - -from __future__ import division - -import sys -import os -import textwrap - -from time import mktime, strptime -from collections import defaultdict - -try: - input = raw_input -except NameError: - pass - -try: - from urlparse import urlparse -except ImportError: - from urllib.parse import urlparse - -from xml.etree import ElementTree - -ns = '{http://disqus.com}' -dsq = '{http://disqus.com/disqus-internals}' - -threads = set([]) -comments = set([]) - - -def insert(db, thread, posts): - - path = urlparse(thread.find('%slink' % ns).text).path - remap = dict() - - if path not in db.threads: - db.threads.new(path, thread.find('%stitle' % ns).text.strip()) - - for item in sorted(posts, key=lambda k: k['created']): - - dsq_id = item.pop('dsq:id') - item['parent'] = remap.get(item.pop('dsq:parent', None)) - rv = db.comments.add(path, item) - remap[dsq_id] = rv["id"] - - comments.update(set(remap.keys())) - - -def disqus(db, xmlfile): - - if db.execute("SELECT * FROM comments").fetchone(): - if input("Isso DB is not empty! Continue? 
[y/N]: ") not in ("y", "Y"): - raise SystemExit("Abort.") - - tree = ElementTree.parse(xmlfile) - res = defaultdict(list) - - for post in tree.findall('%spost' % ns): - - item = { - 'dsq:id': post.attrib.get(dsq + 'id'), - 'text': post.find('%smessage' % ns).text, - 'author': post.find('%sauthor/%sname' % (ns, ns)).text, - 'email': post.find('%sauthor/%semail' % (ns, ns)).text, - 'created': mktime(strptime( - post.find('%screatedAt' % ns).text, '%Y-%m-%dT%H:%M:%SZ')), - 'remote_addr': '127.0.0.0', - 'mode': 1 if post.find("%sisDeleted" % ns).text == "false" else 4 - } - - if post.find(ns + 'parent') is not None: - item['dsq:parent'] = post.find(ns + 'parent').attrib.get(dsq + 'id') - - res[post.find('%sthread' % ns).attrib.get(dsq + 'id')].append(item) - - num = len(tree.findall('%sthread' % ns)) - cols = int((os.popen('stty size', 'r').read() or "25 80").split()[1]) - - for i, thread in enumerate(tree.findall('%sthread' % ns)): - - if int(round((i+1)/num, 2) * 100) % 13 == 0: - - sys.stdout.write("\r%s" % (" "*cols)) - sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text)) - sys.stdout.flush() - - # skip (possibly?) 
duplicate, but empty thread elements - if thread.find('%sid' % ns).text is None: - continue - - id = thread.attrib.get(dsq + 'id') - if id in res: - threads.add(id) - insert(db, thread, res[id]) - - # in case a comment has been deleted (and no further childs) - db.comments._remove_stale() - - sys.stdout.write("\r%s" % (" "*cols)) - sys.stdout.write("\r[100%%] %i threads, %i comments\n" % (len(threads), len(comments))) - - orphans = set(map(lambda e: e.attrib.get(dsq + "id"), tree.findall("%spost" % ns))) - comments - if orphans: - print("Found %i orphans:" % len(orphans)) - for post in tree.findall("%spost" % ns): - if post.attrib.get(dsq + "id") not in orphans: - continue - - print(" * %s by %s <%s>" % (post.attrib.get(dsq + "id"), - post.find("%sauthor/%sname" % (ns, ns)).text, - post.find("%sauthor/%semail" % (ns, ns)).text)) - print(textwrap.fill(post.find("%smessage" % ns).text, - initial_indent=" ", subsequent_indent=" ")) - print("") diff --git a/setup.py b/setup.py index 83c7abc..5037c3a 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,9 @@ setup( ], install_requires=requires, entry_points={ - 'console_scripts': - ['isso = isso:main'], + 'console_scripts': [ + 'isso = isso:main', + 'wynaut = wynaut:main' + ], }, ) diff --git a/specs/test_migration.py b/specs/test_migration.py index 8a40ea2..894d2c6 100644 --- a/specs/test_migration.py +++ b/specs/test_migration.py @@ -6,7 +6,7 @@ from os.path import join, dirname from isso.core import Config from isso.db import SQLite3 -from isso.migrate import disqus +from wynaut.imprt import Disqus def test_disqus(): @@ -15,12 +15,13 @@ def test_disqus(): xxx = tempfile.NamedTemporaryFile() db = SQLite3(xxx.name, Config.load(None)) - disqus(db, xml) + + dsq = Disqus(xml) + dsq.migrate(db) assert db.threads["/"]["title"] == "Hello, World!" 
def main():
    """Command-line entry point for the ``wynaut`` management tool.

    Parses the command line, loads the Isso configuration given with ``-c``
    (defaulting to the ``ISSO_SETTINGS`` environment variable) and runs the
    selected sub-command.  The only sub-command is ``import``, which migrates
    a Disqus XML export into the Isso SQLite database.

    With ``--dry-run`` the import is written to a temporary database file
    that is discarded afterwards, so the configured database is untouched.

    Raises:
        SystemExit: if the target database already contains comments and the
            user does not confirm the import (``--force`` skips the prompt).
    """

    parser = ArgumentParser(description="manage Isso")
    subparser = parser.add_subparsers(help="commands", dest="command")

    # the space keeps program name and version apart ("wynaut 0.5")
    parser.add_argument('--version', action='version',
                        version='%(prog)s ' + dist.version)
    parser.add_argument('-c', dest="conf", default=os.environ.get("ISSO_SETTINGS"),
                        metavar="/etc/isso.conf", help="set configuration file")

    imprt = subparser.add_parser('import', help="import Disqus XML export")
    imprt.add_argument("dump", metavar="FILE")
    imprt.add_argument("-f", "--force", dest="force", action="store_true",
                       help="force actions")
    imprt.add_argument("-n", "--dry-run", dest="dryrun", action="store_true",
                       help="perform a trial run with no changes made")

    args = parser.parse_args()
    conf = Config.load(args.conf)

    if args.command != "import":
        return

    # NOTE(review): the former `isso import` command switched the guard off
    # (conf.set("guard", "enabled", "off")) before migrating -- confirm that
    # this is no longer required here.

    # a scratch database is only needed for a dry run
    scratch = tempfile.NamedTemporaryFile() if args.dryrun else None
    try:
        if scratch is not None:
            dbpath = scratch.name
        else:
            dbpath = conf.get("general", "dbpath")

        # parse the dump first so a broken export fails before the DB is opened
        dsq = Disqus(args.dump)
        db = SQLite3(dbpath, conf)

        if db.execute("SELECT * FROM comments").fetchone():
            if not args.force and input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"):
                raise SystemExit("Abort.")

        dsq.migrate(db)
    finally:
        # always release (and thereby delete) the scratch database
        if scratch is not None:
            scratch.close()
class Import(object):
    """Base class for importers that report their progress on the terminal.

    Attributes:
        last: timestamp (``time.time()``) of the last progress line written;
            used to throttle output.
        cols: width of the terminal in columns, used to blank the status line.
    """

    def __init__(self):

        # 0 guarantees that the very first progress() call is printed
        self.last = 0
        self.cols = self._terminal_width()

    @staticmethod
    def _terminal_width(default=80):
        """Return the terminal width reported by ``stty size``.

        ``stty size`` prints "<rows> <columns>".  When there is no terminal,
        the output cannot be parsed or the reported width is not positive,
        `default` is returned instead.
        """
        try:
            pipe = os.popen('stty size', 'r')
        except OSError:
            return default

        try:
            cols = int(pipe.read().split()[1])
        except (IndexError, ValueError):
            cols = 0
        finally:
            pipe.close()

        return cols if cols > 0 else default

    def progress(self, current, max, msg):
        """Rewrite the status line as ``[<percent>%] <msg>``.

        Output is throttled: calls arriving less than 0.1 seconds after the
        last written line are ignored.

        Args:
            current: number of items processed so far.
            max: total number of items; 0 is reported as 100%.
            msg: text shown after the percentage, truncated to fit the line.
        """

        if time.time() - self.last < 0.1:
            return

        # float arithmetic regardless of the interpreter's division semantics
        percent = 100.0 * current / max if max else 100.0

        # 9 columns are reserved for the "[xx.x%] " prefix; never go negative,
        # a negative precision is not a valid format specification
        width = self.cols - 9
        if width < 0:
            width = 0

        sys.stdout.write("\r{0}".format(" " * self.cols))
        sys.stdout.write("\r[{0:.1f}%] {1:.{2}}".format(percent, msg.strip(), width))
        sys.stdout.flush()

        self.last = time.time()

    def done(self, msg):
        """Replace the status line with a final ``[100%] <msg>`` line."""
        sys.stdout.write("\r{0}".format(" " * self.cols))
        sys.stdout.write("\r[100%] {0}\n".format(msg.strip()))
        sys.stdout.flush()


class Disqus(Import):
    """Importer for a Disqus XML export.

    Args:
        xmlfile: file name or file object of the export, handed to
            ``ElementTree.parse``.
    """

    # XML namespaces used by the export, in ElementTree's "{uri}" notation
    ns = '{http://disqus.com}'
    internals = '{http://disqus.com/disqus-internals}'

    def __init__(self, xmlfile):

        super(Disqus, self).__init__()
        self.tree = ElementTree.parse(xmlfile)

        # Disqus ids of the threads / posts that have been written to the db
        self._threads = set()
        self._posts = set()

    @cached_property
    def threads(self):
        """Top-level ``<thread>`` elements that carry a non-empty ``<id>``.

        Thread elements with an empty id are (possibly?) duplicates and are
        skipped.
        """
        id_tag = Disqus.ns + "id"
        found = []
        for thr in self.tree.findall(Disqus.ns + "thread"):
            node = thr.find(id_tag)
            if node is not None and node.text is not None:
                found.append(thr)
        return found

    @cached_property
    def posts(self):
        """All top-level ``<post>`` elements of the export."""
        return self.tree.findall(Disqus.ns + "post")

    def migrate(self, db):
        """Write all threads and their posts into `db`.

        Posts are grouped by the Disqus id of their thread and inserted
        thread by thread.  Afterwards stale comments are removed, a summary
        is printed and every post that could not be assigned to an imported
        thread is listed as an orphan.

        Args:
            db: database object providing ``threads`` (supports ``in`` and
                ``new(path, title)``) and ``comments`` (``add(path, item)``
                returning a mapping with an ``"id"`` key, and
                ``_remove_stale()``).
        """
        # createdAt is UTC ("...Z"); timegm converts a UTC struct_time,
        # whereas time.mktime would interpret it in the local time zone
        from calendar import timegm

        ns, internals = Disqus.ns, Disqus.internals
        dsq_id = internals + 'id'

        # map thread id to list of posts
        rv = defaultdict(list)

        for post in self.posts:

            item = {
                'dsq:id': post.attrib.get(dsq_id),
                'text': post.find(ns + 'message').text,
                'author': post.find('{0}author/{0}name'.format(ns)).text,
                'email': post.find('{0}author/{0}email'.format(ns)).text,
                'created': float(timegm(strptime(
                    post.find(ns + 'createdAt').text, '%Y-%m-%dT%H:%M:%SZ'))),
                # placeholder address, the export's address is not used
                'remote_addr': '127.0.0.0',
                # presumably Isso's "accepted" (1) and "deleted" (4) states
                'mode': 1 if post.find(ns + 'isDeleted').text == "false" else 4,
            }

            parent = post.find(ns + 'parent')
            if parent is not None:
                item['dsq:parent'] = parent.attrib.get(dsq_id)

            rv[post.find(ns + 'thread').attrib.get(dsq_id)].append(item)

        # self.threads already excludes thread elements with an empty id
        total = len(self.threads)
        for i, thread in enumerate(self.threads):

            self.progress(i, total, thread.find(ns + 'id').text)

            thread_id = thread.attrib.get(dsq_id)
            if thread_id in rv:
                self._threads.add(thread_id)
                self._insert(db, thread, rv[thread_id])

        # in case a comment has been deleted (and no further childs)
        db.comments._remove_stale()

        self.done("{0} threads, {1} comments".format(len(self._threads), len(self._posts)))

        orphans = set(post.attrib.get(dsq_id) for post in self.posts) - self._posts
        if orphans:
            print("Found %i orphans:" % len(orphans))
            for post in self.posts:
                if post.attrib.get(dsq_id) not in orphans:
                    continue

                print(" * %s by %s <%s>" % (post.attrib.get(dsq_id),
                                            post.find("{0}author/{0}name".format(ns)).text,
                                            post.find("{0}author/{0}email".format(ns)).text))
                # an empty <message/> has no text; textwrap needs a string
                print(textwrap.fill(post.find(ns + "message").text or "",
                                    initial_indent=" ", subsequent_indent=" "))

    def _insert(self, db, thread, posts):
        """Insert `posts` (oldest first) into the thread's path in `db`.

        The thread is created when its path is not yet known to `db`.
        Disqus parent ids are translated to the ids returned by
        ``db.comments.add``; a parent that has not been inserted for this
        thread becomes ``None``.
        """

        path = urlparse(thread.find(Disqus.ns + 'link').text).path
        remap = dict()

        if path not in db.threads:
            # an empty <title/> has no text
            title = thread.find(Disqus.ns + 'title').text or ""
            db.threads.new(path, title.strip())

        # oldest first, so that parents are inserted before their replies
        for item in sorted(posts, key=lambda k: k['created']):

            post_id = item.pop('dsq:id')
            item['parent'] = remap.get(item.pop('dsq:parent', None))
            rv = db.comments.add(path, item)
            remap[post_id] = rv["id"]

        self._posts.update(remap)