From 910da2a6c033b2a51fad5e51993943d746475a39 Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Wed, 30 Apr 2014 15:07:11 +0200 Subject: [PATCH 1/7] refactor disqus migration code into a class --- isso/__init__.py | 14 +++- isso/migrate.py | 140 +++++++++++++++++++---------------- isso/tests/test_migration.py | 4 +- 3 files changed, 87 insertions(+), 71 deletions(-) diff --git a/isso/__init__.py b/isso/__init__.py index d4f7980..14fdcdb 100644 --- a/isso/__init__.py +++ b/isso/__init__.py @@ -214,11 +214,17 @@ def main(): conf = Config.load(args.conf) if args.command == "import": - xxx = tempfile.NamedTemporaryFile() - dbpath = conf.get("general", "dbpath") if not args.dryrun else xxx.name - conf.set("guard", "enabled", "off") - migrate.disqus(db.SQLite3(dbpath, conf), args.dump) + + if args.dryrun: + xxx = tempfile.NamedTemporaryFile() + dbpath = xxx.name + else: + dbpath = conf.get("general", "dbpath") + + mydb = db.SQLite3(dbpath, conf) + migrate.dispatch(mydb, args.dump) + sys.exit(0) if not any(conf.getiter("general", "host")): diff --git a/isso/migrate.py b/isso/migrate.py index 93d645f..eb2fe41 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -21,94 +21,104 @@ except ImportError: from xml.etree import ElementTree -ns = '{http://disqus.com}' -dsq = '{http://disqus.com/disqus-internals}' -threads = set([]) -comments = set([]) +class Disqus(object): + ns = '{http://disqus.com}' + internals = '{http://disqus.com/disqus-internals}' -def insert(db, thread, posts): + def __init__(self, db, xmlfile): + self.threads = set([]) + self.comments = set([]) - path = urlparse(thread.find('%slink' % ns).text).path - remap = dict() + self.db = db + self.xmlfile = xmlfile - if path not in db.threads: - db.threads.new(path, thread.find('%stitle' % ns).text.strip()) + def insert(self, thread, posts): - for item in sorted(posts, key=lambda k: k['created']): + path = urlparse(thread.find('%slink' % Disqus.ns).text).path + remap = dict() - dsq_id = item.pop('dsq:id') - 
item['parent'] = remap.get(item.pop('dsq:parent', None)) - rv = db.comments.add(path, item) - remap[dsq_id] = rv["id"] + if path not in self.db.threads: + self.db.threads.new(path, thread.find(Disqus.ns + 'title').text.strip()) - comments.update(set(remap.keys())) + for item in sorted(posts, key=lambda k: k['created']): + dsq_id = item.pop('dsq:id') + item['parent'] = remap.get(item.pop('dsq:parent', None)) + rv = self.db.comments.add(path, item) + remap[dsq_id] = rv["id"] -def disqus(db, xmlfile): + self.comments.update(set(remap.keys())) - if db.execute("SELECT * FROM comments").fetchone(): - if input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"): - raise SystemExit("Abort.") + def migrate(self): - tree = ElementTree.parse(xmlfile) - res = defaultdict(list) + tree = ElementTree.parse(self.xmlfile) + res = defaultdict(list) - for post in tree.findall('%spost' % ns): + for post in tree.findall('%spost' % Disqus.ns): - item = { - 'dsq:id': post.attrib.get(dsq + 'id'), - 'text': post.find('%smessage' % ns).text, - 'author': post.find('%sauthor/%sname' % (ns, ns)).text, - 'email': post.find('%sauthor/%semail' % (ns, ns)).text, - 'created': mktime(strptime( - post.find('%screatedAt' % ns).text, '%Y-%m-%dT%H:%M:%SZ')), - 'remote_addr': '127.0.0.0', - 'mode': 1 if post.find("%sisDeleted" % ns).text == "false" else 4 - } + item = { + 'dsq:id': post.attrib.get(Disqus.internals + 'id'), + 'text': post.find(Disqus.ns + 'message').text, + 'author': post.find('{0}author/{0}name'.format(Disqus.ns)).text, + 'email': post.find('{0}author/{0}email'.format(Disqus.ns)).text, + 'created': mktime(strptime( + post.find(Disqus.ns + 'createdAt').text, '%Y-%m-%dT%H:%M:%SZ')), + 'remote_addr': '127.0.0.0', + 'mode': 1 if post.find(Disqus.ns + "isDeleted").text == "false" else 4 + } - if post.find(ns + 'parent') is not None: - item['dsq:parent'] = post.find(ns + 'parent').attrib.get(dsq + 'id') + if post.find(Disqus.ns + 'parent') is not None: + item['dsq:parent'] = 
post.find(Disqus.ns + 'parent').attrib.get(Disqus.internals + 'id') - res[post.find('%sthread' % ns).attrib.get(dsq + 'id')].append(item) + res[post.find('%sthread' % Disqus.ns).attrib.get(Disqus.internals + 'id')].append(item) - num = len(tree.findall('%sthread' % ns)) - cols = int((os.popen('stty size', 'r').read() or "25 80").split()[1]) + num = len(tree.findall(Disqus.ns + 'thread')) + cols = int((os.popen('stty size', 'r').read() or "25 80").split()[1]) - for i, thread in enumerate(tree.findall('%sthread' % ns)): + for i, thread in enumerate(tree.findall(Disqus.ns + 'thread')): - if int(round((i+1)/num, 2) * 100) % 13 == 0: + if int(round((i+1)/num, 2) * 100) % 13 == 0: + sys.stdout.write("\r%s" % (" "*cols)) + sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find(Disqus.ns + 'id').text)) + sys.stdout.flush() - sys.stdout.write("\r%s" % (" "*cols)) - sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text)) - sys.stdout.flush() + # skip (possibly?) duplicate, but empty thread elements + if thread.find(Disqus.ns + 'id').text is None: + continue - # skip (possibly?) 
duplicate, but empty thread elements - if thread.find('%sid' % ns).text is None: - continue + id = thread.attrib.get(Disqus.internals + 'id') + if id in res: + self.threads.add(id) + self.insert(thread, res[id]) - id = thread.attrib.get(dsq + 'id') - if id in res: - threads.add(id) - insert(db, thread, res[id]) + # in case a comment has been deleted (and no further childs) + self.db.comments._remove_stale() - # in case a comment has been deleted (and no further childs) - db.comments._remove_stale() + sys.stdout.write("\r%s" % (" "*cols)) + sys.stdout.write("\r[100%] {0} threads, {1} comments\n".format( + len(self.threads), len(self.comments))) - sys.stdout.write("\r%s" % (" "*cols)) - sys.stdout.write("\r[100%%] %i threads, %i comments\n" % (len(threads), len(comments))) + orphans = set(map(lambda e: e.attrib.get(Disqus.internals + "id"), tree.findall(Disqus.ns + "post"))) - self.comments + if orphans: + print("Found %i orphans:" % len(orphans)) + for post in tree.findall(Disqus.ns + "post"): + if post.attrib.get(Disqus.internals + "id") not in orphans: + continue - orphans = set(map(lambda e: e.attrib.get(dsq + "id"), tree.findall("%spost" % ns))) - comments - if orphans: - print("Found %i orphans:" % len(orphans)) - for post in tree.findall("%spost" % ns): - if post.attrib.get(dsq + "id") not in orphans: - continue + print(" * {0} by {1} <{2}>".format( + post.attrib.get(Disqus.internals + "id"), + post.find("{0}author/{0}name".format(Disqus.ns)).text, + post.find("{0}author/{0}email".format(Disqus.ns)).text)) + print(textwrap.fill(post.find(Disqus.ns + "message").text, + initial_indent=" ", subsequent_indent=" ")) + print("") + + +def dispatch(db, dump): + if db.execute("SELECT * FROM comments").fetchone(): + if input("Isso DB is not empty! Continue? 
[y/N]: ") not in ("y", "Y"): + raise SystemExit("Abort.") - print(" * %s by %s <%s>" % (post.attrib.get(dsq + "id"), - post.find("%sauthor/%sname" % (ns, ns)).text, - post.find("%sauthor/%semail" % (ns, ns)).text)) - print(textwrap.fill(post.find("%smessage" % ns).text, - initial_indent=" ", subsequent_indent=" ")) - print("") + Disqus(db, dump).migrate() diff --git a/isso/tests/test_migration.py b/isso/tests/test_migration.py index 8a40ea2..ef1af6b 100644 --- a/isso/tests/test_migration.py +++ b/isso/tests/test_migration.py @@ -6,7 +6,7 @@ from os.path import join, dirname from isso.core import Config from isso.db import SQLite3 -from isso.migrate import disqus +from isso.migrate import Disqus def test_disqus(): @@ -15,7 +15,7 @@ def test_disqus(): xxx = tempfile.NamedTemporaryFile() db = SQLite3(xxx.name, Config.load(None)) - disqus(db, xml) + Disqus(db, xml).migrate() assert db.threads["/"]["title"] == "Hello, World!" assert db.threads["/"]["id"] == 1 From 0154113c80d98f7212d8eecded3d2254246814a8 Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Wed, 30 Apr 2014 15:24:16 +0200 Subject: [PATCH 2/7] replace assert with assertEqual --- isso/tests/test_migration.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/isso/tests/test_migration.py b/isso/tests/test_migration.py index ef1af6b..cb4236a 100644 --- a/isso/tests/test_migration.py +++ b/isso/tests/test_migration.py @@ -1,5 +1,10 @@ # -*- encoding: utf-8 -*- +try: + import unittest2 as unittest +except ImportError: + import unittest + import tempfile from os.path import join, dirname @@ -9,16 +14,26 @@ from isso.db import SQLite3 from isso.migrate import Disqus -def test_disqus(): +class TestMigration(unittest.TestCase): + + def test_disqus(self): + + xml = join(dirname(__file__), "disqus.xml") + xxx = tempfile.NamedTemporaryFile() + + db = SQLite3(xxx.name, Config.load(None)) + Disqus(db, xml).migrate() + + self.assertEqual(db.threads["/"]["title"], "Hello, 
World!") + self.assertEqual(db.threads["/"]["id"], 1) - xml = join(dirname(__file__), "disqus.xml") - xxx = tempfile.NamedTemporaryFile() + a = db.comments.get(1) - db = SQLite3(xxx.name, Config.load(None)) - Disqus(db, xml).migrate() + self.assertEqual(a["author"], "peter") + self.assertEqual(a["email"], "foo@bar.com") - assert db.threads["/"]["title"] == "Hello, World!" - assert db.threads["/"]["id"] == 1 + b = db.comments.get(2) + self.assertEqual(b["parent"] ,a["id"]) a = db.comments.get(1) From 12f8af8434c9f38b0dd2cb96fef3157f0d1d0950 Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Wed, 30 Apr 2014 18:46:44 +0200 Subject: [PATCH 3/7] add initial support to import WordPress comments --- isso/__init__.py | 4 +- isso/migrate.py | 130 +++++++++++++++++++++++++++++++++-- isso/tests/test_migration.py | 33 +++++++-- isso/tests/wordpress.xml | 119 ++++++++++++++++++++++++++++++++ 4 files changed, 274 insertions(+), 12 deletions(-) create mode 100644 isso/tests/wordpress.xml diff --git a/isso/__init__.py b/isso/__init__.py index 14fdcdb..7f4f4d0 100644 --- a/isso/__init__.py +++ b/isso/__init__.py @@ -207,6 +207,8 @@ def main(): imprt.add_argument("dump", metavar="FILE") imprt.add_argument("-n", "--dry-run", dest="dryrun", action="store_true", help="perform a trial run with no changes made") + imprt.add_argument("-t", "--type", dest="type", default=None, + choices=["disqus", "wordpress"], help="export type") serve = subparser.add_parser("run", help="run server") @@ -223,7 +225,7 @@ def main(): dbpath = conf.get("general", "dbpath") mydb = db.SQLite3(dbpath, conf) - migrate.dispatch(mydb, args.dump) + migrate.dispatch(args.type, mydb, args.dump) sys.exit(0) diff --git a/isso/migrate.py b/isso/migrate.py index eb2fe41..b549238 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -1,14 +1,18 @@ # -*- encoding: utf-8 -*- -from __future__ import division +from __future__ import division, print_function import sys import os +import io import textwrap -from time 
import mktime, strptime +from time import mktime, strptime, time from collections import defaultdict +from isso.utils import anonymize +from isso.compat import string_types + try: input = raw_input except NameError: @@ -22,6 +26,39 @@ except ImportError: from xml.etree import ElementTree +def strip(val): + if isinstance(val, string_types): + return val.strip() + return val + + +class Progress(object): + + def __init__(self, end): + self.end = end or 1 + + self.istty = sys.stdout.isatty() + self.last = 0 + + def update(self, i, message): + + if not self.istty or message is None: + return + + cols = int((os.popen('stty size', 'r').read()).split()[1]) + message = message[:cols - 7] + + if time() - self.last > 0.2: + sys.stdout.write("\r{0}".format(" " * cols)) + sys.stdout.write("\r[{0:.0%}] {1}".format(i/self.end, message)) + sys.stdout.flush() + self.last = time() + + def finish(self, message): + self.last = 0 + self.update(self.end, message + "\n") + + class Disqus(object): ns = '{http://disqus.com}' @@ -116,9 +153,94 @@ class Disqus(object): print("") -def dispatch(db, dump): +class WordPress(object): + + ns = "{http://wordpress.org/export/1.0/}" + + def __init__(self, db, xmlfile): + self.db = db + self.xmlfile = xmlfile + self.count = 0 + + def insert(self, thread): + + path = urlparse(thread.find("link").text).path + self.db.threads.new(path, thread.find("title").text.strip()) + + comments = list(map(WordPress.Comment, thread.findall(WordPress.ns + "comment"))) + comments.sort(key=lambda k: k["id"]) + + remap = {} + ids = set(c["id"] for c in comments) + + self.count += len(ids) + + while comments: + for i, item in enumerate(comments): + if item["parent"] in ids: + continue + + item["parent"] = remap.get(item["parent"], None) + rv = self.db.comments.add(path, item) + remap[item["id"]] = rv["id"] + + ids.remove(item["id"]) + comments.pop(i) + + break + else: + # should never happen, but... it's WordPress. 
+ return + + def migrate(self): + + tree = ElementTree.parse(self.xmlfile) + items = tree.findall("channel/item") + + progress = Progress(len(items)) + for i, thread in enumerate(items): + progress.update(i, thread.find("title").text) + self.insert(thread) + + progress.finish("{0} threads, {1} comments".format(len(items), self.count)) + + @classmethod + def Comment(cls, el): + return { + "text": strip(el.find(WordPress.ns + "comment_content").text), + "author": strip(el.find(WordPress.ns + "comment_author").text), + "email": strip(el.find(WordPress.ns + "comment_author_email").text), + "website": strip(el.find(WordPress.ns + "comment_author_url").text), + "remote_addr": anonymize( + strip(el.find(WordPress.ns + "comment_author_IP").text)), + "created": mktime(strptime( + strip(el.find(WordPress.ns + "comment_date_gmt").text), + "%Y-%m-%d %H:%M:%S")), + "mode": 1 if el.find(WordPress.ns + "comment_approved").text == "1" else 2, + "id": int(el.find(WordPress.ns + "comment_id").text), + "parent": int(el.find(WordPress.ns + "comment_parent").text) or None + } + + +def dispatch(type, db, dump): if db.execute("SELECT * FROM comments").fetchone(): if input("Isso DB is not empty! Continue? 
[y/N]: ") not in ("y", "Y"): raise SystemExit("Abort.") - Disqus(db, dump).migrate() + if type is None: + + with io.open(dump) as fp: + peek = fp.read(2048) + + if 'xmlns:wp="%s"' % WordPress.ns[1:-1] in peek: + type = "wordpress" + + if ' + + + 0 + 0 + + + 6 + + info@posativ.org + + + ::ffff:86.56.63.0 + 2014-04-29 15:21:27 + 2014-04-29 15:21:27 + + 1 + + 0 + 1 + + + 7 + + info@posativ.org + + + ::ffff:86.56.63.0 + 2014-04-29 15:21:35 + + 1 + 6 + + + 8 + + info@posativ.org + + + ::ffff:86.56.63.0 + 2014-04-29 15:21:45 + 2014-04-29 15:21:45 + + + 1 + + 7 + 1 + + + 9 + + info@posativ.org + + + ::ffff:86.56.63.0 + 2014-04-29 15:21:52 + 2014-04-29 15:21:52 + + 1 + + 7 + 1 + + + 10 + + info@posativ.org + + + ::ffff:86.56.63.0 + 2014-04-29 15:21:56 + 2014-04-29 15:21:56 + + 1 + + 0 + 1 + + + + \ No newline at end of file From 39101c2ac7998f1f991ed865434e85a14579ca8f Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Fri, 2 May 2014 11:40:16 +0200 Subject: [PATCH 4/7] Disqus import uses Progressbar class as well --- isso/migrate.py | 15 ++++----------- isso/tests/test_migration.py | 2 ++ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/isso/migrate.py b/isso/migrate.py index b549238..2a985f1 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -93,7 +93,7 @@ class Disqus(object): tree = ElementTree.parse(self.xmlfile) res = defaultdict(list) - for post in tree.findall('%spost' % Disqus.ns): + for post in tree.findall(Disqus.ns + 'post'): item = { 'dsq:id': post.attrib.get(Disqus.internals + 'id'), @@ -111,15 +111,9 @@ class Disqus(object): res[post.find('%sthread' % Disqus.ns).attrib.get(Disqus.internals + 'id')].append(item) - num = len(tree.findall(Disqus.ns + 'thread')) - cols = int((os.popen('stty size', 'r').read() or "25 80").split()[1]) - + progress = Progress(len(tree.findall(Disqus.ns + 'thread'))) for i, thread in enumerate(tree.findall(Disqus.ns + 'thread')): - - if int(round((i+1)/num, 2) * 100) % 13 == 0: - sys.stdout.write("\r%s" % (" 
"*cols)) - sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find(Disqus.ns + 'id').text)) - sys.stdout.flush() + progress.update(i, thread.find(Disqus.ns + 'id').text) # skip (possibly?) duplicate, but empty thread elements if thread.find(Disqus.ns + 'id').text is None: @@ -133,8 +127,7 @@ class Disqus(object): # in case a comment has been deleted (and no further childs) self.db.comments._remove_stale() - sys.stdout.write("\r%s" % (" "*cols)) - sys.stdout.write("\r[100%] {0} threads, {1} comments\n".format( + progress.finish("{0} threads, {1} comments".format( len(self.threads), len(self.comments))) orphans = set(map(lambda e: e.attrib.get(Disqus.internals + "id"), tree.findall(Disqus.ns + "post"))) - self.comments diff --git a/isso/tests/test_migration.py b/isso/tests/test_migration.py index fd5a68c..29adbd5 100644 --- a/isso/tests/test_migration.py +++ b/isso/tests/test_migration.py @@ -24,6 +24,8 @@ class TestMigration(unittest.TestCase): db = SQLite3(xxx.name, Config.load(None)) Disqus(db, xml).migrate() + self.assertEqual(len(db.execute("SELECT id FROM comments").fetchall()), 2) + self.assertEqual(db.threads["/"]["title"], "Hello, World!") self.assertEqual(db.threads["/"]["id"], 1) From 346b60a9b39a37ecdebcd41b40dd3fd2c19addd0 Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Fri, 2 May 2014 11:43:13 +0200 Subject: [PATCH 5/7] disqus import imports anonymized IP address now --- isso/migrate.py | 2 +- isso/tests/test_migration.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/isso/migrate.py b/isso/migrate.py index 2a985f1..a2089ff 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -102,7 +102,7 @@ class Disqus(object): 'email': post.find('{0}author/{0}email'.format(Disqus.ns)).text, 'created': mktime(strptime( post.find(Disqus.ns + 'createdAt').text, '%Y-%m-%dT%H:%M:%SZ')), - 'remote_addr': '127.0.0.0', + 'remote_addr': anonymize(post.find(Disqus.ns + 'ipAddress').text), 'mode': 1 if post.find(Disqus.ns + 
"isDeleted").text == "false" else 4 } diff --git a/isso/tests/test_migration.py b/isso/tests/test_migration.py index 29adbd5..9634ae3 100644 --- a/isso/tests/test_migration.py +++ b/isso/tests/test_migration.py @@ -33,6 +33,7 @@ class TestMigration(unittest.TestCase): self.assertEqual(a["author"], "peter") self.assertEqual(a["email"], "foo@bar.com") + self.assertEqual(a["remote_addr"], "127.0.0.0") b = db.comments.get(2) self.assertEqual(b["parent"], a["id"]) @@ -53,6 +54,7 @@ class TestMigration(unittest.TestCase): first = db.comments.get(1) self.assertEqual(first["author"], "Ohai") self.assertEqual(first["text"], "Erster!1") + self.assertEqual(first["remote_addr"], "82.119.20.0") second = db.comments.get(2) self.assertEqual(second["author"], "Tester") From 333bba728b3a096d2d7805fb554221cec47aa3c3 Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Fri, 2 May 2014 11:57:47 +0200 Subject: [PATCH 6/7] update docs --- docs/docs/quickstart.rst | 24 ++++++++++++------------ docs/index.html | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/docs/quickstart.rst b/docs/docs/quickstart.rst index 59e31ce..e497b80 100644 --- a/docs/docs/quickstart.rst +++ b/docs/docs/quickstart.rst @@ -61,24 +61,24 @@ For more options, see :doc:`server ` and :doc:`client Migration --------- -You can migrate your existing comments from Disqus_. Log into Disqus, go to -your website, click on *Discussions* and select the *Export* tab. You'll -receive an email with your comments. Unfortunately, Disqus does not export -up- and downvotes. +You can import comments from Disqus_ or WordPress_. -To import existing comments, run Isso with your configuration file: +To export your comments from Disqus, log into Disqus, go to your website, click +on *Discussions* and select the *Export* tab. You'll receive an email with your +comments. Unfortunately, Disqus does not export up- and downvotes. -.. 
code-block:: sh +To export comments from your previous WordPress installation, go to *Tools*, +export your data. - ~> isso -c /path/to/isso.cfg import user-2013-09-02T11_39_22.971478-all.xml - [100%] 53 threads, 192 comments +Now import the XML dump: -Migration from WordPress_ is not possible, yet (WordPress does not export -comments). As a workaround, install the Disqus plugin, export to Disqus and -then migrate to Isso. +.. code-block:: sh -.. _Disqus: + ~> isso -c /path/to/isso.cfg import disqus-or-wordpress.xml + [100%] 53 threads, 192 comments +.. _Disqus: https://disqus.com/ +.. _WordPress: https://wordpress.org/ Running Isso ------------ diff --git a/docs/index.html b/docs/index.html index 9ed4a58..0732e58 100644 --- a/docs/index.html +++ b/docs/index.html @@ -25,8 +25,8 @@

Because comments are not Big Data.

  • -

    Disqus Import

    -

    You can migrate your Disqus comments without any hassle.

    +

    Disqus/WordPress Import

    +

    You can migrate your Disqus/WordPress comments without any hassle.

  • client-side JavaScript

    From 123ea26ca902bcaf8a290b1d130841b946f25ccc Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Fri, 2 May 2014 13:06:06 +0200 Subject: [PATCH 7/7] handle WP's query-string "pages" and variable WXR namespaces Site links such as /?p=1234 are imported *as is* and may work in Isso. Do not use a query-based URL structure as permalinks. Ever. Also, depending on the pages you are going to export, the WXR XML namespace may change from ../export/1.0/ to ../export/1.2/. Isso tries to import any WXR 1.x version. --- docs/docs/quickstart.rst | 3 +- isso/migrate.py | 69 +++++++++++++++++++++++++++--------- isso/tests/test_migration.py | 29 ++++++++++++++- isso/tests/wordpress.xml | 25 +++++++++++++ 4 files changed, 107 insertions(+), 19 deletions(-) diff --git a/docs/docs/quickstart.rst b/docs/docs/quickstart.rst index e497b80..8fa9b0f 100644 --- a/docs/docs/quickstart.rst +++ b/docs/docs/quickstart.rst @@ -68,7 +68,8 @@ on *Discussions* and select the *Export* tab. You'll receive an email with your comments. Unfortunately, Disqus does not export up- and downvotes. To export comments from your previous WordPress installation, go to *Tools*, -export your data. +export your data. WordPress WXR import is quite new and may not work for you; +please report any failures. 
Now import the XML dump: diff --git a/isso/migrate.py b/isso/migrate.py index a2089ff..ed9fe1d 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -5,6 +5,7 @@ from __future__ import division, print_function import sys import os import io +import re import textwrap from time import mktime, strptime, time @@ -145,6 +146,14 @@ class Disqus(object): initial_indent=" ", subsequent_indent=" ")) print("") + @classmethod + def detect(cls, peek): + + if 'xmlns="http://disqus.com' in peek: + return "http://disqus.com" + + return None + class WordPress(object): @@ -155,12 +164,23 @@ class WordPress(object): self.xmlfile = xmlfile self.count = 0 + with io.open(xmlfile) as fp: + ns = WordPress.detect(fp.read(io.DEFAULT_BUFFER_SIZE)) + + if ns: + self.ns = "{" + ns + "}" + def insert(self, thread): - path = urlparse(thread.find("link").text).path + url = urlparse(thread.find("link").text) + path = url.path + + if url.query: + path += "?" + url.query + self.db.threads.new(path, thread.find("title").text.strip()) - comments = list(map(WordPress.Comment, thread.findall(WordPress.ns + "comment"))) + comments = list(map(self.Comment, thread.findall(self.ns + "comment"))) comments.sort(key=lambda k: k["id"]) remap = {} @@ -188,32 +208,47 @@ class WordPress(object): def migrate(self): tree = ElementTree.parse(self.xmlfile) + + skip = 0 items = tree.findall("channel/item") progress = Progress(len(items)) for i, thread in enumerate(items): + if thread.find("title").text is None or thread.find(self.ns + "comment") is None: + skip += 1 + continue + progress.update(i, thread.find("title").text) self.insert(thread) - progress.finish("{0} threads, {1} comments".format(len(items), self.count)) + progress.finish("{0} threads, {1} comments".format( + len(items) - skip, self.count)) - @classmethod - def Comment(cls, el): + def Comment(self, el): return { - "text": strip(el.find(WordPress.ns + "comment_content").text), - "author": strip(el.find(WordPress.ns + "comment_author").text), - "email": 
strip(el.find(WordPress.ns + "comment_author_email").text), - "website": strip(el.find(WordPress.ns + "comment_author_url").text), + "text": strip(el.find(self.ns + "comment_content").text), + "author": strip(el.find(self.ns + "comment_author").text), + "email": strip(el.find(self.ns + "comment_author_email").text), + "website": strip(el.find(self.ns + "comment_author_url").text), "remote_addr": anonymize( - strip(el.find(WordPress.ns + "comment_author_IP").text)), + strip(el.find(self.ns + "comment_author_IP").text)), "created": mktime(strptime( - strip(el.find(WordPress.ns + "comment_date_gmt").text), + strip(el.find(self.ns + "comment_date_gmt").text), "%Y-%m-%d %H:%M:%S")), - "mode": 1 if el.find(WordPress.ns + "comment_approved").text == "1" else 2, - "id": int(el.find(WordPress.ns + "comment_id").text), - "parent": int(el.find(WordPress.ns + "comment_parent").text) or None + "mode": 1 if el.find(self.ns + "comment_approved").text == "1" else 2, + "id": int(el.find(self.ns + "comment_id").text), + "parent": int(el.find(self.ns + "comment_parent").text) or None } + @classmethod + def detect(cls, peek): + + m = re.search("http://wordpress.org/export/1\.\d/", peek) + if m: + return m.group(0) + + return None + def dispatch(type, db, dump): if db.execute("SELECT * FROM comments").fetchone(): @@ -223,12 +258,12 @@ def dispatch(type, db, dump): if type is None: with io.open(dump) as fp: - peek = fp.read(2048) + peek = fp.read(io.DEFAULT_BUFFER_SIZE) - if 'xmlns:wp="%s"' % WordPress.ns[1:-1] in peek: + if WordPress.detect(peek): type = "wordpress" - if ' + 1 + + + + ... + http://example.tld/?p=4 + + 11 + + info@posativ.org + + + ::ffff:86.56.63.0 + 2014-04-29 15:21:56 + 2014-04-29 15:21:57 + + 1 + + 0 + 1 + + + + No comments + http://example.tld/?p=6 + \ No newline at end of file