handle WP's query-string "pages" and variable WXR namespaces

Site links such as /?p=1234 are imported *as is* and may work in Isso. Do not use a query-based URL structure for permalinks. Ever. Also, depending on the pages you are going to export, the WXR XML namespace may change from ../export/1.0/ to ../export/1.2/. Isso tries to import any WXR 1.x export.
2014-05-02 13:06:06 +02:00 · 2014-05-02 13:06:06 +02:00 · 123ea26ca9
commit 123ea26ca9
parent 333bba728b
4 changed files with 111 additions and 23 deletions
--- a/docs/docs/quickstart.rst
+++ b/docs/docs/quickstart.rst
@ -68,7 +68,8 @@ on *Discussions* and select the *Export* tab. You'll receive an email with your
 comments. Unfortunately, Disqus does not export up- and downvotes.

 To export comments from your previous WordPress installation, go to *Tools*,
-export your data.
+export your data. WordPress WXR import is quite new and may not work for you;
+please report any failures.

 Now import the XML dump:

--- a/isso/migrate.py
+++ b/isso/migrate.py
@ -5,6 +5,7 @@ from __future__ import division, print_function
 import sys
 import os
 import io
+import re
 import textwrap

 from time import mktime, strptime, time
@ -145,6 +146,14 @@ class Disqus(object):
                                    initial_indent="  ", subsequent_indent="  "))
                print("")

+    @classmethod
+    def detect(cls, peek):
+
+        if 'xmlns="http://disqus.com' in peek:
+            return "http://disqus.com"
+
+        return None
+

 class WordPress(object):

@ -155,12 +164,23 @@ class WordPress(object):
        self.xmlfile = xmlfile
        self.count = 0

+        with io.open(xmlfile) as fp:
+            ns = WordPress.detect(fp.read(io.DEFAULT_BUFFER_SIZE))
+
+        if ns:
+            self.ns = "{" + ns + "}"
+
    def insert(self, thread):

-        path = urlparse(thread.find("link").text).path
+        url = urlparse(thread.find("link").text)
+        path = url.path
+
+        if url.query:
+            path += "?" + url.query
+
        self.db.threads.new(path, thread.find("title").text.strip())

-        comments = list(map(WordPress.Comment, thread.findall(WordPress.ns + "comment")))
+        comments = list(map(self.Comment, thread.findall(self.ns + "comment")))
        comments.sort(key=lambda k: k["id"])

        remap = {}
@ -188,31 +208,46 @@ class WordPress(object):
    def migrate(self):

        tree = ElementTree.parse(self.xmlfile)
+
+        skip = 0
        items = tree.findall("channel/item")

        progress = Progress(len(items))
        for i, thread in enumerate(items):
+            if thread.find("title").text is None or thread.find(self.ns + "comment") is None:
+                skip += 1
+                continue
+
            progress.update(i, thread.find("title").text)
            self.insert(thread)

-        progress.finish("{0} threads, {1} comments".format(len(items), self.count))
+        progress.finish("{0} threads, {1} comments".format(
+            len(items) - skip, self.count))
+
+    def Comment(self, el):
+        return {
+            "text": strip(el.find(self.ns + "comment_content").text),
+            "author": strip(el.find(self.ns + "comment_author").text),
+            "email": strip(el.find(self.ns + "comment_author_email").text),
+            "website": strip(el.find(self.ns + "comment_author_url").text),
+            "remote_addr": anonymize(
+                strip(el.find(self.ns + "comment_author_IP").text)),
+            "created": mktime(strptime(
+                strip(el.find(self.ns + "comment_date_gmt").text),
+                "%Y-%m-%d %H:%M:%S")),
+            "mode": 1 if el.find(self.ns + "comment_approved").text == "1" else 2,
+            "id": int(el.find(self.ns + "comment_id").text),
+            "parent": int(el.find(self.ns + "comment_parent").text) or None
+        }

    @classmethod
-    def Comment(cls, el):
-        return {
-            "text": strip(el.find(WordPress.ns + "comment_content").text),
-            "author": strip(el.find(WordPress.ns + "comment_author").text),
-            "email": strip(el.find(WordPress.ns + "comment_author_email").text),
-            "website": strip(el.find(WordPress.ns + "comment_author_url").text),
-            "remote_addr": anonymize(
-                strip(el.find(WordPress.ns + "comment_author_IP").text)),
-            "created": mktime(strptime(
-                strip(el.find(WordPress.ns + "comment_date_gmt").text),
-                "%Y-%m-%d %H:%M:%S")),
-            "mode": 1 if el.find(WordPress.ns + "comment_approved").text == "1" else 2,
-            "id": int(el.find(WordPress.ns + "comment_id").text),
-            "parent": int(el.find(WordPress.ns + "comment_parent").text) or None
-        }
+    def detect(cls, peek):
+
+        m = re.search("http://wordpress.org/export/1\.\d/", peek)
+        if m:
+            return m.group(0)
+
+        return None


 def dispatch(type, db, dump):
@ -223,12 +258,12 @@ def dispatch(type, db, dump):
        if type is None:

            with io.open(dump) as fp:
-                peek = fp.read(2048)
+                peek = fp.read(io.DEFAULT_BUFFER_SIZE)

-            if 'xmlns:wp="%s"' % WordPress.ns[1:-1] in peek:
+            if WordPress.detect(peek):
                type = "wordpress"

-            if '<disqus xmlns=' in peek:
+            if Disqus.detect(peek):
                type = "disqus"

        if type == "wordpress":
--- a/isso/tests/test_migration.py
+++ b/isso/tests/test_migration.py
@ -1,5 +1,7 @@
 # -*- encoding: utf-8 -*-

+from __future__ import unicode_literals
+
 try:
    import unittest2 as unittest
 except ImportError:
@ -49,7 +51,11 @@ class TestMigration(unittest.TestCase):
        self.assertEqual(db.threads["/2014/test/"]["title"], "Hello, World!")
        self.assertEqual(db.threads["/2014/test/"]["id"], 1)

-        self.assertEqual(len(db.execute("SELECT id FROM comments").fetchall()), 6)
+        self.assertEqual(db.threads["/?p=4"]["title"], "...")
+        self.assertEqual(db.threads["/?p=4"]["id"], 2)
+
+        self.assertEqual(len(db.execute("SELECT id FROM threads").fetchall()), 2)
+        self.assertEqual(len(db.execute("SELECT id FROM comments").fetchall()), 7)

        first = db.comments.get(1)
        self.assertEqual(first["author"], "Ohai")
@ -66,3 +72,24 @@ class TestMigration(unittest.TestCase):
        last = db.comments.get(6)
        self.assertEqual(last["author"], "Letzter :/")
        self.assertEqual(last["parent"], None)
+
+    def test_detection(self):
+
+        wp = """\
+                <?xml version="1.0" encoding="UTF-8"?>
+                <rss version="2.0"
+                    xmlns:content="http://purl.org/rss/1.0/modules/content/"
+                    xmlns:dc="http://purl.org/dc/elements/1.1/"
+                    xmlns:wp="http://wordpress.org/export/%s/">"""
+
+        self.assertEqual(WordPress.detect(wp % "invalid"), None)
+
+        for version in ("1.0", "1.1", "1.2", "1.3"):
+            self.assertEqual(WordPress.detect(wp % version),
+                             "http://wordpress.org/export/%s/" % version)
+
+        dq = '''\
+        <?xml version="1.0"?>
+        <disqus xmlns="http://disqus.com"
+                xmlns:dsq="http://disqus.com/disqus-internals"'''
+        self.assertIsNotNone(Disqus.detect(dq))
--- a/isso/tests/wordpress.xml
+++ b/isso/tests/wordpress.xml
@ -115,5 +115,30 @@
                <wp:comment_user_id>1</wp:comment_user_id>
            </wp:comment>
        </item>
+
+        <!-- handle ?p=X urls -->
+        <item>
+            <title>...</title>
+            <link>http://example.tld/?p=4</link>
+             <wp:comment>
+                <wp:comment_id>11</wp:comment_id>
+                <wp:comment_author><![CDATA[Anonymous]]></wp:comment_author>
+                <wp:comment_author_email>info@posativ.org
+                </wp:comment_author_email>
+                <wp:comment_author_url></wp:comment_author_url>
+                <wp:comment_author_IP>::ffff:86.56.63.0</wp:comment_author_IP>
+                <wp:comment_date>2014-04-29 15:21:56</wp:comment_date>
+                <wp:comment_date_gmt>2014-04-29 15:21:57</wp:comment_date_gmt>
+                <wp:comment_content><![CDATA[...]]></wp:comment_content>
+                <wp:comment_approved>1</wp:comment_approved>
+                <wp:comment_type></wp:comment_type>
+                <wp:comment_parent>0</wp:comment_parent>
+                <wp:comment_user_id>1</wp:comment_user_id>
+            </wp:comment>
+        </item>
+        <item>
+            <title>No comments</title>
+            <link>http://example.tld/?p=6</link>
+        </item>
    </channel>
 </rss>