From 123ea26ca902bcaf8a290b1d130841b946f25ccc Mon Sep 17 00:00:00 2001
From: Martin Zimmermann <info@posativ.org>
Date: Fri, 2 May 2014 13:06:06 +0200
Subject: [PATCH] handle WP's query-string "pages" and variable WXR namespaces

Site links such as /?p=1234 are imported *as is* and may work in
Isso. Do not use a query-based URL structure as permalinks. Ever.

Also, depending on the pages you are going to export, WXR's XML namespace
may change from ../export/1.0/ to ../export/1.2/. Isso now tries to import
any WXR 1.x dump.
---
 docs/docs/quickstart.rst     |  3 +-
 isso/migrate.py              | 69 +++++++++++++++++++++++++++---------
 isso/tests/test_migration.py | 29 ++++++++++++++-
 isso/tests/wordpress.xml     | 25 +++++++++++++
 4 files changed, 107 insertions(+), 19 deletions(-)

diff --git a/docs/docs/quickstart.rst b/docs/docs/quickstart.rst
index e497b80..8fa9b0f 100644
--- a/docs/docs/quickstart.rst
+++ b/docs/docs/quickstart.rst
@@ -68,7 +68,8 @@ on *Discussions* and select the *Export* tab. You'll receive an email with your
 comments. Unfortunately, Disqus does not export up- and downvotes.
 
 To export comments from your previous WordPress installation, go to *Tools*,
-export your data.
+export your data. WordPress WXR import is quite new and may not work for you;
+please report any failures.
 
 Now import the XML dump:
 
diff --git a/isso/migrate.py b/isso/migrate.py
index a2089ff..ed9fe1d 100644
--- a/isso/migrate.py
+++ b/isso/migrate.py
@@ -5,6 +5,7 @@ from __future__ import division, print_function
 import sys
 import os
 import io
+import re
 import textwrap
 
 from time import mktime, strptime, time
@@ -145,6 +146,14 @@ class Disqus(object):
                                     initial_indent="  ", subsequent_indent="  "))
                 print("")
 
+    @classmethod
+    def detect(cls, peek):
+        """Return "http://disqus.com" if `peek` has a Disqus xmlns, else None."""
+        if 'xmlns="http://disqus.com' in peek:
+            return "http://disqus.com"
+
+        return None
+
 
 class WordPress(object):
 
@@ -155,12 +164,23 @@ class WordPress(object):
         self.xmlfile = xmlfile
         self.count = 0
 
+        with io.open(xmlfile) as fp:
+            ns = WordPress.detect(fp.read(io.DEFAULT_BUFFER_SIZE))
+
+        if ns:
+            self.ns = "{" + ns + "}"
+
     def insert(self, thread):
 
-        path = urlparse(thread.find("link").text).path
+        url = urlparse(thread.find("link").text)
+        path = url.path
+
+        if url.query:
+            path += "?" + url.query
+
         self.db.threads.new(path, thread.find("title").text.strip())
 
-        comments = list(map(WordPress.Comment, thread.findall(WordPress.ns + "comment")))
+        comments = list(map(self.Comment, thread.findall(self.ns + "comment")))
         comments.sort(key=lambda k: k["id"])
 
         remap = {}
@@ -188,32 +208,47 @@ class WordPress(object):
     def migrate(self):
 
         tree = ElementTree.parse(self.xmlfile)
+
+        skip = 0
         items = tree.findall("channel/item")
 
         progress = Progress(len(items))
         for i, thread in enumerate(items):
+            if thread.find("title").text is None or thread.find(self.ns + "comment") is None:
+                skip += 1
+                continue
+
             progress.update(i, thread.find("title").text)
             self.insert(thread)
 
-        progress.finish("{0} threads, {1} comments".format(len(items), self.count))
+        progress.finish("{0} threads, {1} comments".format(
+            len(items) - skip, self.count))
 
-    @classmethod
-    def Comment(cls, el):
+    def Comment(self, el):
         return {
-            "text": strip(el.find(WordPress.ns + "comment_content").text),
-            "author": strip(el.find(WordPress.ns + "comment_author").text),
-            "email": strip(el.find(WordPress.ns + "comment_author_email").text),
-            "website": strip(el.find(WordPress.ns + "comment_author_url").text),
+            "text": strip(el.find(self.ns + "comment_content").text),
+            "author": strip(el.find(self.ns + "comment_author").text),
+            "email": strip(el.find(self.ns + "comment_author_email").text),
+            "website": strip(el.find(self.ns + "comment_author_url").text),
             "remote_addr": anonymize(
-                strip(el.find(WordPress.ns + "comment_author_IP").text)),
+                strip(el.find(self.ns + "comment_author_IP").text)),
             "created": mktime(strptime(
-                strip(el.find(WordPress.ns + "comment_date_gmt").text),
+                strip(el.find(self.ns + "comment_date_gmt").text),
                 "%Y-%m-%d %H:%M:%S")),
-            "mode": 1 if el.find(WordPress.ns + "comment_approved").text == "1" else 2,
-            "id": int(el.find(WordPress.ns + "comment_id").text),
-            "parent": int(el.find(WordPress.ns + "comment_parent").text) or None
+            "mode": 1 if el.find(self.ns + "comment_approved").text == "1" else 2,
+            "id": int(el.find(self.ns + "comment_id").text),
+            "parent": int(el.find(self.ns + "comment_parent").text) or None
         }
 
+    @classmethod
+    def detect(cls, peek):
+        """Return the WXR 1.x namespace URI found in `peek`, or None."""
+        m = re.search(r"http://wordpress\.org/export/1\.\d/", peek)
+        if m:
+            return m.group(0)
+
+        return None
+
 
 def dispatch(type, db, dump):
         if db.execute("SELECT * FROM comments").fetchone():
@@ -223,12 +258,12 @@ def dispatch(type, db, dump):
         if type is None:
 
             with io.open(dump) as fp:
-                peek = fp.read(2048)
+                peek = fp.read(io.DEFAULT_BUFFER_SIZE)
 
-            if 'xmlns:wp="%s"' % WordPress.ns[1:-1] in peek:
+            if WordPress.detect(peek):
                 type = "wordpress"
 
-            if '<disqus xmlns=' in peek:
+            if Disqus.detect(peek):
                 type = "disqus"
 
         if type == "wordpress":
diff --git a/isso/tests/test_migration.py b/isso/tests/test_migration.py
index 9634ae3..88e3a1c 100644
--- a/isso/tests/test_migration.py
+++ b/isso/tests/test_migration.py
@@ -1,5 +1,7 @@
 # -*- encoding: utf-8 -*-
 
+from __future__ import unicode_literals
+
 try:
     import unittest2 as unittest
 except ImportError:
@@ -49,7 +51,11 @@ class TestMigration(unittest.TestCase):
         self.assertEqual(db.threads["/2014/test/"]["title"], "Hello, World!")
         self.assertEqual(db.threads["/2014/test/"]["id"], 1)
 
-        self.assertEqual(len(db.execute("SELECT id FROM comments").fetchall()), 6)
+        self.assertEqual(db.threads["/?p=4"]["title"], "...")
+        self.assertEqual(db.threads["/?p=4"]["id"], 2)
+
+        self.assertEqual(len(db.execute("SELECT id FROM threads").fetchall()), 2)
+        self.assertEqual(len(db.execute("SELECT id FROM comments").fetchall()), 7)
 
         first = db.comments.get(1)
         self.assertEqual(first["author"], "Ohai")
@@ -66,3 +72,24 @@ class TestMigration(unittest.TestCase):
         last = db.comments.get(6)
         self.assertEqual(last["author"], "Letzter :/")
         self.assertEqual(last["parent"], None)
+
+    def test_detection(self):
+        # detect() only sees the head of a dump, so feed it bare XML prologues
+        wp = """\
+                <?xml version="1.0" encoding="UTF-8"?>
+                <rss version="2.0"
+                    xmlns:content="http://purl.org/rss/1.0/modules/content/"
+                    xmlns:dc="http://purl.org/dc/elements/1.1/"
+                    xmlns:wp="http://wordpress.org/export/%s/">"""
+
+        self.assertEqual(WordPress.detect(wp % "invalid"), None)
+
+        for version in ("1.0", "1.1", "1.2", "1.3"):
+            self.assertEqual(WordPress.detect(wp % version),
+                             "http://wordpress.org/export/%s/" % version)
+
+        dq = '''\
+        <?xml version="1.0"?>
+        <disqus xmlns="http://disqus.com"
+                xmlns:dsq="http://disqus.com/disqus-internals"'''
+        self.assertIsNotNone(Disqus.detect(dq))
diff --git a/isso/tests/wordpress.xml b/isso/tests/wordpress.xml
index 012795f..bf1bb8a 100644
--- a/isso/tests/wordpress.xml
+++ b/isso/tests/wordpress.xml
@@ -115,5 +115,30 @@
                 <wp:comment_user_id>1</wp:comment_user_id>
             </wp:comment>
         </item>
+
+        <!-- handle ?p=X urls -->
+        <item>
+            <title>...</title>
+            <link>http://example.tld/?p=4</link>
+             <wp:comment>
+                <wp:comment_id>11</wp:comment_id>
+                <wp:comment_author><![CDATA[Anonymous]]></wp:comment_author>
+                <wp:comment_author_email>info@posativ.org
+                </wp:comment_author_email>
+                <wp:comment_author_url></wp:comment_author_url>
+                <wp:comment_author_IP>::ffff:86.56.63.0</wp:comment_author_IP>
+                <wp:comment_date>2014-04-29 15:21:56</wp:comment_date>
+                <wp:comment_date_gmt>2014-04-29 15:21:57</wp:comment_date_gmt>
+                <wp:comment_content><![CDATA[...]]></wp:comment_content>
+                <wp:comment_approved>1</wp:comment_approved>
+                <wp:comment_type></wp:comment_type>
+                <wp:comment_parent>0</wp:comment_parent>
+                <wp:comment_user_id>1</wp:comment_user_id>
+            </wp:comment>
+        </item>
+        <item>
+            <title>No comments</title>
+            <link>http://example.tld/?p=6</link>
+        </item>
     </channel>
 </rss>
\ No newline at end of file