From 123ea26ca902bcaf8a290b1d130841b946f25ccc Mon Sep 17 00:00:00 2001 From: Martin Zimmermann Date: Fri, 2 May 2014 13:06:06 +0200 Subject: [PATCH] handle WP's query-string "pages" and variable WXR namespaces Site links such as /?p=1234 are imported *as is* and may work in Isso. Do not use a query-based URL structure as permalinks. Ever. Also, depending on the pages you are going to export, WXR's XML namespace may change from ../export/1.0/ to ../export/1.2/. Isso tries to import any WXR 1.x version. --- docs/docs/quickstart.rst | 3 +- isso/migrate.py | 69 +++++++++++++++++++++++++++--------- isso/tests/test_migration.py | 29 ++++++++++++++- isso/tests/wordpress.xml | 25 +++++++++++++ 4 files changed, 107 insertions(+), 19 deletions(-) diff --git a/docs/docs/quickstart.rst b/docs/docs/quickstart.rst index e497b80..8fa9b0f 100644 --- a/docs/docs/quickstart.rst +++ b/docs/docs/quickstart.rst @@ -68,7 +68,8 @@ on *Discussions* and select the *Export* tab. You'll receive an email with your comments. Unfortunately, Disqus does not export up- and downvotes. To export comments from your previous WordPress installation, go to *Tools*, -export your data. +export your data. WordPress WXR import is quite new and may not work for you; +please report any failures. 
Now import the XML dump: diff --git a/isso/migrate.py b/isso/migrate.py index a2089ff..ed9fe1d 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -5,6 +5,7 @@ from __future__ import division, print_function import sys import os import io +import re import textwrap from time import mktime, strptime, time @@ -145,6 +146,14 @@ class Disqus(object): initial_indent=" ", subsequent_indent=" ")) print("") + @classmethod + def detect(cls, peek): + + if 'xmlns="http://disqus.com' in peek: + return "http://disqus.com" + + return None + class WordPress(object): @@ -155,12 +164,23 @@ class WordPress(object): self.xmlfile = xmlfile self.count = 0 + with io.open(xmlfile) as fp: + ns = WordPress.detect(fp.read(io.DEFAULT_BUFFER_SIZE)) + + if ns: + self.ns = "{" + ns + "}" + def insert(self, thread): - path = urlparse(thread.find("link").text).path + url = urlparse(thread.find("link").text) + path = url.path + + if url.query: + path += "?" + url.query + self.db.threads.new(path, thread.find("title").text.strip()) - comments = list(map(WordPress.Comment, thread.findall(WordPress.ns + "comment"))) + comments = list(map(self.Comment, thread.findall(self.ns + "comment"))) comments.sort(key=lambda k: k["id"]) remap = {} @@ -188,32 +208,47 @@ class WordPress(object): def migrate(self): tree = ElementTree.parse(self.xmlfile) + + skip = 0 items = tree.findall("channel/item") progress = Progress(len(items)) for i, thread in enumerate(items): + if thread.find("title").text is None or thread.find(self.ns + "comment") is None: + skip += 1 + continue + progress.update(i, thread.find("title").text) self.insert(thread) - progress.finish("{0} threads, {1} comments".format(len(items), self.count)) + progress.finish("{0} threads, {1} comments".format( + len(items) - skip, self.count)) - @classmethod - def Comment(cls, el): + def Comment(self, el): return { - "text": strip(el.find(WordPress.ns + "comment_content").text), - "author": strip(el.find(WordPress.ns + "comment_author").text), - "email": 
strip(el.find(WordPress.ns + "comment_author_email").text), - "website": strip(el.find(WordPress.ns + "comment_author_url").text), + "text": strip(el.find(self.ns + "comment_content").text), + "author": strip(el.find(self.ns + "comment_author").text), + "email": strip(el.find(self.ns + "comment_author_email").text), + "website": strip(el.find(self.ns + "comment_author_url").text), "remote_addr": anonymize( - strip(el.find(WordPress.ns + "comment_author_IP").text)), + strip(el.find(self.ns + "comment_author_IP").text)), "created": mktime(strptime( - strip(el.find(WordPress.ns + "comment_date_gmt").text), + strip(el.find(self.ns + "comment_date_gmt").text), "%Y-%m-%d %H:%M:%S")), - "mode": 1 if el.find(WordPress.ns + "comment_approved").text == "1" else 2, - "id": int(el.find(WordPress.ns + "comment_id").text), - "parent": int(el.find(WordPress.ns + "comment_parent").text) or None + "mode": 1 if el.find(self.ns + "comment_approved").text == "1" else 2, + "id": int(el.find(self.ns + "comment_id").text), + "parent": int(el.find(self.ns + "comment_parent").text) or None } + @classmethod + def detect(cls, peek): + + m = re.search("http://wordpress.org/export/1\.\d/", peek) + if m: + return m.group(0) + + return None + def dispatch(type, db, dump): if db.execute("SELECT * FROM comments").fetchone(): @@ -223,12 +258,12 @@ def dispatch(type, db, dump): if type is None: with io.open(dump) as fp: - peek = fp.read(2048) + peek = fp.read(io.DEFAULT_BUFFER_SIZE) - if 'xmlns:wp="%s"' % WordPress.ns[1:-1] in peek: + if WordPress.detect(peek): type = "wordpress" - if ' + 1 + + + + ... + http://example.tld/?p=4 + + 11 + + info@posativ.org + + + ::ffff:86.56.63.0 + 2014-04-29 15:21:56 + 2014-04-29 15:21:57 + + 1 + + 0 + 1 + + + + No comments + http://example.tld/?p=6 + \ No newline at end of file