add initial support to import WordPress comments

This commit is contained in:
Martin Zimmermann 2014-04-30 18:46:44 +02:00
parent 0154113c80
commit 12f8af8434
4 changed files with 274 additions and 12 deletions

View File

@ -207,6 +207,8 @@ def main():
imprt.add_argument("dump", metavar="FILE")
imprt.add_argument("-n", "--dry-run", dest="dryrun", action="store_true",
help="perform a trial run with no changes made")
imprt.add_argument("-t", "--type", dest="type", default=None,
choices=["disqus", "wordpress"], help="export type")
serve = subparser.add_parser("run", help="run server")
@ -223,7 +225,7 @@ def main():
dbpath = conf.get("general", "dbpath")
mydb = db.SQLite3(dbpath, conf)
migrate.dispatch(mydb, args.dump)
migrate.dispatch(args.type, mydb, args.dump)
sys.exit(0)

View File

@ -1,14 +1,18 @@
# -*- encoding: utf-8 -*-
from __future__ import division
from __future__ import division, print_function
import sys
import os
import io
import textwrap
from time import mktime, strptime
from time import mktime, strptime, time
from collections import defaultdict
from isso.utils import anonymize
from isso.compat import string_types
try:
input = raw_input
except NameError:
@ -22,6 +26,39 @@ except ImportError:
from xml.etree import ElementTree
def strip(val):
    """Return ``val`` without surrounding whitespace if it is a string.

    Anything that is not an instance of ``string_types`` (for example
    ``None``, which ElementTree reports for empty elements) is handed back
    untouched.
    """
    return val.strip() if isinstance(val, string_types) else val
class Progress(object):
    """Throttled one-line progress indicator written to ``sys.stdout``.

    Nothing is printed when stdout is not a terminal.  Output is refreshed
    at most once every 0.2 seconds.

    :param end: total number of steps; a falsy value is replaced by 1 so the
                percentage computation never divides by zero.
    """

    def __init__(self, end):
        self.end = end or 1
        self.istty = sys.stdout.isatty()
        # timestamp of the last refresh; 0 forces the next update to print
        self.last = 0

    @staticmethod
    def _columns(default=80):
        """Return the terminal width reported by ``stty size``.

        Falls back to ``default`` when the command yields no usable output
        (e.g. stdin is not a terminal or ``stty`` is unavailable).
        """
        try:
            pipe = os.popen('stty size', 'r')
            try:
                return int(pipe.read().split()[1])
            finally:
                # always release the pipe, even when parsing fails
                pipe.close()
        except (IndexError, ValueError, OSError):
            return default

    def update(self, i, message):
        """Print ``[percent] message`` for step ``i`` of ``self.end``.

        Does nothing if stdout is not a tty, if ``message`` is ``None`` or if
        the previous refresh happened less than 0.2 seconds ago.
        """
        if not self.istty or message is None:
            return

        # Throttle first: querying the terminal spawns a subprocess, so only
        # do it when something is actually going to be written.
        if time() - self.last <= 0.2:
            return

        cols = self._columns()
        # leave room for the "[100%] " prefix
        message = message[:max(cols - 7, 0)]

        # blank the current line before writing the new status
        sys.stdout.write("\r{0}".format(" " * cols))
        sys.stdout.write("\r[{0:.0%}] {1}".format(i / self.end, message))
        sys.stdout.flush()
        self.last = time()

    def finish(self, message):
        """Print the final status line (100%) followed by a newline."""
        # reset the throttle so the final message is never suppressed
        self.last = 0
        self.update(self.end, message + "\n")
class Disqus(object):
ns = '{http://disqus.com}'
@ -116,9 +153,94 @@ class Disqus(object):
print("")
def dispatch(db, dump):
class WordPress(object):
    """Import threads and comments from a WordPress WXR (XML) export.

    :param db: database object exposing ``threads.new(path, title)`` and
               ``comments.add(path, comment)``; the latter must return a
               mapping containing the new comment's ``"id"``.
    :param xmlfile: path (or file object) handed to ``ElementTree.parse``.

    ``count`` holds the number of comments actually inserted so far.
    """

    # ElementTree-style namespace prefix of the ``wp:`` elements
    ns = "{http://wordpress.org/export/1.0/}"

    def __init__(self, db, xmlfile):
        self.db = db
        self.xmlfile = xmlfile
        self.count = 0

    def insert(self, thread):
        """Create a thread for the ``<item>`` element and add its comments.

        Comments are inserted parents-first so that each ``parent`` can be
        rewritten from the WordPress comment id to the id assigned by the
        database.  A comment whose parent is not part of this thread is
        inserted as a top-level comment (``parent`` is ``None``).
        """
        path = urlparse(thread.find("link").text).path
        # NOTE(review): an empty <title/> has ``text`` None and raises here,
        # confirm whether such items can occur in real exports.
        self.db.threads.new(path, thread.find("title").text.strip())

        comments = [self.Comment(el)
                    for el in thread.findall(self.ns + "comment")]
        comments.sort(key=lambda k: k["id"])

        remap = {}  # WordPress comment id -> id assigned by the database
        ids = set(c["id"] for c in comments)  # ids not yet inserted

        while comments:
            for i, item in enumerate(comments):
                # parent is still pending: try the next candidate first
                if item["parent"] in ids:
                    continue

                item["parent"] = remap.get(item["parent"], None)
                rv = self.db.comments.add(path, item)
                remap[item["id"]] = rv["id"]

                ids.remove(item["id"])
                comments.pop(i)
                # count only what was really written to the database
                self.count += 1
                break
            else:
                # Every remaining comment waits for another pending one
                # (cyclic parents).  Should never happen, but... it's
                # WordPress.  Drop the remainder.
                return

    def migrate(self):
        """Parse the export and import every ``channel/item`` element."""
        tree = ElementTree.parse(self.xmlfile)
        items = tree.findall("channel/item")

        progress = Progress(len(items))
        for i, thread in enumerate(items):
            progress.update(i, thread.find("title").text)
            self.insert(thread)

        progress.finish("{0} threads, {1} comments".format(
            len(items), self.count))

    @staticmethod
    def _timestamp(value):
        """Convert a ``YYYY-MM-DD HH:MM:SS`` UTC string to a Unix timestamp.

        ``calendar.timegm`` is used instead of ``time.mktime`` because the
        input is UTC; ``mktime`` would interpret it as local time and shift
        the result by the host's UTC offset.
        """
        from calendar import timegm
        return float(timegm(strptime(value, "%Y-%m-%d %H:%M:%S")))

    @classmethod
    def Comment(cls, el):
        """Build a comment dict from a ``<wp:comment>`` element.

        ``mode`` is 1 when ``comment_approved`` is ``"1"`` and 2 otherwise;
        ``parent`` is ``None`` for a ``comment_parent`` of 0.
        """
        def text(tag):
            # raw text of the namespaced child element (may be None)
            return el.find(cls.ns + tag).text

        return {
            "text": strip(text("comment_content")),
            "author": strip(text("comment_author")),
            "email": strip(text("comment_author_email")),
            "website": strip(text("comment_author_url")),
            "remote_addr": anonymize(strip(text("comment_author_IP"))),
            "created": cls._timestamp(strip(text("comment_date_gmt"))),
            "mode": 1 if text("comment_approved") == "1" else 2,
            "id": int(text("comment_id")),
            "parent": int(text("comment_parent")) or None
        }
def dispatch(type, db, dump):
    """Import ``dump`` into ``db`` using the importer selected by ``type``.

    :param type: ``"disqus"``, ``"wordpress"`` or ``None``; with ``None`` the
                 format is guessed from the first 2048 characters of the file.
    :param db: database object; ``db.execute`` is used to check whether the
               ``comments`` table already has rows.
    :param dump: path to the export file.
    :raises SystemExit: if the user declines to import into a non-empty
                        database, or if the format is unknown.
    """
    if db.execute("SELECT * FROM comments").fetchone():
        if input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"):
            raise SystemExit("Abort.")

    if type is None:
        # Only sniffing for ASCII markers here, so decode as UTF-8 and
        # tolerate undecodable bytes instead of relying on the locale.
        with io.open(dump, encoding="utf-8", errors="replace") as fp:
            peek = fp.read(2048)

        if 'xmlns:wp="%s"' % WordPress.ns[1:-1] in peek:
            type = "wordpress"

        if '<disqus xmlns=' in peek:
            type = "disqus"

    # Run exactly one importer; no import happens before the type is known.
    if type == "wordpress":
        WordPress(db, dump).migrate()
    elif type == "disqus":
        Disqus(db, dump).migrate()
    else:
        raise SystemExit("Unknown format, abort.")

View File

@ -11,7 +11,7 @@ from os.path import join, dirname
from isso.core import Config
from isso.db import SQLite3
from isso.migrate import Disqus
from isso.migrate import Disqus, WordPress
class TestMigration(unittest.TestCase):
@ -33,13 +33,32 @@ class TestMigration(unittest.TestCase):
self.assertEqual(a["email"], "foo@bar.com")
b = db.comments.get(2)
self.assertEqual(b["parent"] ,a["id"])
self.assertEqual(b["parent"], a["id"])
def test_wordpress(self):
    """Import the bundled ``wordpress.xml`` fixture and check the result.

    The fixture holds one thread with six comments; the assertions below
    expect them to be stored under the ids 1..6 in ascending order of their
    WordPress ids, with every reply attached to the second comment.
    """
    xml = join(dirname(__file__), "wordpress.xml")
    # keep a reference so the temporary file outlives the database usage
    xxx = tempfile.NamedTemporaryFile()

    db = SQLite3(xxx.name, Config.load(None))
    WordPress(db, xml).migrate()

    self.assertEqual(db.threads["/2014/test/"]["title"], "Hello, World!")
    self.assertEqual(db.threads["/2014/test/"]["id"], 1)

    self.assertEqual(len(db.execute("SELECT id FROM comments").fetchall()), 6)

    first = db.comments.get(1)
    self.assertEqual(first["author"], "Ohai")
    self.assertEqual(first["text"], "Erster!1")

    second = db.comments.get(2)
    self.assertEqual(second["author"], "Tester")
    self.assertEqual(second["text"], "Zweiter.")

    # direct and nested replies are all expected to reference comment 2
    for i in (3, 4, 5):
        self.assertEqual(db.comments.get(i)["parent"], second["id"])

    last = db.comments.get(6)
    self.assertEqual(last["author"], "Letzter :/")
    self.assertEqual(last["parent"], None)

119
isso/tests/wordpress.xml Normal file
View File

@ -0,0 +1,119 @@
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.0/">
<!-- This WXR dump is incomplete! It only contains the elements needed for an
import; a few are not used yet, but may become useful later.
The <item> node is derived from a sort-of real-world WordPress blog,
but modified to test various things.
-->
<channel>
<item>
<title>Hello, World!</title>
<link>http://example.tld/2014/test/</link>
<pubDate>Tue, 14 Jan 2014 17:31:03 +0000</pubDate>
<dc:creator><![CDATA[Tester]]></dc:creator>
<wp:post_id>18</wp:post_id>
<wp:post_date>2014-01-14 17:31:03</wp:post_date>
<wp:post_date_gmt>2014-01-14 17:31:03</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:post_name>test</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_type>post</wp:post_type>
<wp:comment>
<wp:comment_id>2</wp:comment_id>
<wp:comment_author><![CDATA[Ohai]]></wp:comment_author>
<wp:comment_author_email>test@example.org
</wp:comment_author_email>
<wp:comment_author_url>http://example.tld/</wp:comment_author_url>
<wp:comment_author_IP>::ffff:82.119.20.0</wp:comment_author_IP>
<wp:comment_date>2014-01-14 17:32:12</wp:comment_date>
<wp:comment_date_gmt>2014-01-14 17:32:12</wp:comment_date_gmt>
<wp:comment_content>
<![CDATA[Erster!1]]></wp:comment_content>
<wp:comment_approved>1</wp:comment_approved>
<!-- what's that? -->
<wp:comment_type></wp:comment_type>
<wp:comment_parent>0</wp:comment_parent>
<wp:comment_user_id>0</wp:comment_user_id>
</wp:comment>
<wp:comment>
<wp:comment_id>6</wp:comment_id>
<wp:comment_author><![CDATA[Tester]]></wp:comment_author>
<wp:comment_author_email>info@posativ.org
</wp:comment_author_email>
<wp:comment_author_url></wp:comment_author_url>
<wp:comment_author_IP>::ffff:86.56.63.0</wp:comment_author_IP>
<wp:comment_date>2014-04-29 15:21:27</wp:comment_date>
<wp:comment_date_gmt>2014-04-29 15:21:27</wp:comment_date_gmt>
<wp:comment_content><![CDATA[Zweiter.]]></wp:comment_content>
<wp:comment_approved>1</wp:comment_approved>
<wp:comment_type></wp:comment_type>
<wp:comment_parent>0</wp:comment_parent>
<wp:comment_user_id>1</wp:comment_user_id>
</wp:comment>
<wp:comment>
<wp:comment_id>7</wp:comment_id>
<wp:comment_author><![CDATA[Tester]]></wp:comment_author>
<wp:comment_author_email>info@posativ.org
</wp:comment_author_email>
<wp:comment_author_url></wp:comment_author_url>
<wp:comment_author_IP>::ffff:86.56.63.0</wp:comment_author_IP>
<wp:comment_date_gmt>2014-04-29 15:21:35</wp:comment_date_gmt>
<wp:comment_content><![CDATA[Drölfter!]]></wp:comment_content>
<wp:comment_approved>1</wp:comment_approved>
<wp:comment_parent>6</wp:comment_parent>
</wp:comment>
<wp:comment>
<wp:comment_id>8</wp:comment_id>
<wp:comment_author><![CDATA[Tester]]></wp:comment_author>
<wp:comment_author_email>info@posativ.org
</wp:comment_author_email>
<wp:comment_author_url></wp:comment_author_url>
<wp:comment_author_IP>::ffff:86.56.63.0</wp:comment_author_IP>
<wp:comment_date>2014-04-29 15:21:45</wp:comment_date>
<wp:comment_date_gmt>2014-04-29 15:21:45</wp:comment_date_gmt>
<wp:comment_content>
<![CDATA[Yet another reply.]]></wp:comment_content>
<wp:comment_approved>1</wp:comment_approved>
<wp:comment_type></wp:comment_type>
<wp:comment_parent>7</wp:comment_parent>
<wp:comment_user_id>1</wp:comment_user_id>
</wp:comment>
<wp:comment>
<wp:comment_id>9</wp:comment_id>
<wp:comment_author><![CDATA[Tester]]></wp:comment_author>
<wp:comment_author_email>info@posativ.org
</wp:comment_author_email>
<wp:comment_author_url></wp:comment_author_url>
<wp:comment_author_IP>::ffff:86.56.63.0</wp:comment_author_IP>
<wp:comment_date>2014-04-29 15:21:52</wp:comment_date>
<wp:comment_date_gmt>2014-04-29 15:21:52</wp:comment_date_gmt>
<wp:comment_content><![CDATA[...]]></wp:comment_content>
<wp:comment_approved>1</wp:comment_approved>
<wp:comment_type></wp:comment_type>
<wp:comment_parent>7</wp:comment_parent>
<wp:comment_user_id>1</wp:comment_user_id>
</wp:comment>
<wp:comment>
<wp:comment_id>10</wp:comment_id>
<wp:comment_author><![CDATA[Letzter :/]]></wp:comment_author>
<wp:comment_author_email>info@posativ.org
</wp:comment_author_email>
<wp:comment_author_url></wp:comment_author_url>
<wp:comment_author_IP>::ffff:86.56.63.0</wp:comment_author_IP>
<wp:comment_date>2014-04-29 15:21:56</wp:comment_date>
<wp:comment_date_gmt>2014-04-29 15:21:56</wp:comment_date_gmt>
<wp:comment_content><![CDATA[...]]></wp:comment_content>
<wp:comment_approved>1</wp:comment_approved>
<wp:comment_type></wp:comment_type>
<wp:comment_parent>0</wp:comment_parent>
<wp:comment_user_id>1</wp:comment_user_id>
</wp:comment>
</item>
</channel>
</rss>