contrib: Add Blogger importer tool (#529)

* contrib: Add Blogger importer tool * doc: fix minor issues in migration documentation
2019-10-13 20:55:17 +03:00 · 2019-10-13 20:55:17 +03:00 · c24ee69a1e
commit c24ee69a1e
parent f4b0376f1a
2 changed files with 125 additions and 2 deletions
--- a/contrib/import_blogger.py
+++ b/contrib/import_blogger.py
@ -0,0 +1,123 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""Comment importer from Blogger
+
+This Python script can convert comments posted to a Blogger-powered blog to a
+JSON file which can then be imported into Isso (by following the procedure
+explained in docs/docs/extras/advanced-migration.rst).
+
+The script can be run like this:
+
+    python import_blogger.py -p 'http://myblog.com/' blogger.xml out.json
+
+where `blogger.xml` is a dump of the blog produced by the Blogger platform, and
+the URL following the `-p` option is a prefix that will be applied to all post
+URLs: the original host will be stripped and the path will be appended to the
+string you specify here (this can be useful in the case that your blog moved to
+a different domain, subdomain, or just into a new directory).
+The `out.json` file is the file which will be generated by this tool, and which
+can then be fed into isso:
+
+    isso -c /path/to/isso.cfg import -t generic out.json
+"""
+
+from __future__ import unicode_literals
+
+import json
+
+import feedparser
+import time
+from urllib.parse import urlparse
+
+
+class Post:
+    """A blog post and the comments gathered for it.
+
+    ``url`` identifies the post, ``title`` stays ``None`` until it is
+    assigned, and ``comments`` holds the comment dicts in insertion order.
+    """
+
+    def __init__(self, url):
+        self.url, self.title, self.comments = url, None, []
+
+    def add_comment(self, comment):
+        """Store *comment*, giving it the next 1-based ``id`` first."""
+        next_id = 1 + len(self.comments)
+        comment['id'] = next_id
+        self.comments.append(comment)
+
+
+def encode_post(post):
+    """Return the JSON-serializable dict describing *post*.
+
+    The post URL is written under the ``id`` key, next to the ``title`` and
+    the list of ``comments``.
+    """
+    return {
+        'id': post.url,
+        'title': post.title,
+        'comments': post.comments,
+    }
+
+
+class ImportBlogger:
+    """Convert a Blogger export into the JSON layout written by ``run``.
+
+    The export is parsed with ``feedparser``; each entry is classified by
+    the term of its first tag as either a post or a comment.
+    """
+
+    TYPE_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
+    TYPE_POST = 'http://schemas.google.com/blogger/2008/kind#post'
+
+    def __init__(self, filename_in, filename_out, prefix):
+        """Parse *filename_in* and remember the output path and prefix.
+
+        :param filename_in: Blogger export handed to ``feedparser.parse``.
+        :param filename_out: path of the JSON file written by ``run``.
+        :param prefix: string prepended to every post path to build its ID.
+        """
+        self.channel = feedparser.parse(filename_in)
+        self.filename_out = filename_out
+        self.prefix = prefix
+
+    def run(self):
+        """Collect posts and comments, then dump them as JSON.
+
+        Only posts that received at least one comment are written out.
+        """
+        self.posts = {}
+        for item in self.channel.entries:
+            # Entries without any category carry no ``tags`` attribute at
+            # all; treat them like entries with an empty tag list.
+            terms = [tag.term for tag in getattr(item, 'tags', None) or []]
+            if not terms:
+                continue
+            if terms[0] == self.TYPE_COMMENT:
+                post = self.ensure_post(item)
+                post.add_comment(self.process_comment(item))
+            elif terms[0] == self.TYPE_POST:
+                self.process_post(item)
+
+        data = [encode_post(p) for p in self.posts.values() if p.comments]
+        with open(self.filename_out, 'w') as fp:
+            json.dump(data, fp, indent=2)
+
+    def process_post(self, item):
+        """Record the title of the post described by *item*.
+
+        The post is created on demand, so a post entry may appear before
+        or after the comments that refer to it.
+        """
+        self.ensure_post(item).title = item.title
+
+    def ensure_post(self, item):
+        """Return the ``Post`` matching *item*, creating it if needed."""
+        pid = self.post_id(item)
+        post = self.posts.get(pid, None)
+        if not post:
+            post = Post(pid)
+            self.posts[pid] = post
+        return post
+
+    def process_comment(self, item):
+        """Build the comment dict for the comment entry *item*.
+
+        The ``id`` key is not set here; ``Post.add_comment`` assigns it.
+        """
+        comment = {}
+        comment['author'] = item.author_detail.name
+        comment['email'] = item.author_detail.email
+        comment['website'] = item.author_detail.get('href', '')
+        t = time.strftime('%Y-%m-%d %H:%M:%S', item.published_parsed)
+        comment['created'] = t
+        comment['text'] = item.content[0].value
+        # Fixed placeholder address, identical for every imported comment.
+        comment['remote_addr'] = '127.0.0.1'
+        return comment
+
+    def post_id(self, item):
+        """Return the configured prefix followed by the path of ``item.link``.
+
+        Scheme, host, query string and fragment of the link are dropped.
+        """
+        u = urlparse(item.link)
+        return self.prefix + u.path
+
+
+if __name__ == '__main__':
+    import argparse
+
+    # Command line: <input> <output> [-p PREFIX]
+    cli = argparse.ArgumentParser(
+        description='Convert comments from blogger.com')
+    cli.add_argument('input', help='input file')
+    cli.add_argument('output', help='output file')
+    cli.add_argument(
+        '-p', dest='prefix', type=str, default='',
+        help='prefix to be added to paths (ID)')
+    options = cli.parse_args()
+
+    # Convert the export and write the JSON file in one go.
+    ImportBlogger(options.input, options.output, options.prefix).run()
--- a/docs/docs/extras/advanced-migration.rst
+++ b/docs/docs/extras/advanced-migration.rst
@ -35,8 +35,8 @@ Example:
    [
        {
            "id": "/blog/article1",
-            "title": "First article!"
-            comments": [
+            "title": "First article!",
+            "comments": [
                {
                    "author": "James",
                    "created": "2018-11-28 17:24:23",