isso/contrib/import_blogger.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

"""Comment importer from Blogger

This Python script converts comments posted to a Blogger-powered blog into a
JSON file which can then be imported into Isso (by following the procedure
explained in docs/docs/extras/advanced-migration.rst).

The script can be run like this:

    python import_blogger.py -p 'http://myblog.com/' blogger.xml out.json

where `blogger.xml` is a dump of the blog produced by the Blogger platform, and
the URL following the `-p` option is a prefix that will be applied to all post
URLs: the original host will be stripped and the path will be appended to the
string you specify here (this can be useful in the case that your blog moved to
a different domain, subdomain, or just into a new directory).
The `out.json` file is the file which will be generated by this tool, and which
can then be fed into isso:

    isso -c /path/to/isso.cfg import -t generic out.json
"""

from __future__ import unicode_literals

import json

import feedparser
import time
from urllib.parse import urlparse


class Post:
    """A blog post together with the comments collected for it."""

    def __init__(self, url):
        """Create a post identified by *url*, with no title and no comments."""
        self.url, self.title, self.comments = url, None, []

    def add_comment(self, comment):
        """Number *comment* and append it to this post's comment list.

        The comment mapping is modified in place: its 'id' key is set to the
        comment's 1-based position among this post's comments.
        """
        position = len(self.comments) + 1
        comment['id'] = position
        self.comments.append(comment)


def encode_post(post):
    """Return the JSON-serialisable dict describing *post*.

    The post's URL becomes the 'id', and its title and comment list are
    passed through unchanged (the comment list is not copied).
    """
    return {
        'id': post.url,
        'title': post.title,
        'comments': post.comments,
    }


class ImportBlogger:
    """Convert a Blogger export feed into a JSON file of posts and comments.

    The input is parsed with ``feedparser``. Each entry is classified as a
    comment or a post by its tag terms; comments are grouped under the post
    ID derived from their link, and only posts that ended up with at least
    one comment are written to the output file.
    """

    # Tag terms that mark an entry as a comment or as a post.
    TYPE_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
    TYPE_POST = 'http://schemas.google.com/blogger/2008/kind#post'

    def __init__(self, filename_in, filename_out, prefix):
        """Parse the export and remember the output settings.

        :param filename_in: source handed to ``feedparser.parse``.
        :param filename_out: path of the JSON file written by :meth:`run`.
        :param prefix: string prepended to each entry's URL path to build
            the post ID.
        """
        self.channel = feedparser.parse(filename_in)
        self.filename_out = filename_out
        self.prefix = prefix
        # Post ID -> Post. Created here so that process_post()/ensure_post()
        # also work when called before run().
        self.posts = {}

    def run(self):
        """Collect posts and comments from the feed and write the JSON file.

        Entries that are neither a comment nor a post (including entries
        without any tags) are ignored.
        """
        self.posts = {}
        for item in self.channel.entries:
            # Entries may carry no tags at all; treat that as "no kind"
            # instead of failing the whole import on a missing attribute.
            terms = [tag.term for tag in (item.get('tags') or [])]
            # Look for the kind term among all tags so the result does not
            # depend on the kind tag being listed first.
            if self.TYPE_COMMENT in terms:
                post = self.ensure_post(item)
                post.add_comment(self.process_comment(item))
            elif self.TYPE_POST in terms:
                self.process_post(item)

        data = [encode_post(p) for p in self.posts.values() if p.comments]
        with open(self.filename_out, 'w', encoding='utf-8') as fp:
            json.dump(data, fp, indent=2)

    def process_post(self, item):
        """Record the title of the post described by *item*.

        The post is created first if no comment has referenced it yet.
        """
        self.ensure_post(item).title = item.title

    def ensure_post(self, item):
        """Return the Post that *item* belongs to, creating it if needed."""
        pid = self.post_id(item)
        post = self.posts.get(pid)
        if post is None:
            post = Post(pid)
            self.posts[pid] = post
        return post

    def process_comment(self, item):
        """Build the comment dict for the comment entry *item*.

        :returns: dict with the keys 'author', 'email', 'website', 'created'
            (formatted as ``YYYY-MM-DD HH:MM:SS``), 'text' and 'remote_addr'.
        """
        comment = {}
        comment['author'] = item.author_detail.name
        comment['email'] = item.author_detail.email
        # The author's URL is optional; fall back to an empty string.
        comment['website'] = item.author_detail.get('href', '')
        t = time.strftime('%Y-%m-%d %H:%M:%S', item.published_parsed)
        comment['created'] = t
        comment['text'] = item.content[0].value
        # The original address is not read from the entry; a fixed
        # placeholder is stored instead.
        comment['remote_addr'] = '127.0.0.1'
        return comment

    def post_id(self, item):
        """Return the post ID for *item*: the prefix plus its link's path.

        Only the path component of ``item.link`` is kept, so the original
        host, query string and fragment are dropped.
        """
        u = urlparse(item.link)
        return self.prefix + u.path


if __name__ == '__main__':
    import argparse

    # Command-line entry point: two positional file names plus an optional
    # prefix that is prepended to every post path.
    cli = argparse.ArgumentParser(
        description='Convert comments from blogger.com')
    cli.add_argument('input', help='input file')
    cli.add_argument('output', help='output file')
    cli.add_argument(
        '-p', dest='prefix', type=str, default='',
        help='prefix to be added to paths (ID)')
    options = cli.parse_args()

    ImportBlogger(options.input, options.output, options.prefix).run()