isso/contrib/import_blogger.py

124 lines
3.8 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""Comment importer from Blogger
This python script can convert comments posted to a Blogger-powered blog to a
JSON file which can then be imported into Isso (by following the procedure
explained in docs/docs/extras/advanced-migration.rst).
The script can be run like this:
python import_blogger.py -p 'http://myblog.com/' blogger.xml out.json
where `blogger.xml` is a dump of the blog produced by the Blogger platform, and
the URL following the `-p` option is a prefix that will be applied to all post
URLs: the original host will be stripped and the path will be appended to the
string you specify here (this can be useful in the case that your blog moved to
a different domain, subdomain, or just into a new directory).
The `out.json` file is the file which will be generated by this tool, and which
can then be fed into isso:
isso -c /path/to/isso.cfg import -t generic out.json
"""
from __future__ import unicode_literals
import json
import feedparser
import time
from urllib.parse import urlparse
class Post:
    """A blog post together with the comments collected for it.

    Attributes are plain and public: ``url`` (the identifier passed to the
    constructor), ``title`` (``None`` until assigned by the caller) and
    ``comments`` (a list of comment dicts, in insertion order).
    """

    def __init__(self, url):
        """Create a post with the given ``url``, no title and no comments."""
        self.comments = []
        self.title = None
        self.url = url

    def add_comment(self, comment):
        """Append ``comment`` to this post, numbering it sequentially.

        The dict is modified in place: its ``'id'`` key is set to the
        1-based position the comment takes in ``self.comments``.
        """
        next_id = 1 + len(self.comments)
        comment['id'] = next_id
        self.comments.append(comment)
def encode_post(post):
    """Return a plain dict describing ``post``.

    The result maps ``'id'`` to ``post.url``, ``'title'`` to ``post.title``
    and ``'comments'`` to ``post.comments`` (the same list object, not a
    copy), in that key order.
    """
    return {
        'id': post.url,
        'title': post.title,
        'comments': post.comments,
    }
class ImportBlogger:
    """Convert the comments found in a parsed Blogger feed to a JSON file.

    The input is parsed with ``feedparser``. Entries tagged as posts or
    comments are grouped by the identifier computed in :meth:`post_id`, and
    every post that ended up with at least one comment is written to
    ``filename_out`` by :meth:`run`.
    """

    # Values of ``tag.term`` that mark an entry as a comment or as a post.
    TYPE_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
    TYPE_POST = 'http://schemas.google.com/blogger/2008/kind#post'

    def __init__(self, filename_in, filename_out, prefix):
        """Parse ``filename_in`` and remember the output path and prefix.

        :param filename_in: source handed to ``feedparser.parse``.
        :param filename_out: path of the JSON file written by :meth:`run`.
        :param prefix: string prepended to every post path (see
            :meth:`post_id`).
        """
        self.channel = feedparser.parse(filename_in)
        self.filename_out = filename_out
        self.prefix = prefix
        # Created here as well as in run() so that ensure_post() and
        # process_post() are usable before run() has been called.
        self.posts = {}

    def run(self):
        """Collect posts and comments from the feed and write the JSON file.

        Entries that carry no tags, or whose tags name neither a comment nor
        a post, are skipped. Posts without comments are left out of the
        output.
        """
        self.posts = {}
        for item in self.channel.entries:
            # An entry without any category has no ``tags`` attribute at
            # all, so plain ``item.tags`` would raise before it could be
            # skipped.
            tags = getattr(item, 'tags', None) or []
            terms = [tag.term for tag in tags]
            # Test membership rather than position so that the entry kind is
            # recognised even when another tag happens to be listed first.
            if self.TYPE_COMMENT in terms:
                post = self.ensure_post(item)
                post.add_comment(self.process_comment(item))
            elif self.TYPE_POST in terms:
                self.process_post(item)

        data = [encode_post(p) for p in self.posts.values() if p.comments]
        with open(self.filename_out, 'w', encoding='utf-8') as fp:
            json.dump(data, fp, indent=2)

    def process_post(self, item):
        """Record the title of the post described by ``item``.

        The post is created first if no comment has referred to it yet.
        """
        post = self.ensure_post(item)
        post.title = item.title

    def ensure_post(self, item):
        """Return the ``Post`` that ``item`` belongs to, creating it if needed."""
        pid = self.post_id(item)
        post = self.posts.get(pid)
        if post is None:
            post = Post(pid)
            self.posts[pid] = post
        return post

    def process_comment(self, item):
        """Build the comment dict for the feed entry ``item``.

        ``author`` and ``email`` are ``None`` and ``website`` is ``''`` when
        the entry does not provide them, so one incomplete author record
        does not abort the whole conversion.

        :raises AttributeError: if ``item`` lacks ``published_parsed`` or
            ``content``.
        """
        author = getattr(item, 'author_detail', None) or {}
        created = time.strftime('%Y-%m-%d %H:%M:%S', item.published_parsed)
        return {
            'author': author.get('name'),
            'email': author.get('email'),
            'website': author.get('href', ''),
            'created': created,
            'text': item.content[0].value,
            # Every imported comment gets this fixed loopback address.
            'remote_addr': '127.0.0.1',
        }

    def post_id(self, item):
        """Return ``self.prefix`` followed by the path component of ``item.link``.

        Host, query string and fragment of the link are dropped. When the
        prefix ends with ``/`` and the path starts with ``/``, only one of
        the two slashes is kept, so a prefix such as ``'http://myblog.com/'``
        does not produce ``'http://myblog.com//...'``.
        """
        path = urlparse(item.link).path
        if self.prefix.endswith('/') and path.startswith('/'):
            return self.prefix + path[1:]
        return self.prefix + path
if __name__ == '__main__':
    # Script entry point: read the command line, then run the conversion.
    import argparse

    cli = argparse.ArgumentParser(description='Convert comments from blogger.com')
    cli.add_argument('input', help='input file')
    cli.add_argument('output', help='output file')
    cli.add_argument(
        '-p',
        dest='prefix',
        type=str,
        default='',
        help='prefix to be added to paths (ID)',
    )
    options = cli.parse_args()

    ImportBlogger(options.input, options.output, options.prefix).run()