124 lines
3.8 KiB
Python
124 lines
3.8 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- encoding: utf-8 -*-
|
||
|
|
||
|
"""Comment importer from Blogger
|
||
|
|
||
|
This Python script can convert comments posted to a Blogger-powered blog to a
JSON file which can then be imported into Isso (by following the procedure
explained in docs/docs/extras/advanced-migration.rst).
|
||
|
|
||
|
The script can be run like this:
|
||
|
|
||
|
python import_blogger.py -p 'http://myblog.com/' blogger.xml out.json
|
||
|
|
||
|
where `blogger.xml` is a dump of the blog produced by the Blogger platform, and
|
||
|
the URL following the `-p` option is a prefix that will be applied to all post
|
||
|
URLs: the original host will be stripped and the path will be appended to the
|
||
|
string you specify here (this can be useful in the case that your blog moved to
|
||
|
a different domain, subdomain, or just into a new directory).
|
||
|
The `out.json` file is the file which will be generated by this tool, and which
|
||
|
can then be fed into isso:
|
||
|
|
||
|
isso -c /path/to/isso.cfg import -t generic out.json
|
||
|
"""
|
||
|
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
import json
|
||
|
|
||
|
import feedparser
|
||
|
import time
|
||
|
from urllib.parse import urlparse
|
||
|
|
||
|
|
||
|
class Post:
    """A blog post, identified by its URL, plus the comments gathered for it."""

    def __init__(self, url):
        # The title starts out as None; this class never assigns it itself.
        self.url, self.title, self.comments = url, None, []

    def add_comment(self, comment):
        """Give *comment* the next 1-based ``id`` and append it to the post.

        The mapping passed in is modified in place (its ``id`` key is set)
        and the very same object is stored in ``self.comments``.
        """
        next_id = 1 + len(self.comments)
        comment['id'] = next_id
        self.comments.append(comment)
|
||
|
|
||
|
|
||
|
def encode_post(post):
    """Return the plain-dict form of *post*.

    The post's URL is stored under the ``id`` key, followed by ``title`` and
    ``comments``. The comment list is passed through as-is, not copied.
    """
    return {
        'id': post.url,
        'title': post.title,
        'comments': post.comments,
    }
|
||
|
|
||
|
|
||
|
class ImportBlogger:
    """Convert the comments of a Blogger dump into a JSON file.

    The input is parsed with ``feedparser``. Every entry marked as a comment
    is attached to the post that shares its (prefixed) URL path, and the
    posts that received at least one comment are written to the output file.
    """

    # Tag terms that mark an entry as a comment or as a post.
    TYPE_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
    TYPE_POST = 'http://schemas.google.com/blogger/2008/kind#post'

    def __init__(self, filename_in, filename_out, prefix):
        """Parse the dump and remember where and how to write the result.

        :param filename_in: Blogger dump, handed to ``feedparser.parse``.
        :param filename_out: path of the JSON file written by :meth:`run`.
        :param prefix: string prepended to the URL path of every post.
        """
        self.channel = feedparser.parse(filename_in)
        self.filename_out = filename_out
        self.prefix = prefix
        # Created here too, so the helper methods are usable before run().
        self.posts = {}

    def run(self):
        """Collect posts and comments from the feed and write the JSON file.

        Posts without any comment are left out of the output.
        """
        self.posts = {}
        for item in self.channel.entries:
            # An entry may lack the "tags" key altogether; plain attribute
            # access would raise for it and abort the whole import. Such
            # entries are skipped exactly like entries with an empty list.
            terms = [tag.get('term') for tag in item.get('tags') or []]
            # The kind marker is searched among all tags rather than only the
            # first one, so a label listed before it cannot hide the entry.
            if self.TYPE_COMMENT in terms:
                post = self.ensure_post(item)
                post.add_comment(self.process_comment(item))
            elif self.TYPE_POST in terms:
                self.process_post(item)

        data = [encode_post(p) for p in self.posts.values() if p.comments]
        with open(self.filename_out, 'w', encoding='utf-8') as fp:
            json.dump(data, fp, indent=2)

    def process_post(self, item):
        """Record the title of the post described by *item*.

        The post is created first if no comment has referenced it yet.
        """
        self.ensure_post(item).title = item.title

    def ensure_post(self, item):
        """Return the ``Post`` that *item* belongs to, creating it on demand."""
        pid = self.post_id(item)
        post = self.posts.get(pid)
        if post is None:
            post = self.posts[pid] = Post(pid)
        return post

    def process_comment(self, item):
        """Build the comment dict for the comment entry *item*.

        Missing author information (name, e-mail, website) is replaced by an
        empty string instead of aborting the import. The remote address is
        always the placeholder ``127.0.0.1``.
        """
        author = item.get('author_detail') or {}
        return {
            'author': author.get('name', ''),
            'email': author.get('email', ''),
            'website': author.get('href', ''),
            'created': time.strftime('%Y-%m-%d %H:%M:%S',
                                     item.published_parsed),
            'text': item.content[0].value,
            'remote_addr': '127.0.0.1',
        }

    def post_id(self, item):
        """Return the post identifier for *item*: the prefix plus URL path.

        Only the path component of ``item.link`` is kept; host, query string
        and fragment are dropped.
        """
        return self.prefix + urlparse(item.link).path
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    import argparse

    # Command line: <input> <output> [-p PREFIX]
    cli = argparse.ArgumentParser(
        description='Convert comments from blogger.com',
    )
    cli.add_argument('input', help='input file')
    cli.add_argument('output', help='output file')
    cli.add_argument(
        '-p',
        dest='prefix',
        type=str,
        default='',
        help='prefix to be added to paths (ID)',
    )
    options = cli.parse_args()

    # Parse the dump, then write the converted comments to the output file.
    ImportBlogger(options.input, options.output, options.prefix).run()
|