wynaut import

This commit is contained in:
Martin Zimmermann 2013-12-12 12:24:37 +01:00
parent 85e637d017
commit 770dbf48af
6 changed files with 204 additions and 133 deletions

View File

@ -62,7 +62,7 @@ from werkzeug.contrib.profiler import ProfilerMiddleware
local = Local()
local_manager = LocalManager([local])
from isso import db, migrate, wsgi, ext, views
from isso import db, wsgi, ext, views
from isso.core import ThreadedMixin, ProcessMixin, uWSGIMixin, Config
from isso.utils import parse, http, JSONRequest, origin
from isso.views import comments
@ -194,24 +194,11 @@ def main():
parser.add_argument("-c", dest="conf", default="/etc/isso.conf",
metavar="/etc/isso.conf", help="set configuration file")
imprt = subparser.add_parser('import', help="import Disqus XML export")
imprt.add_argument("dump", metavar="FILE")
imprt.add_argument("-n", "--dry-run", dest="dryrun", action="store_true",
help="perform a trial run with no changes made")
serve = subparser.add_parser("run", help="run server")
args = parser.parse_args()
conf = Config.load(args.conf)
if args.command == "import":
xxx = tempfile.NamedTemporaryFile()
dbpath = conf.get("general", "dbpath") if not args.dryrun else xxx.name
conf.set("guard", "enabled", "off")
migrate.disqus(db.SQLite3(dbpath, conf), args.dump)
sys.exit(0)
if conf.get("server", "listen").startswith("http://"):
host, port, _ = parse.host(conf.get("server", "listen"))
try:

View File

@ -1,114 +0,0 @@
# -*- encoding: utf-8 -*-
from __future__ import division
import sys
import os
import textwrap
from time import mktime, strptime
from collections import defaultdict
try:
input = raw_input
except NameError:
pass
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
from xml.etree import ElementTree
ns = '{http://disqus.com}'
dsq = '{http://disqus.com/disqus-internals}'
threads = set([])
comments = set([])
def insert(db, thread, posts):
path = urlparse(thread.find('%slink' % ns).text).path
remap = dict()
if path not in db.threads:
db.threads.new(path, thread.find('%stitle' % ns).text.strip())
for item in sorted(posts, key=lambda k: k['created']):
dsq_id = item.pop('dsq:id')
item['parent'] = remap.get(item.pop('dsq:parent', None))
rv = db.comments.add(path, item)
remap[dsq_id] = rv["id"]
comments.update(set(remap.keys()))
def disqus(db, xmlfile):
if db.execute("SELECT * FROM comments").fetchone():
if input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"):
raise SystemExit("Abort.")
tree = ElementTree.parse(xmlfile)
res = defaultdict(list)
for post in tree.findall('%spost' % ns):
item = {
'dsq:id': post.attrib.get(dsq + 'id'),
'text': post.find('%smessage' % ns).text,
'author': post.find('%sauthor/%sname' % (ns, ns)).text,
'email': post.find('%sauthor/%semail' % (ns, ns)).text,
'created': mktime(strptime(
post.find('%screatedAt' % ns).text, '%Y-%m-%dT%H:%M:%SZ')),
'remote_addr': '127.0.0.0',
'mode': 1 if post.find("%sisDeleted" % ns).text == "false" else 4
}
if post.find(ns + 'parent') is not None:
item['dsq:parent'] = post.find(ns + 'parent').attrib.get(dsq + 'id')
res[post.find('%sthread' % ns).attrib.get(dsq + 'id')].append(item)
num = len(tree.findall('%sthread' % ns))
cols = int((os.popen('stty size', 'r').read() or "25 80").split()[1])
for i, thread in enumerate(tree.findall('%sthread' % ns)):
if int(round((i+1)/num, 2) * 100) % 13 == 0:
sys.stdout.write("\r%s" % (" "*cols))
sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text))
sys.stdout.flush()
# skip (possibly?) duplicate, but empty thread elements
if thread.find('%sid' % ns).text is None:
continue
id = thread.attrib.get(dsq + 'id')
if id in res:
threads.add(id)
insert(db, thread, res[id])
# in case a comment has been deleted (and no further childs)
db.comments._remove_stale()
sys.stdout.write("\r%s" % (" "*cols))
sys.stdout.write("\r[100%%] %i threads, %i comments\n" % (len(threads), len(comments)))
orphans = set(map(lambda e: e.attrib.get(dsq + "id"), tree.findall("%spost" % ns))) - comments
if orphans:
print("Found %i orphans:" % len(orphans))
for post in tree.findall("%spost" % ns):
if post.attrib.get(dsq + "id") not in orphans:
continue
print(" * %s by %s <%s>" % (post.attrib.get(dsq + "id"),
post.find("%sauthor/%sname" % (ns, ns)).text,
post.find("%sauthor/%semail" % (ns, ns)).text))
print(textwrap.fill(post.find("%smessage" % ns).text,
initial_indent=" ", subsequent_indent=" "))
print("")

View File

@ -38,7 +38,9 @@ setup(
],
install_requires=requires,
entry_points={
'console_scripts':
['isso = isso:main'],
'console_scripts': [
'isso = isso:main',
'wynaut = wynaut:main'
],
},
)

View File

@ -6,7 +6,7 @@ from os.path import join, dirname
from isso.core import Config
from isso.db import SQLite3
from isso.migrate import disqus
from wynaut.imprt import Disqus
def test_disqus():
@ -15,12 +15,13 @@ def test_disqus():
xxx = tempfile.NamedTemporaryFile()
db = SQLite3(xxx.name, Config.load(None))
disqus(db, xml)
dsq = Disqus(xml)
dsq.migrate(db)
assert db.threads["/"]["title"] == "Hello, World!"
assert db.threads["/"]["id"] == 1
a = db.comments.get(1)
assert a["author"] == "peter"

52
wynaut/__init__.py Normal file
View File

@ -0,0 +1,52 @@
# -*- encoding: utf-8 -*-
import pkg_resources
dist = pkg_resources.get_distribution("isso")
import os
import tempfile
from argparse import ArgumentParser
from isso.db import SQLite3
from isso.core import Config
from wynaut.imprt import Disqus
try:
input = raw_input
except NameError:
pass
def main():
parser = ArgumentParser(description="manage Isso")
subparser = parser.add_subparsers(help="commands", dest="command")
parser.add_argument('--version', action='version', version='%(prog)s' + dist.version)
parser.add_argument('-c', dest="conf", default=os.environ.get("ISSO_SETTINGS"),
metavar="/etc/isso.conf", help="set configuration file")
imprt = subparser.add_parser('import', help="import Disqus XML export")
imprt.add_argument("dump", metavar="FILE")
imprt.add_argument("-f", "--force", dest="force", action="store_true",
help="force actions")
imprt.add_argument("-n", "--dry-run", dest="dryrun", action="store_true",
help="perform a trial run with no changes made")
args = parser.parse_args()
conf = Config.load(args.conf)
if args.command == "import":
xxx = tempfile.NamedTemporaryFile()
dbpath = conf.get("general", "dbpath") if not args.dryrun else xxx.name
dsq = Disqus(args.dump)
db = SQLite3(dbpath, conf)
if db.execute("SELECT * FROM comments").fetchone():
if not args.force and input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"):
raise SystemExit("Abort.")
dsq.migrate(db)

143
wynaut/imprt/__init__.py Normal file
View File

@ -0,0 +1,143 @@
# -*- encoding: utf-8 -*-
from __future__ import division
import sys
import os
import time
import textwrap
from time import mktime, strptime
from xml.etree import ElementTree
from collections import defaultdict
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
from werkzeug.utils import cached_property
class Import(object):
def __init__(self):
self.last = 0
try:
self.cols = int(os.popen('stty size', 'r').read().split()[1])
except IndexError:
self.cols = 25
def progress(self, current, max, msg):
if time.time() - self.last < 0.1:
return
sys.stdout.write("\r{0}".format(" "*self.cols))
sys.stdout.write("\r[{0:.3}%] {1:.{2}}".format(
current/max*100, msg.strip(), self.cols - 9))
sys.stdout.flush()
self.last = time.time()
def done(self, msg):
sys.stdout.write("\r{0}".format(" "*self.cols))
sys.stdout.write("\r[100%] {0}\n".format(msg.strip()))
sys.stdout.flush()
class Disqus(Import):
ns = '{http://disqus.com}'
internals = '{http://disqus.com/disqus-internals}'
def __init__(self, xmlfile):
super(Disqus, self).__init__()
self.tree = ElementTree.parse(xmlfile)
self._threads = set([])
self._posts = set([])
@cached_property
def threads(self):
return [thr for thr in self.tree.findall("{0}thread".format(Disqus.ns))
if thr.find("{0}id".format(Disqus.ns)).text is not None]
@cached_property
def posts(self):
return self.tree.findall("{0}post".format(Disqus.ns))
def migrate(self, db):
# map thread id to list of posts
rv = defaultdict(list)
for post in self.posts:
item = {
'dsq:id': post.attrib.get(Disqus.internals + 'id'),
'text': post.find('%smessage' % Disqus.ns).text,
'author': post.find('{0}author/{0}name'.format(Disqus.ns)).text,
'email': post.find('{0}author/{0}email'.format(Disqus.ns)).text,
'created': mktime(strptime(
post.find('%screatedAt' % Disqus.ns).text, '%Y-%m-%dT%H:%M:%SZ')),
'remote_addr': '127.0.0.0',
'mode': 1 if post.find("%sisDeleted" % Disqus.ns).text == "false" else 4
}
if post.find(Disqus.ns + 'parent') is not None:
item['dsq:parent'] = post.find(Disqus.ns + 'parent').attrib.get(Disqus.internals + 'id')
rv[post.find('%sthread' % Disqus.ns).attrib.get(Disqus.internals + 'id')].append(item)
for i, thread in enumerate(self.threads):
self.progress(i, len(self.threads), thread.find('{0}id'.format(Disqus.ns)).text)
# skip (possibly?) duplicate, but empty thread elements
if thread.find('%sid' % Disqus.ns).text is None:
continue
id = thread.attrib.get(Disqus.internals + 'id')
if id in rv:
self._threads.add(id)
self._insert(db, thread, rv[id])
# in case a comment has been deleted (and no further childs)
db.comments._remove_stale()
self.done("{0} threads, {1} comments".format(len(self._threads), len(self._posts)))
orphans = set(map(lambda e: e.attrib.get(Disqus.internals + "id"), self.posts)) - self._posts
if orphans:
print("Found %i orphans:" % len(orphans))
for post in self.posts:
if post.attrib.get(Disqus.internals + "id") not in orphans:
continue
print(" * %s by %s <%s>" % (post.attrib.get(Disqus.internals + "id"),
post.find("{0}author/{0}name".format(Disqus.ns)).text,
post.find("{0}author/{0}email".format(Disqus.ns)).text))
print(textwrap.fill(post.find("%smessage" % Disqus.ns).text,
initial_indent=" ", subsequent_indent=" "))
def _insert(self, db, thread, posts):
path = urlparse(thread.find('%slink' % Disqus.ns).text).path
remap = dict()
if path not in db.threads:
db.threads.new(path, thread.find('%stitle' % Disqus.ns).text.strip())
for item in sorted(posts, key=lambda k: k['created']):
dsq_id = item.pop('dsq:id')
item['parent'] = remap.get(item.pop('dsq:parent', None))
rv = db.comments.add(path, item)
remap[dsq_id] = rv["id"]
self._posts.update(set(remap.keys()))