# -*- encoding: utf-8 -*-
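"""
Import comments from a Disqus XML export into an Isso comment database.

``disqus(db, xmlfile)`` parses the export, creates missing threads and
inserts each post, preserving parent/child relations via the Disqus ids.
"""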
from __future__ import division

import sys
import os
import textwrap

from time import mktime, strptime
from collections import defaultdict

# Python 2/3 compatibility: fall back to raw_input on Python 2
try:
    input = raw_input
except NameError:
    pass

# urlparse moved to urllib.parse in Python 3
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse

from xml.etree import ElementTree


# XML namespaces used in the Disqus export format
ns = '{http://disqus.com}'
dsq = '{http://disqus.com/disqus-internals}'

# ids of threads and comments that have been imported successfully
threads = set([])
comments = set([])


def insert(db, thread, posts):
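    """Insert a single Disqus thread and its posts into the Isso database.

    Creates the thread (keyed by the URL path of its Disqus link) if it
    does not exist yet, then adds the posts ordered by creation time so
    that parent comments are always inserted before their replies.
    """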
    path = urlparse(thread.find('%slink' % ns).text).path
    # maps Disqus post ids to the ids assigned by the Isso database
    remap = dict()

    if path not in db.threads:
        db.threads.new(path, thread.find('%stitle' % ns).text.strip())

    for item in sorted(posts, key=lambda k: k['created']):

        dsq_id = item.pop('dsq:id')
        item['parent'] = remap.get(item.pop('dsq:parent', None))
        rv = db.comments.add(path, item)
        remap[dsq_id] = rv["id"]

    comments.update(set(remap.keys()))


def disqus(db, xmlfile):
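    """Import a Disqus XML export file into the Isso database ``db``.

    Prompts before writing into a non-empty database, reports progress on
    stdout and finally lists orphaned posts that could not be imported.
    """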
    if db.execute("SELECT * FROM comments").fetchone():
        if input("Isso DB is not empty! Continue? [y/N]: ") not in ("y", "Y"):
            raise SystemExit("Abort.")

    tree = ElementTree.parse(xmlfile)
    # group each post under its Disqus thread id
    res = defaultdict(list)

    for post in tree.findall('%spost' % ns):

        item = {
            'dsq:id': post.attrib.get(dsq + 'id'),
            'text': post.find('%smessage' % ns).text,
            'author': post.find('%sauthor/%sname' % (ns, ns)).text,
            'email': post.find('%sauthor/%semail' % (ns, ns)).text,
            'created': mktime(strptime(
                post.find('%screatedAt' % ns).text, '%Y-%m-%dT%H:%M:%SZ')),
            'remote_addr': '127.0.0.0',
            # Isso mode: 1 = accepted, 4 = deleted (but referenced)
            'mode': 1 if post.find("%sisDeleted" % ns).text == "false" else 4
        }

        if post.find(ns + 'parent') is not None:
            item['dsq:parent'] = post.find(ns + 'parent').attrib.get(dsq + 'id')

        res[post.find('%sthread' % ns).attrib.get(dsq + 'id')].append(item)

    num = len(tree.findall('%sthread' % ns))
    # terminal width for the progress output, default to 80 columns
    cols = int((os.popen('stty size', 'r').read() or "25 80").split()[1])

    for i, thread in enumerate(tree.findall('%sthread' % ns)):

        # print progress to the terminal every few percent
        if int(round((i+1)/num, 2) * 100) % 13 == 0:

            sys.stdout.write("\r%s" % (" "*cols))
            sys.stdout.write("\r[%i%%] %s" % (((i+1)/num * 100), thread.find('%sid' % ns).text))
            sys.stdout.flush()

        # skip (possibly?) duplicate, but empty thread elements
        if thread.find('%sid' % ns).text is None:
            continue

        id = thread.attrib.get(dsq + 'id')
        if id in res:
            threads.add(id)
            insert(db, thread, res[id])

    # in case a comment has been deleted (and has no further children)
    db.comments._remove_stale()

    sys.stdout.write("\r%s" % (" "*cols))
    sys.stdout.write("\r[100%%] %i threads, %i comments\n" % (len(threads), len(comments)))

    # Disqus post ids that were not imported into the database
    orphans = set(map(lambda e: e.attrib.get(dsq + "id"), tree.findall("%spost" % ns))) - comments
    if orphans:
        print("Found %i orphans:" % len(orphans))
        for post in tree.findall("%spost" % ns):
            if post.attrib.get(dsq + "id") not in orphans:
                continue

            print(" * %s by %s <%s>" % (post.attrib.get(dsq + "id"),
                                        post.find("%sauthor/%sname" % (ns, ns)).text,
                                        post.find("%sauthor/%semail" % (ns, ns)).text))
            print(textwrap.fill(post.find("%smessage" % ns).text,
                                initial_indent=" ", subsequent_indent=" "))
            print("")