From 04b6d70b013cfaed923d878c80b3ce8f48157bd2 Mon Sep 17 00:00:00 2001 From: Facundo Batista Date: Wed, 18 Apr 2018 15:54:54 -0300 Subject: [PATCH 1/3] Added a generic way to migrate from a json file --- CONTRIBUTORS.txt | 3 ++ isso/migrate.py | 92 +++++++++++++++++++++++++++++++++--- isso/tests/generic.json | 1 + isso/tests/test_migration.py | 39 ++++++++++++++- 4 files changed, 127 insertions(+), 8 deletions(-) create mode 100644 isso/tests/generic.json diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 0b5e649..020f0c6 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -58,5 +58,8 @@ In chronological order: * Ivan Pantic * Added vote levels +* Facundo Batista + * Added a generic way to migrate from a json file + * [Your name or handle] <[email or website]> * [Brief summary of your changes] diff --git a/isso/migrate.py b/isso/migrate.py index f6297b7..3628460 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -2,13 +2,14 @@ from __future__ import division, print_function, unicode_literals -import sys -import os -import io -import re -import logging -import textwrap import functools +import io +import json +import logging +import os +import re +import sys +import textwrap from time import mktime, strptime, time from collections import defaultdict @@ -55,7 +56,7 @@ class Progress(object): if time() - self.last > 0.2: sys.stdout.write("\r{0}".format(" " * cols)) - sys.stdout.write("\r[{0:.0%}] {1}".format(i/self.end, message)) + sys.stdout.write("\r[{0:.0%}] {1}".format(i / self.end, message)) sys.stdout.flush() self.last = time() @@ -250,6 +251,80 @@ class WordPress(object): return re.compile("http://wordpress.org/export/(1\.\d)/").search(peek) +class Generic(object): + """A generic importer. 
+ + The source format is a json with the following format: + + A list of threads, each item being a dict with the following data: + + - id: a text representing the unique thread id + - title: the title of the thread + - comments: the list of comments + + Each item in that list of comments is a dict with the following data: + + - id: an integer with the unique id of the comment inside the thread (it can be repeated + among different threads); this will be used to order the comment inside the thread + - author: the author name + - email: the author email + - website: the author's website + - created: a timestamp, in the format "%Y-%m-%d %H:%M:%S" + """ + + def __init__(self, db, json_file): + self.db = db + self.json_file = json_file + self.count = 0 + + def insert(self, thread): + """Process a thread and insert its comments in the DB.""" + thread_id = thread['id'] + title = thread['title'] + self.db.threads.new(thread_id, title) + + comments = list(map(self._build_comment, thread['comments'])) + comments.sort(key=lambda comment: comment['id']) + self.count += len(comments) + for comment in comments: + self.db.comments.add(thread_id, comment) + + def migrate(self): + """Process the input file and fill the DB.""" + with open(self.json_file, 'rt', encoding='utf8') as fh: + threads = json.load(fh) + progress = Progress(len(threads)) + + for i, thread in enumerate(threads): + progress.update(i, str(i)) + self.insert(thread) + + progress.finish("{0} threads, {1} comments".format(len(threads), self.count)) + + def _build_comment(self, raw_comment): + return { + "text": raw_comment['text'], + "author": raw_comment['author'], + "email": raw_comment['email'], + "website": raw_comment['website'], + "created": mktime(strptime(raw_comment['created'], "%Y-%m-%d %H:%M:%S")), + "mode": 1, + "id": int(raw_comment['id']), + "parent": None, + "remote_addr": raw_comment["remote_addr"], + } + + @classmethod + def detect(cls, peek): + """Return if peek looks like the beginning of a JSON 
file. + + Note that we can not check the JSON properly as we only receive here + the original file truncated. + """ + print("===== peek", repr(peek)) + return peek.startswith("[{") + + def autodetect(peek): if 'xmlns="http://disqus.com' in peek: @@ -259,6 +334,9 @@ def autodetect(peek): if m: return WordPress + if Generic.detect(peek): + return Generic + return None diff --git a/isso/tests/generic.json b/isso/tests/generic.json new file mode 100644 index 0000000..8ba90ea --- /dev/null +++ b/isso/tests/generic.json @@ -0,0 +1 @@ +[{"comments": [{"email": "", "remote_addr": "0.0.0.0", "website": "http://www.tigerspice.com", "created": "2005-02-24 04:03:37", "author": "texas holdem", "id": 0, "text": "Great men can't be ruled. by free online poker"}], "id": "/posts/0001/", "title": "Test+post"}, {"comments": [{"email": "105421439@87750645.com", "remote_addr": "0.0.0.0", "website": "", "created": "2005-05-08 06:50:26", "author": "Richard Crinshaw", "id": 0, "text": "Ja-make-a me crazzy mon :)\n"}], "id": "/posts/0007/", "title": "Nat+%26+Miguel"}] \ No newline at end of file diff --git a/isso/tests/test_migration.py b/isso/tests/test_migration.py index 151f27b..03a33e6 100644 --- a/isso/tests/test_migration.py +++ b/isso/tests/test_migration.py @@ -9,7 +9,7 @@ from os.path import join, dirname from isso import config from isso.db import SQLite3 -from isso.migrate import Disqus, WordPress, autodetect +from isso.migrate import Disqus, WordPress, autodetect, Generic conf = config.new({ "general": { @@ -79,6 +79,40 @@ class TestMigration(unittest.TestCase): self.assertEqual(last["author"], "Letzter :/") self.assertEqual(last["parent"], None) + def test_generic(self): + filepath = join(dirname(__file__), "generic.json") + tempf = tempfile.NamedTemporaryFile() + + db = SQLite3(tempf.name, conf) + Generic(db, filepath).migrate() + + self.assertEqual(db.threads["/posts/0001/"]["title"], "Test+post") + self.assertEqual(db.threads["/posts/0001/"]["id"], 1) + + 
self.assertEqual(db.threads["/posts/0007/"]["title"], "Nat+%26+Miguel") + self.assertEqual(db.threads["/posts/0007/"]["id"], 2) + + self.assertEqual( + len(db.execute("SELECT id FROM threads").fetchall()), 2) + self.assertEqual( + len(db.execute("SELECT id FROM comments").fetchall()), 2) + + comment = db.comments.get(1) + self.assertEqual(comment["author"], "texas holdem") + self.assertEqual(comment["text"], "Great men can't be ruled. by free online poker") + self.assertEqual(comment["email"], "") + self.assertEqual(comment["website"], "http://www.tigerspice.com") + self.assertEqual(comment["created"], 1109228617.0) + self.assertEqual(comment["remote_addr"], "0.0.0.0") + + comment = db.comments.get(2) + self.assertEqual(comment["author"], "Richard Crinshaw") + self.assertEqual(comment["text"], "Ja-make-a me crazzy mon :)\n") + self.assertEqual(comment["email"], "105421439@87750645.com") + self.assertEqual(comment["website"], "") + self.assertEqual(comment["created"], 1115545826.0) + self.assertEqual(comment["remote_addr"], "0.0.0.0") + def test_detection(self): wp = """\ @@ -98,3 +132,6 @@ class TestMigration(unittest.TestCase): Date: Wed, 18 Apr 2018 17:34:23 -0300 Subject: [PATCH 2/3] Removed debug line --- isso/migrate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/isso/migrate.py b/isso/migrate.py index 3628460..e23ec8c 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -321,7 +321,6 @@ class Generic(object): Note that we can not check the JSON properly as we only receive here the original file truncated. 
""" - print("===== peek", repr(peek)) return peek.startswith("[{") From 14195d37114ca4871365ffe13fa2fcecc3ac6a45 Mon Sep 17 00:00:00 2001 From: Facundo Batista Date: Wed, 18 Apr 2018 18:49:52 -0300 Subject: [PATCH 3/3] Used proper open and not check timing --- isso/migrate.py | 2 +- isso/tests/test_migration.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/isso/migrate.py b/isso/migrate.py index e23ec8c..b63af58 100644 --- a/isso/migrate.py +++ b/isso/migrate.py @@ -291,7 +291,7 @@ class Generic(object): def migrate(self): """Process the input file and fill the DB.""" - with open(self.json_file, 'rt', encoding='utf8') as fh: + with io.open(self.json_file, 'rt', encoding='utf8') as fh: threads = json.load(fh) progress = Progress(len(threads)) diff --git a/isso/tests/test_migration.py b/isso/tests/test_migration.py index 03a33e6..98b97a0 100644 --- a/isso/tests/test_migration.py +++ b/isso/tests/test_migration.py @@ -102,7 +102,6 @@ class TestMigration(unittest.TestCase): self.assertEqual(comment["text"], "Great men can't be ruled. by free online poker") self.assertEqual(comment["email"], "") self.assertEqual(comment["website"], "http://www.tigerspice.com") - self.assertEqual(comment["created"], 1109228617.0) self.assertEqual(comment["remote_addr"], "0.0.0.0") comment = db.comments.get(2) @@ -110,7 +109,6 @@ class TestMigration(unittest.TestCase): self.assertEqual(comment["text"], "Ja-make-a me crazzy mon :)\n") self.assertEqual(comment["email"], "105421439@87750645.com") self.assertEqual(comment["website"], "") - self.assertEqual(comment["created"], 1115545826.0) self.assertEqual(comment["remote_addr"], "0.0.0.0") def test_detection(self):