From 81279be189a60c50d0b881f0a848ba0739fc3cbb Mon Sep 17 00:00:00 2001
From: Christine Dodrill
Date: Mon, 27 Jul 2015 18:44:26 -0700
Subject: [PATCH] scrape the posts into a sqlite database

---
Review notes: the two added files are revised relative to the first draft of
this patch (single transaction around the import, rejected rows reported,
`rm -f` under `set -e`, rollback journal removed together with the database).
The blob ids on the `index` lines still name the first draft. Leading
whitespace of the db/db.sql context lines is reconstructed; use
`git apply --ignore-whitespace` if that hunk does not match.

 .gitignore          |  1 +
 db/db.sql           |  2 +-
 db/rebuilddb.sh     | 10 ++++++++++
 scripts/scrape.moon | 44 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100755 db/rebuilddb.sh
 create mode 100644 scripts/scrape.moon

diff --git a/.gitignore b/.gitignore
index fd88510..6067f39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.db
+*.db-journal
 *.a
 *.o
 *.so
diff --git a/db/db.sql b/db/db.sql
index 826e769..79356e7 100644
--- a/db/db.sql
+++ b/db/db.sql
@@ -4,7 +4,7 @@ CREATE TABLE IF NOT EXISTS Posts (
   , body TEXT NOT NULL
   , mdown TEXT NOT NULL
   , author TEXT NOT NULL
-  , page TEXT NOT NULL
+  , page INTEGER
 );
 
 CREATE TABLE IF NOT EXISTS Users (
diff --git a/db/rebuilddb.sh b/db/rebuilddb.sh
new file mode 100755
index 0000000..95452eb
--- /dev/null
+++ b/db/rebuilddb.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Recreate posts.db from the schema in db.sql (run from the db/ directory).
+
+set -e
+set -x
+
+# -f keeps `set -e` from aborting when the database does not exist yet; the
+# stale rollback journal goes too so it can never be replayed into the new file.
+rm -f ./posts.db ./posts.db-journal
+sqlite3 ./posts.db < ./db.sql
diff --git a/scripts/scrape.moon b/scripts/scrape.moon
new file mode 100644
index 0000000..33ee919
--- /dev/null
+++ b/scripts/scrape.moon
@@ -0,0 +1,44 @@
+-- Loads the scraped JSON pages under ../raw/pages into the Posts table of
+-- ../db/posts.db. Paths are relative, so run this from the scripts/ directory.
+
+json = require "dkjson"
+sqlite3 = require "lsqlite3"
+
+LAST_PAGE = 8002
+
+db = assert sqlite3.open "../db/posts.db"
+
+-- Runs a statement that returns no rows; raises with SQLite's message on failure.
+exec = (sql) ->
+  if db\exec(sql) != sqlite3.OK
+    error "#{sql}: #{db\errmsg!}"
+
+insert_stmt = assert db\prepare "INSERT INTO Posts VALUES (NULL, ?, ?, '', ?, ?)"
+
+-- One transaction for the whole import: otherwise every INSERT is committed on
+-- its own, and a run that dies halfway leaves a partially filled table behind.
+exec "BEGIN"
+
+for page = 1, LAST_PAGE
+  print "Scraping page #{page}..."
+
+  fin = assert io.open "../raw/pages/#{page}.json", "r"
+  data = fin\read "*a"
+  fin\close!
+
+  -- dkjson returns nil, a position and a message when the input is malformed.
+  posts, _, err = json.decode data
+  assert posts, "page #{page}: #{err}"
+
+  for _, post in pairs posts.topics
+    insert_stmt\bind_values post.id, post.body, post.author, page
+    -- Report rows SQLite rejects instead of dropping them silently.
+    if insert_stmt\step() != sqlite3.DONE
+      io.stderr\write "page #{page}: post #{post.id} not inserted: #{db\errmsg!}\n"
+    insert_stmt\reset!
+
+exec "COMMIT"
+insert_stmt\finalize!
+db\close!
+
+print "done"