From c479ef609acd1136827b6967df9bb9ecb7c3ac7a Mon Sep 17 00:00:00 2001
From: Alfred Melch
Date: Sat, 14 Mar 2020 13:03:48 +0100
Subject: [PATCH] Write script to fetch and parse PG metadata

---
 scripts/metadata/getMetadata.js   | 35 +++++++++++++++++
 scripts/metadata/index.js         | 74 ++++++++++++++++++++++++++++++++++++
 scripts/metadata/parseMetadata.js | 41 ++++++++++++++++++++
 scripts/paths.js                  | 39 +++++++++++++++++++
 4 files changed, 189 insertions(+)
 create mode 100644 scripts/metadata/getMetadata.js
 create mode 100644 scripts/metadata/index.js
 create mode 100644 scripts/metadata/parseMetadata.js
 create mode 100644 scripts/paths.js

diff --git a/scripts/metadata/getMetadata.js b/scripts/metadata/getMetadata.js
new file mode 100644
index 0000000..da1c03f
--- /dev/null
+++ b/scripts/metadata/getMetadata.js
@@ -0,0 +1,35 @@
+const path = require('path')
+const Axios = require('axios')
+const { readFile, cacheDir, mkdirAndWriteFile } = require('../paths')
+
+const METADATA_URL =
+  'https://hugovk.github.io/gutenberg-metadata/gutenberg-metadata.json'
+
+/**
+ * Load the Project Gutenberg metadata, preferring the local cache file and
+ * downloading it (then caching it) when the cache cannot be read or parsed.
+ *
+ * @returns {Promise} resolves to the parsed gutenberg-metadata.json contents
+ */
+async function getMetadata() {
+  const cacheFile = path.join(cacheDir, 'gutenberg-metadata.json')
+  try {
+    const fileContents = await readFile(cacheFile, 'utf8')
+    return JSON.parse(fileContents)
+  } catch (err) {
+    // A missing cache is the normal first-run case. Any other failure
+    // (unreadable or corrupt file) is reported, then we download again.
+    if (err.code !== 'ENOENT') {
+      console.warn(`Ignoring unusable cache ${cacheFile}: ${err.message}`)
+    }
+  }
+
+  console.log('Downloading metadata from GitHub')
+  const response = await Axios.get(METADATA_URL)
+  const metadata = response.data
+
+  await mkdirAndWriteFile(cacheFile, JSON.stringify(metadata))
+  return metadata
+}
+
+module.exports = { getMetadata }
diff --git a/scripts/metadata/index.js b/scripts/metadata/index.js
new file mode 100644
index 0000000..33612d7
--- /dev/null
+++ b/scripts/metadata/index.js
@@ -0,0 +1,74 @@
+const path = require('path')
+
+const { getMetadata } = require('./getMetadata')
+const { parseMetadata } = require('./parseMetadata')
+const { mkdirAndWriteFile, dataDir } = require('../paths')
+
+/**
+ * Fetch and parse the PG metadata, then write it to the data directory
+ * as PG-meta.json and PG-meta.tsv.
+ *
+ * @returns {Promise} resolves once both files have been written
+ */
+async function main() {
+  const parsedData = [...parseMetadata(await getMetadata())]
+  const tsvData = parsedData.map(entryToRow)
+
+  // Awaited so that write failures reject main() instead of being lost
+  await mkdirAndWriteFile(
+    path.join(dataDir, 'PG-meta.json'),
+    JSON.stringify(parsedData, undefined, 2)
+  )
+  await mkdirAndWriteFile(
+    path.join(dataDir, 'PG-meta.tsv'),
+    formatTsv(tsvData)
+  )
+}
+
+/**
+ * Flatten one parsed entry into the cells of a TSV row.
+ * Only the first title is kept; list fields are joined with '|'.
+ *
+ * @param {Object} entry - an entry yielded by parseMetadata
+ * @returns {string[]} the cells of the row
+ */
+function entryToRow(entry) {
+  return [
+    entry.id,
+    (entry.title[0] || '').replace(/[\r\n]+/g, ''),
+    entry.author.join('|'),
+    entry.language.join('|'),
+    entry.subject.join('|'),
+    entry.textUris.join('|'),
+    entry.pictures.join('|')
+  ]
+}
+
+/**
+ * Serialise rows of cells as tab-separated text, one row per line.
+ *
+ * @param {string[][]} array2D - rows of cells
+ * @returns {string} the TSV document, with double quotes backslash-escaped
+ */
+function formatTsv(array2D) {
+  return array2D
+    .map(row => row.map(sanitizeCell).join('\t'))
+    .join('\n')
+    .replace(/"/g, '\\"')
+}
+
+/**
+ * Replace tabs and line breaks inside a cell with a space, because a raw
+ * separator character would shift columns or split the row.
+ *
+ * @param {*} cell - the cell value; null and undefined become ''
+ * @returns {string} the cell as single-line, tab-free text
+ */
+function sanitizeCell(cell) {
+  return String(cell == null ? '' : cell).replace(/[\t\r\n]+/g, ' ')
+}
+
+main().catch(err => {
+  console.error(err)
+  process.exitCode = 1
+})
diff --git a/scripts/metadata/parseMetadata.js b/scripts/metadata/parseMetadata.js
new file mode 100644
index 0000000..c939af3
--- /dev/null
+++ b/scripts/metadata/parseMetadata.js
@@ -0,0 +1,41 @@
+const TEXT_EXTENSIONS = ['.txt', '.utf-8']
+const PICTURE_EXTENSIONS = ['.jpg', '.png']
+
+/**
+ * Turn the raw metadata object into a stream of entries. Each yielded entry
+ * is a copy of the raw one in which `formaturi` is replaced by `textUris`
+ * and `pictures`, and the object key is stored as `id`.
+ *
+ * @param {Object} metadata - raw metadata, one property per entry
+ * @yields {Object} the converted entry
+ */
+function* parseMetadata(metadata) {
+  for (const [key, entry] of Object.entries(metadata)) {
+    const uris = entry.formaturi || []
+    // Copy instead of mutating, so the caller's metadata stays untouched
+    const parsed = Object.assign({}, entry, {
+      textUris: selectUris(uris, TEXT_EXTENSIONS),
+      pictures: selectUris(uris, PICTURE_EXTENSIONS),
+      id: key
+    })
+    delete parsed.formaturi
+    yield parsed
+  }
+}
+
+/** Keep the URIs with one of the given endings, without the PG host. */
+function selectUris(uris, endings) {
+  return uris.filter(uri => extensionIn(uri, endings)).map(stripPGHost)
+}
+
+/** Tell whether `path` ends with any of `endings`. */
+function extensionIn(path, endings) {
+  return endings.some(ending => path.endsWith(ending))
+}
+
+/** Remove a leading http(s)://www.gutenberg.org from `uri`. */
+function stripPGHost(uri) {
+  return uri.replace(/^https?:\/\/www\.gutenberg\.org/, '')
+}
+
+module.exports = { parseMetadata }
diff --git a/scripts/paths.js b/scripts/paths.js
new file mode 100644
index 0000000..e9f1b18
--- /dev/null
+++ b/scripts/paths.js
@@ -0,0 +1,39 @@
+const fs = require('fs')
+const util = require('util')
+const path = require('path')
+
+const projectRoot = path.join(__dirname, '..')
+const cacheDir = path.join(projectRoot, 'cache')
+const dataDir = path.join(projectRoot, 'data')
+
+const readFile = util.promisify(fs.readFile)
+const writeFile = util.promisify(fs.writeFile)
+const mkdir = util.promisify(fs.mkdir)
+
+/**
+ * Write `data` to `filepath`, creating the containing directory (and any
+ * missing parents) first.
+ *
+ * @param {string} filepath - file to write
+ * @param {string|Buffer} data - contents to write
+ * @returns {Promise} resolves once the file has been written
+ */
+async function mkdirAndWriteFile(filepath, data) {
+  try {
+    // recursive also creates missing parent directories
+    await mkdir(path.dirname(filepath), { recursive: true })
+  } catch (err) {
+    // ignore "directory exists" errors
+    if (err.code !== 'EEXIST') throw err
+  }
+  return writeFile(filepath, data)
+}
+
+module.exports = {
+  projectRoot,
+  cacheDir,
+  dataDir,
+  readFile,
+  writeFile,
+  mkdirAndWriteFile
+}