Write script to fetch and parse PG metadata

This commit is contained in:
Alfred Melch 2020-03-14 13:03:48 +01:00
parent e745c0fa74
commit c479ef609a
4 changed files with 111 additions and 0 deletions

View File

@ -0,0 +1,21 @@
const path = require('path')
const Axios = require('axios')
const { readFile, cacheDir, mkdirAndWriteFile } = require('../paths')
/**
 * Load the Project Gutenberg metadata, preferring the local cache file.
 *
 * Reads `gutenberg-metadata.json` from `cacheDir`. If that file cannot be
 * read or parsed, the metadata is downloaded and written to the cache file
 * before being returned.
 *
 * @returns {Promise<object>} The parsed metadata JSON.
 * @throws If the download or the cache write fails.
 */
async function getMetadata() {
  const cacheFile = path.join(cacheDir, 'gutenberg-metadata.json')
  try {
    const fileContents = await readFile(cacheFile)
    return JSON.parse(fileContents)
  } catch (err) {
    // A missing cache file is the expected first-run case and stays silent.
    // Any other failure (unreadable or corrupt cache) is reported, but we
    // still fall back to downloading a fresh copy.
    if (err.code !== 'ENOENT') {
      console.warn(
        `Ignoring unusable metadata cache ${cacheFile}: ${err.message}`
      )
    }
  }
  console.log('Downloading metadata from GitHub')
  const response = await Axios.get(
    'https://hugovk.github.io/gutenberg-metadata/gutenberg-metadata.json'
  )
  const metadata = response.data
  await mkdirAndWriteFile(cacheFile, JSON.stringify(metadata))
  return metadata
}
module.exports = { getMetadata }

37
scripts/metadata/index.js Normal file
View File

@ -0,0 +1,37 @@
const path = require('path')
const { getMetadata } = require('./getMetadata')
const { parseMetadata } = require('./parseMetadata')
const { mkdirAndWriteFile, dataDir } = require('../paths')
/**
 * Fetch the metadata, parse it, and write the result into `dataDir` as
 * `PG-meta.json` (pretty-printed JSON) and `PG-meta.tsv` (one row per entry).
 *
 * @returns {Promise<void>} Resolves once both files have been written.
 */
async function main() {
  const parsedData = [...parseMetadata(await getMetadata())]
  const tsvData = parsedData.map(entryToRow)
  // Await both writes so that main() only resolves when the files exist and
  // a failed write rejects main() instead of becoming an unhandled rejection.
  await Promise.all([
    mkdirAndWriteFile(
      path.join(dataDir, 'PG-meta.json'),
      JSON.stringify(parsedData, undefined, 2)
    ),
    mkdirAndWriteFile(path.join(dataDir, 'PG-meta.tsv'), formatTsv(tsvData))
  ])
}
/**
 * Flatten one parsed metadata entry into an array of TSV cell values.
 *
 * The first title (or an empty string when there is none) has its line
 * breaks removed; every list-valued field is joined with `|`.
 *
 * @param {object} entry - Entry with `id`, `title`, `author`, `language`,
 *   `subject`, `textUris` and `pictures`.
 * @returns {Array} Cells in the order id, title, author, language, subject,
 *   textUris, pictures.
 */
function entryToRow(entry) {
  const { id, title, author, language, subject, textUris, pictures } = entry
  const firstTitle = title[0] || ''
  const singleLineTitle = firstTitle.replace(/[\r\n]+/gm, '')
  const joinWithPipe = list => list.join('|')
  const listCells = [author, language, subject, textUris, pictures].map(
    joinWithPipe
  )
  return [id, singleLineTitle, ...listCells]
}
/**
 * Serialise a two-dimensional array as tab-separated text.
 *
 * Cells are joined with tabs and rows with newlines; afterwards every
 * double quote in the resulting text is prefixed with a backslash.
 *
 * @param {Array<Array>} array2D - Rows of cell values.
 * @returns {string} The TSV text.
 */
function formatTsv(array2D) {
  const lines = []
  for (const row of array2D) {
    lines.push(row.join('\t'))
  }
  const text = lines.join('\n')
  return text.replace(/"/gm, '\\"')
}
// Run the script. Report any failure and exit with a non-zero status rather
// than leaving the rejected promise unhandled.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})

View File

@ -0,0 +1,22 @@
/**
 * Lazily convert the raw metadata map into one entry object per key.
 *
 * Each yielded entry is a copy of the raw entry without `formaturi`, plus:
 *  - `textUris`: the format URIs ending in `.txt` or `.utf-8`,
 *  - `pictures`: the format URIs ending in `.jpg` or `.png`,
 *  - `id`: the key the entry was stored under.
 * Both URI lists are passed through `stripPGHost`.
 *
 * The input objects are not modified.
 *
 * @param {Object<string, object>} metadata - Raw entries keyed by id; each
 *   entry is expected to carry a `formaturi` array (treated as empty when
 *   absent).
 * @yields {object} The transformed entry.
 */
function* parseMetadata(metadata) {
  for (const [key, entry] of Object.entries(metadata)) {
    // Split off `formaturi` so the yielded copy does not contain it and the
    // caller's object stays untouched.
    const { formaturi = [], ...rest } = entry
    const textUris = formaturi
      .filter(uri => extensionIn(uri, ['.txt', '.utf-8']))
      .map(stripPGHost)
    // '.png' needs its leading dot, like '.jpg', so that only real
    // extensions match.
    const pictures = formaturi
      .filter(uri => extensionIn(uri, ['.jpg', '.png']))
      .map(stripPGHost)
    yield { ...rest, textUris, pictures, id: key }
  }
}
/**
 * Tell whether `path` ends with at least one of the given endings.
 *
 * @param {string} path - The string to test.
 * @param {string[]} endings - Candidate suffixes.
 * @returns {boolean} `true` if any ending matches, otherwise `false`.
 */
function extensionIn(path, endings) {
  for (const suffix of endings) {
    if (path.endsWith(suffix)) {
      return true
    }
  }
  return false
}
/**
 * Remove the first occurrence of `http://www.gutenberg.org` from a URI.
 *
 * @param {string} uri - The URI to shorten.
 * @returns {string} The URI without that host string, or the unchanged
 *   input when the host string does not occur.
 */
function stripPGHost(uri) {
  const host = 'http://www.gutenberg.org'
  const start = uri.indexOf(host)
  if (start === -1) {
    return uri
  }
  return uri.slice(0, start) + uri.slice(start + host.length)
}
module.exports = { parseMetadata }

31
scripts/paths.js Normal file
View File

@ -0,0 +1,31 @@
const fs = require('fs')
const util = require('util')
const path = require('path')
// Project root: the parent of the directory that contains this file.
const projectRoot = path.join(__dirname, '..')
// `cache` and `data` directories directly below the project root.
const [cacheDir, dataDir] = ['cache', 'data'].map(name =>
  path.join(projectRoot, name)
)
// Promise-returning wrappers around the callback-style fs functions.
const [readFile, writeFile, mkdir] = [fs.readFile, fs.writeFile, fs.mkdir].map(
  fn => util.promisify(fn)
)
/**
 * Write `data` to `filepath`, creating the containing directory first.
 *
 * Missing parent directories are created as well, so the target may live in
 * a directory tree that does not exist yet.
 *
 * @param {string} filepath - Path of the file to write.
 * @param {string|Buffer} data - Contents handed to `writeFile`.
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws Any `mkdir` error other than EEXIST, and any `writeFile` error.
 */
async function mkdirAndWriteFile(filepath, data) {
  try {
    // `recursive` also creates missing ancestors of the target directory.
    await mkdir(path.dirname(filepath), { recursive: true })
  } catch (err) {
    // ignore "directory exists" errors
    if (err.code !== 'EEXIST') throw err
  }
  return writeFile(filepath, data)
}
module.exports = {
projectRoot,
cacheDir,
dataDir,
readFile,
writeFile,
mkdirAndWriteFile
}