Write script to fetch and parse PG metadata
This commit is contained in:
parent
e745c0fa74
commit
c479ef609a
21
scripts/metadata/getMetadata.js
Normal file
21
scripts/metadata/getMetadata.js
Normal file
@ -0,0 +1,21 @@
|
||||
const path = require('path')
|
||||
const Axios = require('axios')
|
||||
const { readFile, cacheDir, mkdirAndWriteFile } = require('../paths')
|
||||
|
||||
/**
 * Fetch the Project Gutenberg metadata JSON, using a local file cache.
 *
 * Returns the parsed metadata object. On a cache miss (or unreadable /
 * corrupt cache file) the metadata is downloaded from GitHub Pages and
 * written back to the cache before being returned.
 */
async function getMetadata() {
  const cacheFile = path.join(cacheDir, 'gutenberg-metadata.json')
  try {
    // Serve from the local cache when a previous run already downloaded it.
    const fileContents = await readFile(cacheFile)
    return JSON.parse(fileContents)
  } catch (err) {
    // Best-effort cache: a missing, unreadable, or corrupt cache file just
    // means we fall through to a fresh download below.
  }

  console.log('Downloading metadata from GitHub')
  // Await the response directly instead of mixing `await` with `.then()`.
  const response = await Axios.get(
    'https://hugovk.github.io/gutenberg-metadata/gutenberg-metadata.json'
  )
  const metadata = response.data

  await mkdirAndWriteFile(cacheFile, JSON.stringify(metadata))
  return metadata
}
|
||||
|
||||
// Expose the cached metadata fetcher to the other metadata scripts.
module.exports = { getMetadata }
|
37
scripts/metadata/index.js
Normal file
37
scripts/metadata/index.js
Normal file
@ -0,0 +1,37 @@
|
||||
const path = require('path')
|
||||
|
||||
const { getMetadata } = require('./getMetadata')
|
||||
const { parseMetadata } = require('./parseMetadata')
|
||||
const { mkdirAndWriteFile, dataDir } = require('../paths')
|
||||
|
||||
/**
 * Entry point: download (or read cached) PG metadata, flatten it, and write
 * both a JSON and a TSV rendering into the data directory.
 */
async function main() {
  const parsedData = [...parseMetadata(await getMetadata())]
  const tsvData = parsedData.map(entryToRow)

  // Bug fix: these writes were fire-and-forget, so main() could "finish"
  // before the files were written and any write error was silently dropped.
  // The two outputs are independent, so write them in parallel and await.
  await Promise.all([
    mkdirAndWriteFile(
      path.join(dataDir, 'PG-meta.json'),
      JSON.stringify(parsedData, undefined, 2)
    ),
    mkdirAndWriteFile(path.join(dataDir, 'PG-meta.tsv'), formatTsv(tsvData))
  ])
}
|
||||
|
||||
/**
 * Flatten one parsed metadata entry into a single TSV row (array of string
 * cells). Multi-valued fields are packed into one cell with "|" separators;
 * line breaks are stripped from the title so it cannot break the TSV rows.
 */
function entryToRow(entry) {
  const pipe = (values) => values.join('|')
  const title = (entry.title[0] || '').replace(/[\r\n]+/gm, '')
  return [
    entry.id,
    title,
    pipe(entry.author),
    pipe(entry.language),
    pipe(entry.subject),
    pipe(entry.textUris),
    pipe(entry.pictures)
  ]
}
|
||||
|
||||
/**
 * Render a 2-D array of cells as TSV text: cells joined with tabs, rows
 * with newlines, and every double quote backslash-escaped afterwards.
 */
function formatTsv(array2D) {
  const rows = array2D.map((cells) => cells.join('\t'))
  const body = rows.join('\n')
  return body.replace(/"/gm, '\\"')
}
|
||||
|
||||
// Bug fix: main() returned a floating promise, so a failure became an
// unhandled rejection (silent or warning-only, depending on Node version).
// Report the error and exit non-zero instead.
main().catch((err) => {
  console.error(err)
  process.exitCode = 1
})
|
22
scripts/metadata/parseMetadata.js
Normal file
22
scripts/metadata/parseMetadata.js
Normal file
@ -0,0 +1,22 @@
|
||||
/**
 * Lazily reshape the raw Gutenberg metadata map ({ id -> entry }) into a
 * stream of entries. For each entry: splits `formaturi` into `textUris`
 * (plain-text / UTF-8 files) and `pictures` (jpg/png), strips the
 * gutenberg.org host prefix from those URIs, removes `formaturi`, and
 * records the map key as `entry.id`. NOTE: mutates the entries in place.
 *
 * @param {Object} metadata - raw map of Gutenberg id -> entry object
 * @yields {Object} the mutated entry
 */
function* parseMetadata(metadata) {
  for (const [key, entry] of Object.entries(metadata)) {
    entry.textUris = entry.formaturi
      .filter(uri => extensionIn(uri, ['.txt', '.utf-8']))
      .map(stripPGHost)
    entry.pictures = entry.formaturi
      // Bug fix: the picture filter used 'png' without the dot, so any URI
      // merely *ending* in "png" (e.g. ".apng") matched. Require ".png".
      .filter(uri => extensionIn(uri, ['.jpg', '.png']))
      .map(stripPGHost)
    delete entry.formaturi
    entry.id = key
    yield entry
  }
}

// True when `path` ends with any of the given suffixes.
function extensionIn(path, endings) {
  return endings.some(ending => path.endsWith(ending))
}

// Make gutenberg.org URIs host-relative. String-form replace() only
// substitutes the first occurrence, which here is the scheme+host prefix.
function stripPGHost(uri) {
  return uri.replace('http://www.gutenberg.org', '')
}
|
||||
|
||||
// Expose the metadata-flattening generator to the other metadata scripts.
module.exports = { parseMetadata }
|
31
scripts/paths.js
Normal file
31
scripts/paths.js
Normal file
@ -0,0 +1,31 @@
|
||||
const fs = require('fs')
|
||||
const util = require('util')
|
||||
const path = require('path')
|
||||
|
||||
// Directory layout, resolved relative to the project root (one level above
// this scripts/ directory).
const projectRoot = path.join(__dirname, '..')
const cacheDir = path.join(projectRoot, 'cache') // downloaded raw metadata
const dataDir = path.join(projectRoot, 'data') // generated output files

// Promisified fs primitives so callers can use async/await.
const readFile = util.promisify(fs.readFile)
const writeFile = util.promisify(fs.writeFile)
const mkdir = util.promisify(fs.mkdir)
|
||||
|
||||
/**
 * Write `data` to `filepath`, first ensuring the file's parent directory
 * exists (a single level; the grandparent is assumed to exist already).
 *
 * @param {string} filepath - destination file path
 * @param {string|Buffer} data - contents to write
 * @returns {Promise<void>} resolves when the file has been written
 */
async function mkdirAndWriteFile(filepath, data) {
  const parentDir = path.dirname(filepath)
  try {
    await mkdir(parentDir)
  } catch (err) {
    // An already-existing directory is fine; anything else is fatal.
    if (err.code !== 'EEXIST') throw err
  }
  return writeFile(filepath, data)
}
|
||||
|
||||
// Shared path constants and promisified fs helpers for all scripts.
module.exports = {
  projectRoot,
  cacheDir,
  dataDir,
  readFile,
  writeFile,
  mkdirAndWriteFile
}
|
Loading…
Reference in New Issue
Block a user