Write script to download the top 100 Gutenberg books

This commit is contained in:
Alfred Melch 2020-03-13 11:57:25 +01:00
parent 182b20a3ee
commit c2379ff768
2 changed files with 68 additions and 1 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
node_modules/
build/
tmp/
tmp/
books/

66
scripts/downloadTop.js Normal file
View File

@ -0,0 +1,66 @@
const fs = require('fs')
const path = require('path')
const axios = require('axios')
// Output directory: `books/` inside the parent of this script's directory.
// The downloaded book texts and index.json are written here.
const booksDir = path.join(__dirname, '..', 'books')
/**
 * Combines several arrays element-wise into an array of tuples.
 *
 * The result is as long as the shortest input; surplus elements of longer
 * inputs are dropped.
 *
 * @param {...Array} arrs - Arrays to combine.
 * @returns {Array<Array>} Tuples where tuple `i` holds element `i` of every
 *   input, in argument order. Empty when called without arguments.
 */
function zip(...arrs) {
  // Math.min() of no values is Infinity, which is not a valid array length,
  // so the zero-argument case needs its own answer.
  if (arrs.length === 0) return []
  const resultLength = Math.min(...arrs.map(a => a.length))
  return Array.from({ length: resultLength }, (_, i) => arrs.map(a => a[i]))
}
/**
 * Scrapes the Project Gutenberg "top" scores page, downloads the text of the
 * first 100 ebooks linked from it into `booksDir`, and writes an `index.json`
 * there that maps each book id to the text of its link.
 *
 * A book that cannot be downloaded or written is reported with console.warn
 * and skipped; it is still listed in the index.
 *
 * @returns {Promise<void>} Resolves once every download attempt has finished
 *   and the index file has been written.
 */
async function main() {
  const { data: scoresPage } = await axios.get(
    'https://www.gutenberg.org/browse/scores/top'
  )

  // `match` yields null when nothing matches; treat that as an empty list.
  const listItems = scoresPage.match(/<li>.*<\/li>/g) || []
  const ids = []
  const titles = []
  for (const listItem of listItems) {
    if (ids.length === 100) break
    // Only numeric ids are accepted: the id ends up in a file path below, and
    // list items that do not link to an ebook are skipped instead of crashing.
    const linkMatch = listItem.match(/<a href="\/ebooks\/(\d+)">(.*?)<\/a>/)
    if (!linkMatch) continue
    const [, id, title] = linkMatch
    ids.push(id)
    titles.push(title)
  }

  // `recursive` makes this a no-op when the directory already exists while
  // still surfacing real failures such as missing permissions.
  fs.mkdirSync(booksDir, { recursive: true })

  // Downloads run concurrently; each one handles its own failure so a single
  // bad book never rejects the whole batch.
  await Promise.all(
    ids.map(async id => {
      try {
        writeBookToFile(id, await fetchBook(id))
      } catch (err) {
        console.warn(err)
      }
    })
  )

  const indexFilePath = path.join(booksDir, 'index.json')
  // NOTE: object keys that look like integers are enumerated in ascending
  // numeric order, so the index does not preserve the ranking order.
  const indexObj = Object.fromEntries(zip(ids, titles))
  fs.writeFileSync(indexFilePath, JSON.stringify(indexObj, undefined, 2))
}
/**
 * Downloads the plain-text edition of a Project Gutenberg book.
 *
 * Two locations are tried in order: `/files/<id>/<id>-0.txt`, then
 * `/ebooks/<id>.txt.utf-8`. The first successful response wins.
 *
 * @param {string|number} id - Gutenberg ebook id.
 * @returns {Promise<*>} The response body (`res.data`) of the first request
 *   that succeeds.
 * @throws {Error} When every location fails. The error's `cause` property
 *   holds the failure of the last attempt.
 */
async function fetchBook(id) {
  const candidateUrls = [
    `https://www.gutenberg.org/files/${id}/${id}-0.txt`,
    `https://www.gutenberg.org/ebooks/${id}.txt.utf-8`
  ]

  let lastError
  for (const url of candidateUrls) {
    try {
      const res = await axios.get(url)
      return res.data
    } catch (err) {
      // Remember why this location failed and fall through to the next one.
      lastError = err
    }
  }

  const error = new Error(`Could not fetch book with id ${id}`)
  // Assigned manually so it also works on runtimes without the
  // `new Error(message, { cause })` option.
  error.cause = lastError
  throw error
}
/**
 * Writes a book's text to `<booksDir>/<id>.txt`, creating `booksDir` first
 * if it does not exist yet. An existing file is overwritten.
 *
 * @param {string|number} id - Book id, used as the file name.
 * @param {string} text - Content to write.
 * @throws {Error} Any file-system error raised while creating the directory
 *   or writing the file.
 */
function writeBookToFile(id, text) {
  // `recursive` tolerates an already existing directory without hiding real
  // failures (permissions, a missing parent) behind an empty catch.
  fs.mkdirSync(booksDir, { recursive: true })
  const filepath = path.join(booksDir, `${id}.txt`)
  fs.writeFileSync(filepath, text)
}
// Script entry point. A rejection from main (for example when the scores page
// cannot be fetched) is reported and turned into a non-zero exit code instead
// of being left as an unhandled promise rejection.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})