Write script to download the top 100 Gutenberg books
This commit is contained in:
parent
182b20a3ee
commit
c2379ff768
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
node_modules/
|
||||
build/
|
||||
tmp/
|
||||
tmp/
|
||||
books/
|
66
scripts/downloadTop.js
Normal file
66
scripts/downloadTop.js
Normal file
@ -0,0 +1,66 @@
|
||||
const fs = require('fs')
|
||||
const path = require('path')
|
||||
const axios = require('axios')
|
||||
|
||||
// Output directory for this script: `books/`, one level above the directory
// containing this file. Book texts and index.json are written here.
const booksDir = path.join(__dirname, '..', 'books')
|
||||
|
||||
/**
 * Combines several arrays element-wise into an array of tuples.
 *
 * The result is as long as the shortest input; surplus elements of longer
 * inputs are dropped. Calling it without any arrays yields an empty array.
 *
 * @param {...Array} arrs - Arrays to combine.
 * @returns {Array<Array>} Tuples where tuple `i` holds element `i` of every input.
 */
function zip(...arrs) {
  // Math.min() of no values is Infinity, which is not a valid array length.
  if (arrs.length === 0) return []

  const resultLength = Math.min(...arrs.map(a => a.length))
  return Array.from({ length: resultLength }, (_, i) => arrs.map(a => a[i]))
}
|
||||
|
||||
/**
 * Scrapes the Project Gutenberg "top" scores page, writes an `index.json`
 * (book id -> title) into `booksDir`, and downloads the text of each listed
 * book into the same directory.
 *
 * Only the first 100 list items that contain an `/ebooks/<id>` link are used.
 * A failed download of a single book is logged as a warning and does not
 * abort the others.
 *
 * @returns {Promise<void>} Resolves once the index is written and every
 *   download has either been saved or reported.
 * @throws {Error} When the scores page cannot be fetched or contains no book
 *   links, or when the index file cannot be written.
 */
async function main() {
  const res = await axios.get('https://www.gutenberg.org/browse/scores/top')
  const scoresPage = res.data

  // String.prototype.match returns null (not []) when nothing matches.
  const listItems = scoresPage.match(/<li>.*<\/li>/g) || []
  const ebookLink = /<a href="\/ebooks\/(.*?)">(.*?)<\/a>/

  const ids = []
  const titles = []

  for (const listItem of listItems) {
    const link = listItem.match(ebookLink)
    // Skip list items that are not book entries instead of crashing on them.
    if (!link) continue

    const [, id, title] = link
    ids.push(id)
    titles.push(title)

    if (ids.length === 100) break
  }

  if (ids.length === 0) {
    throw new Error('No book links found on the Gutenberg scores page')
  }

  // recursive: true is a no-op when the directory exists; real failures surface.
  fs.mkdirSync(booksDir, { recursive: true })

  const indexFilePath = path.join(booksDir, 'index.json')
  const indexObj = Object.fromEntries(zip(ids, titles))
  fs.writeFileSync(indexFilePath, JSON.stringify(indexObj, undefined, 2))

  // All downloads run concurrently; each handles its own failure so that one
  // missing book cannot reject the whole batch.
  await Promise.all(
    ids.map(async id => {
      try {
        const text = await fetchBook(id)
        writeBookToFile(id, text)
      } catch (err) {
        console.warn(err)
      }
    })
  )
}
|
||||
|
||||
/**
 * Downloads the plain-text edition of a Project Gutenberg book.
 *
 * Two URL layouts are tried in order; the body of the first one that
 * downloads successfully is returned.
 *
 * @param {string|number} id - Gutenberg ebook id, interpolated into the URLs.
 * @returns {Promise<*>} The `data` of the first successful response.
 * @throws {Error} When every candidate URL fails. The error's `cause`
 *   property holds the failure of the last attempt.
 */
async function fetchBook(id) {
  const candidateUrls = [
    `https://www.gutenberg.org/files/${id}/${id}-0.txt`,
    `https://www.gutenberg.org/ebooks/${id}.txt.utf-8`
  ]

  let lastError
  for (const url of candidateUrls) {
    try {
      const res = await axios.get(url)
      return res.data
    } catch (err) {
      // A failure here is expected for some books; remember it and try the
      // next layout.
      lastError = err
    }
  }

  const error = new Error(`Could not fetch book with id ${id}`)
  // Assigned manually so the reason survives on runtimes without the
  // `new Error(msg, { cause })` option.
  error.cause = lastError
  throw error
}
|
||||
|
||||
/**
 * Saves a book's text as `<id>.txt` inside `booksDir`, creating the
 * directory first when it does not exist yet.
 *
 * @param {string|number} id - Book id; used as the file's base name.
 * @param {string|Buffer} text - Content passed to `fs.writeFileSync`.
 * @throws {Error} When the directory cannot be created or the file cannot be
 *   written.
 */
function writeBookToFile(id, text) {
  // recursive: true is a no-op for an existing directory, so no try/catch is
  // needed and genuine failures (e.g. permissions) are no longer hidden.
  fs.mkdirSync(booksDir, { recursive: true })

  const filepath = path.join(booksDir, `${id}.txt`)
  fs.writeFileSync(filepath, text)
}
|
||||
|
||||
// Script entry point. A rejection from main() is reported and turned into a
// non-zero exit code rather than being left as an unhandled rejection.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})
|
Loading…
Reference in New Issue
Block a user