// scripts/downloadTop.js
//
// Downloads the plain-text editions of the books listed on Project Gutenberg's
// "top" scores page into ../books (one `<id>.txt` file per book) and writes an
// id -> title map to ../books/index.json.
//
// The change that introduced this script also added `books/` to .gitignore
// (next to node_modules/, build/ and tmp/), so the downloads stay untracked.

const fs = require('fs')
const path = require('path')
const axios = require('axios')

const booksDir = path.join(__dirname, '..', 'books')

const TOP_SCORES_URL = 'https://www.gutenberg.org/browse/scores/top'
const TOP_COUNT = 100

// NOTE(review): the HTML inside these two patterns was stripped from the copy
// of this file under review (the list-item pattern had been rendered as a
// bullet and the link pattern had lost its opening tag, leaving one capture
// group for two destructured values). They are reconstructed here on the
// assumption that each entry looks like
//   <li><a href="/ebooks/1342">Title by Author (123)</a></li>
// TODO confirm against the live page markup.
const LIST_ITEM_PATTERN = /<li>.*<\/li>/g
const BOOK_LINK_PATTERN = /<a href="\/ebooks\/(\d+)">(.*?)<\/a>/

/**
 * Combines several arrays element-wise, truncating to the shortest one.
 *
 * @param {...Array} arrs - Arrays to combine.
 * @returns {Array<Array>} One tuple per index; `[]` when called with no arrays.
 */
function zip(...arrs) {
  // Math.min() with no arguments is Infinity, which is not a valid array length.
  if (arrs.length === 0) return []
  const resultLength = Math.min(...arrs.map(a => a.length))
  return Array.from({ length: resultLength }, (_, i) => arrs.map(a => a[i]))
}

/**
 * Creates the output directory if it does not exist yet.
 * `recursive: true` makes an already-existing directory a no-op while still
 * surfacing real failures such as permission errors.
 */
function ensureBooksDir() {
  fs.mkdirSync(booksDir, { recursive: true })
}

/**
 * Scrapes the top-scores page, writes index.json, then downloads every listed
 * book. A failed download is logged as a warning and does not stop the others.
 *
 * @returns {Promise<void>} Resolves once every download has settled.
 * @throws {Error} When the scores page cannot be fetched or contains no book links.
 */
async function main() {
  const res = await axios.get(TOP_SCORES_URL)
  const scoresPage = res.data
  // String#match returns null (not an empty array) when nothing matches.
  const listItems = scoresPage.match(LIST_ITEM_PATTERN) || []

  const ids = []
  const titles = []

  for (const listItem of listItems) {
    if (ids.length === TOP_COUNT) break
    const found = listItem.match(BOOK_LINK_PATTERN)
    // Skip list items that are not e-book links instead of crashing on them.
    if (!found) continue
    const [, id, title] = found
    ids.push(id)
    titles.push(title)
  }

  if (ids.length === 0) {
    throw new Error(`No book links found on ${TOP_SCORES_URL}`)
  }

  ensureBooksDir()

  // The index lists every scraped book, including any whose download fails below.
  const indexFilePath = path.join(booksDir, 'index.json')
  const indexObj = Object.fromEntries(zip(ids, titles))
  fs.writeFileSync(indexFilePath, JSON.stringify(indexObj, undefined, 2))

  // All downloads run concurrently; each task handles its own error, so
  // Promise.all never rejects and main() only resolves when all are done.
  await Promise.all(
    ids.map(async id => {
      try {
        const text = await fetchBook(id)
        writeBookToFile(id, text)
      } catch (err) {
        console.warn(err)
      }
    })
  )
}

/**
 * Fetches the plain-text body of a book, trying each known URL layout in turn.
 *
 * @param {string|number} id - Project Gutenberg e-book id.
 * @returns {Promise<*>} The response body of the first URL that succeeds.
 * @throws {Error} When every URL fails; the last request error is attached as `cause`.
 */
async function fetchBook(id) {
  const candidateUrls = [
    `https://www.gutenberg.org/files/${id}/${id}-0.txt`,
    `https://www.gutenberg.org/ebooks/${id}.txt.utf-8`
  ]

  let lastError
  for (const url of candidateUrls) {
    try {
      const res = await axios.get(url)
      return res.data
    } catch (err) {
      // Remember why this URL failed and fall through to the next one.
      lastError = err
    }
  }
  throw new Error(`Could not fetch book with id ${id}`, { cause: lastError })
}

/**
 * Writes a book's text to `<booksDir>/<id>.txt`, creating the directory if needed.
 *
 * @param {string|number} id - Project Gutenberg e-book id, used as the file name.
 * @param {string} text - Book contents.
 */
function writeBookToFile(id, text) {
  ensureBooksDir()
  const filepath = path.join(booksDir, `${id}.txt`)
  fs.writeFileSync(filepath, text)
}

// Report a fatal error and signal failure instead of leaving an unhandled rejection.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})