Write script to download the top 100 Gutenberg books
This commit is contained in:
		
							parent
							
								
									182b20a3ee
								
							
						
					
					
						commit
						c2379ff768
					
				
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@ -1,3 +1,4 @@
 | 
			
		||||
node_modules/
 | 
			
		||||
build/
 | 
			
		||||
tmp/
 | 
			
		||||
tmp/
 | 
			
		||||
books/
 | 
			
		||||
							
								
								
									
										66
									
								
								scripts/downloadTop.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										66
									
								
								scripts/downloadTop.js
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,66 @@
 | 
			
		||||
const fs = require('fs')
 | 
			
		||||
const path = require('path')
 | 
			
		||||
const axios = require('axios')
 | 
			
		||||
 | 
			
		||||
// Output directory for the downloaded books and index.json: the `books`
// folder one level above this script's directory.
const booksDir = path.join(__dirname, '..', 'books')
 | 
			
		||||
 | 
			
		||||
/**
 * Combines several arrays element-wise into an array of tuples.
 *
 * The result is as long as the shortest input; surplus elements of longer
 * inputs are ignored. Calling it without any arrays yields an empty array.
 *
 * @param {...Array} arrs - Arrays to combine.
 * @returns {Array<Array>} Tuples where tuple `i` holds element `i` of every input.
 */
function zip(...arrs) {
  // Math.min() with no arguments is Infinity, which is not a valid array length.
  if (arrs.length === 0) return []

  const resultLength = Math.min(...arrs.map(a => a.length))
  return Array.from({ length: resultLength }, (_, i) => arrs.map(a => a[i]))
}
 | 
			
		||||
 | 
			
		||||
/**
 * Downloads the books listed on Project Gutenberg's "top" scores page.
 *
 * Scans the page's `<li>` entries, collects the id and link text of the first
 * 100 entries that link to `/ebooks/<id>`, writes an `index.json` mapping id
 * to link text into `booksDir`, and saves every book there as `<id>.txt`.
 *
 * A failed download is logged with `console.warn` and does not abort the
 * remaining downloads.
 *
 * @returns {Promise<void>} Resolves once the index has been written and every
 *   download has either been saved or logged as failed.
 */
async function main() {
  const { data: scoresPage } = await axios.get(
    'https://www.gutenberg.org/browse/scores/top'
  )
  // String#match returns null when nothing matches; treat that as "no items".
  const listItems = scoresPage.match(/<li>.*<\/li>/g) || []

  const ids = []
  const titles = []

  for (const listItem of listItems) {
    if (ids.length === 100) break

    const linkMatch = listItem.match(/<a href="\/ebooks\/(.*?)">(.*?)<\/a>/)
    // Entries without an ebook link are skipped instead of crashing the run.
    if (linkMatch === null) continue

    const [, id, title] = linkMatch
    ids.push(id)
    titles.push(title)
  }

  // `recursive: true` succeeds when the directory already exists, while real
  // failures (e.g. missing permissions) still surface.
  fs.mkdirSync(booksDir, { recursive: true })

  const indexFilePath = path.join(booksDir, 'index.json')
  const indexObj = Object.fromEntries(zip(ids, titles))
  fs.writeFileSync(indexFilePath, JSON.stringify(indexObj, undefined, 2))

  // Await the downloads so the returned promise covers all of the work; each
  // failure is handled individually so one bad book does not reject the rest.
  await Promise.all(
    ids.map(id =>
      fetchBook(id)
        .then(text => writeBookToFile(id, text))
        .catch(err => {
          console.warn(err)
        })
    )
  )
}
 | 
			
		||||
 | 
			
		||||
/**
 * Fetches the contents of a Project Gutenberg book.
 *
 * Requests `/files/<id>/<id>-0.txt` first and falls back to
 * `/ebooks/<id>.txt.utf-8` when that request fails.
 *
 * @param {string|number} id - Gutenberg ebook id.
 * @returns {Promise<*>} The response body (`res.data`) of the first request
 *   that succeeds.
 * @throws {Error} When both requests fail; `cause` holds the error of the
 *   last attempted request.
 */
async function fetchBook(id) {
  const candidateUrls = [
    `https://www.gutenberg.org/files/${id}/${id}-0.txt`,
    `https://www.gutenberg.org/ebooks/${id}.txt.utf-8`,
  ]

  let lastError
  for (const url of candidateUrls) {
    try {
      const res = await axios.get(url)
      return res.data
    } catch (err) {
      // Remember why this URL failed, then fall through to the next candidate.
      lastError = err
    }
  }

  throw new Error(`Could not fetch book with id ${id}`, { cause: lastError })
}
 | 
			
		||||
 | 
			
		||||
/**
 * Saves a book's contents as `<id>.txt` inside `booksDir`, creating the
 * directory first when it does not exist yet.
 *
 * @param {string|number} id - Book id, used as the file name.
 * @param {string|Buffer} text - Contents to write.
 * @returns {void}
 * @throws {Error} If the directory cannot be created or the file cannot be
 *   written.
 */
function writeBookToFile(id, text) {
  // `recursive: true` succeeds when the directory already exists, so no
  // blanket try/catch is needed and genuine failures are not hidden.
  fs.mkdirSync(booksDir, { recursive: true })

  const filepath = path.join(booksDir, `${id}.txt`)
  fs.writeFileSync(filepath, text)
}
 | 
			
		||||
 | 
			
		||||
// Report a failed run and exit non-zero instead of leaving the rejection
// of main() unhandled.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user