Add gutenberg data
This commit is contained in:
		
							parent
							
								
									34db0ea358
								
							
						
					
					
						commit
						80979e0d8c
					
				
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@ -1,2 +1,3 @@
 | 
			
		||||
node_modules/
 | 
			
		||||
build/
 | 
			
		||||
build/
 | 
			
		||||
tmp/
 | 
			
		||||
							
								
								
									
										1
									
								
								data/gutenberg.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								data/gutenberg.json
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										1
									
								
								data/stats_language.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								data/stats_language.json
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
			
		||||
{"en": 48484, "la": 121, "es": 628, "de": 1751, "fr": 2959, "it": 752, "ja": 22, "zh": 441, "sv": 193, "da": 68, "cy": 13, "bg": 6, "pt": 552, "nl": 800, "el": 220, "he": 6, "ru": 9, "hu": 183, "ko": 1, "pl": 31, "fi": 1994, "eo": 118, "enm": 6, "sa": 1, "ang": 4, "ale": 1, "yi": 1, "lt": 1, "nai": 3, "sr": 4, "no": 19, "ca": 33, "ro": 2, "nah": 3, "kha": 1, "cs": 10, "tl": 60, "is": 7, "myn": 2, "ilo": 3, "ia": 1, "ga": 2, "fur": 7, "af": 4, "kld": 1, "oc": 1, "nap": 1, "fy": 2, "ceb": 3, "gl": 2, "mi": 2, "nav": 3, "br": 1, "arp": 2, "iu": 1, "bgs": 1, "csb": 1, "gla": 2, "rmr": 1, "sl": 1, "te": 6, "oji": 1, "grc": 3, "ar": 1, "et": 1, "fa": 1, "brx": 2}
 | 
			
		||||
							
								
								
									
										1
									
								
								data/stats_rights.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								data/stats_rights.json
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
			
		||||
{"Public domain in the USA.": 58354, "Copyrighted. Read the copyright notice inside this book for details.": 955, "None": 69}
 | 
			
		||||
							
								
								
									
										1
									
								
								data/stats_subject.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								data/stats_subject.json
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										13
									
								
								scripts/generate-data.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										13
									
								
								scripts/generate-data.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,13 @@
 | 
			
		||||
#!/bin/bash
#
# Generate the JSON files under data/ from the Project Gutenberg metadata dump.
#
# The metadata file is downloaded once into tmp/ and reused on later runs;
# it is then handed to scripts/process-metadata.py.
#
# All paths are relative, so run this from the repository root:
#   ./scripts/generate-data.sh

# Abort on the first failing command, on unset variables, and on pipe errors,
# so a failed download never reaches the processing step.
set -euo pipefail

METADATA_URL='https://github.com/hugovk/gutenberg-metadata/raw/master/gutenberg-metadata.json'
METADATA_FILE='tmp/gutenberg-metadata.json'

# tmp/ holds the download; data/ is where process-metadata.py writes its output.
mkdir -p tmp data

# Download the metadata file if it does not exist yet.
# wget -O creates the target file even when the transfer fails, so download
# to a temporary name and rename only on success; otherwise an interrupted
# download would be mistaken for a complete file on the next run.
if [ ! -f "$METADATA_FILE" ]; then
  wget -O "$METADATA_FILE.part" "$METADATA_URL"
  mv "$METADATA_FILE.part" "$METADATA_FILE"
fi

python3 scripts/process-metadata.py "$METADATA_FILE"
 | 
			
		||||
							
								
								
									
										26
									
								
								scripts/process-metadata.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								scripts/process-metadata.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,26 @@
 | 
			
		||||
"""Strip the Gutenberg metadata dump and write per-field statistics.

Usage:
    python3 scripts/process-metadata.py <metadata.json>

The input is a JSON object whose values are the individual book entries.
Two kinds of files are written into ``data/``:

* ``gutenberg.json`` -- a list of all entries with the ``formaturi`` field
  removed.
* ``stats_<field>.json`` -- for each of ``language``, ``rights`` and
  ``subject``, a mapping from each value seen in that field to the number of
  times it occurs.
"""
import json
import os
import sys
from collections import defaultdict

# Metadata fields for which a value -> count statistics file is generated.
STATS_FOR = ('language', 'rights', 'subject')

# Field removed from every entry of the stripped data set.
DROPPED_KEY = 'formaturi'

# Directory (relative to the working directory) that receives the output.
OUTPUT_DIR = 'data'


def build_dataset(data, stats_for=STATS_FOR):
    """Return ``(new_data, stats)`` computed from the raw metadata mapping.

    Args:
        data: Mapping whose values are entry dicts. Each statistics field of
            an entry is iterated, so it is presumably a list of strings --
            TODO confirm against the upstream metadata format.
        stats_for: Names of the entry fields to count values for.

    Returns:
        A tuple ``(new_data, stats)`` where ``new_data`` is a list of entry
        copies without the ``formaturi`` key, and ``stats`` maps each name in
        ``stats_for`` to a dict of ``value -> occurrence count``.
    """
    stats = {key: defaultdict(int) for key in stats_for}
    new_data = []
    for entry in data.values():
        for stat_key in stats_for:
            # A missing or null field simply contributes nothing instead of
            # aborting the whole run.
            for value in entry.get(stat_key) or ():
                stats[stat_key][value] += 1
        new_data.append(
            {key: value for key, value in entry.items() if key != DROPPED_KEY}
        )
    return new_data, stats


def write_outputs(new_data, stats, output_dir=OUTPUT_DIR):
    """Write the stripped data set and one statistics file per field.

    Args:
        new_data: List of stripped entries, dumped to ``gutenberg.json``.
        stats: Mapping ``field -> {value: count}``; each field is dumped to
            ``stats_<field>.json``.
        output_dir: Target directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    for stat_key, counts in stats.items():
        path = os.path.join(output_dir, 'stats_' + stat_key + '.json')
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(counts, f)
    with open(os.path.join(output_dir, 'gutenberg.json'), 'w',
              encoding='utf-8') as f:
        json.dump(new_data, f)


def main(argv=None, output_dir=OUTPUT_DIR):
    """Run the conversion; return a process exit status.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``. Only the first argument (the metadata file
            path) is used.
        output_dir: Directory that receives the generated files.

    Returns:
        ``0`` on success, ``2`` when the metadata file argument is missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print('usage: process-metadata.py <metadata.json>', file=sys.stderr)
        return 2

    # JSON is UTF-8 by specification; do not rely on the locale's default.
    with open(args[0], 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data, stats = build_dataset(data)
    write_outputs(new_data, stats, output_dir)
    return 0


if __name__ == '__main__':
    sys.exit(main())
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user