rsvp-reader/scripts/process-metadata.py

31 lines
832 B
Python

import sys
from collections import defaultdict
import json
# Keys for which a value-frequency file (data/stats_<key>.json) is generated.
STATS_FOR = ('language', 'rights', 'subject')


def process_metadata(data, stats_for=STATS_FOR):
    """Strip ``formaturi`` from every entry and count values of selected keys.

    Args:
        data: Mapping of book id -> metadata entry (a dict).
            NOTE(review): the values under the ``stats_for`` keys are iterated
            element by element, so they are presumably lists of strings;
            confirm against the input file.
        stats_for: Entry keys whose values should be tallied.

    Returns:
        A ``(new_data, stats)`` tuple. ``new_data`` is a list with one dict
        per book: a copy of the entry without ``'formaturi'`` and with the
        book id stored under ``'id'``. ``stats`` maps each key in
        ``stats_for`` to a ``defaultdict(int)`` of value -> occurrence count.
    """
    new_data = []
    stats = {key: defaultdict(int) for key in stats_for}
    for book_id, entry in data.items():
        # Copy the entry without 'formaturi', then record the id that was
        # only available as the mapping key.
        new_entry = {key: value for key, value in entry.items()
                     if key != 'formaturi'}
        new_entry['id'] = book_id
        new_data.append(new_entry)

        for stat_key in stats_for:
            # A missing or null field contributes nothing instead of
            # aborting the whole run with KeyError/TypeError.
            for value in entry.get(stat_key) or ():
                stats[stat_key][value] += 1
    return new_data, stats


def main(argv=None):
    """Read the metadata JSON named on the command line and write the outputs.

    Writes ``data/stats_<key>.json`` for every key in ``STATS_FOR`` and then
    ``data/gutenberg.json`` with the stripped entries. Paths are relative to
    the current working directory; the ``data`` directory must already exist.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Only the first argument (input path) is used.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('usage: process-metadata.py <metadata.json>')

    # Explicit encoding: JSON is UTF-8, while open()'s default follows the
    # platform locale.
    with open(args[0], 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data, stats = process_metadata(data)

    for stat_key, counts in stats.items():
        with open('data/stats_' + stat_key + '.json', 'w',
                  encoding='utf-8') as f:
            json.dump(counts, f)
    with open('data/gutenberg.json', 'w', encoding='utf-8') as f:
        json.dump(new_data, f)


if __name__ == '__main__':
    main()