You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rsvp-reader/scripts/process-metadata.py

27 lines
719 B
Python

import sys
from collections import defaultdict
import json
# Metadata keys for which a value-frequency file is generated.
STATS_FOR = ('language', 'rights', 'subject')


def process_metadata(data, stats_for=STATS_FOR):
    """Count values per stat key and strip ``formaturi`` from every entry.

    Args:
        data: Mapping whose values are metadata entries (dicts). Only the
            values are used; the mapping's keys are ignored.
        stats_for: Entry keys to build frequency tables for.

    Returns:
        A ``(stats, new_data)`` tuple. ``stats`` maps each key in
        ``stats_for`` to a ``defaultdict(int)`` of value -> occurrence count.
        ``new_data`` is a list with one dict per entry, in iteration order,
        holding every key of the entry except ``'formaturi'``.

    Raises:
        KeyError: If an entry lacks one of the keys in ``stats_for``.
    """
    stats = {key: defaultdict(int) for key in stats_for}
    new_data = []

    for entry in data.values():
        for stat_key in stats_for:
            # NOTE(review): assumes entry[stat_key] is a list of hashable
            # values; a plain string here would be counted per character.
            for value in entry[stat_key]:
                stats[stat_key][value] += 1

        # Keep everything but the 'formaturi' field in the stripped copy.
        new_data.append(
            {key: entry[key] for key in entry if key != 'formaturi'}
        )

    return stats, new_data


def main(argv=None):
    """Read the metadata JSON named on the command line and write outputs.

    Writes one ``data/stats_<key>.json`` file per stat key and the stripped
    entries to ``data/gutenberg.json``. Paths are relative to the current
    working directory; the ``data`` directory is not created here.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            path of the input JSON file.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: process-metadata.py METADATA_JSON')

    # JSON is exchanged as UTF-8; do not depend on the platform default.
    with open(argv[1], 'r', encoding='utf-8') as f:
        data = json.load(f)

    stats, new_data = process_metadata(data)

    for stat, counts in stats.items():
        with open('data/stats_' + stat + '.json', 'w') as f:
            json.dump(counts, f)

    with open('data/gutenberg.json', 'w') as f:
        json.dump(new_data, f)


if __name__ == '__main__':
    main()