"""Strip bulky fields from a book-metadata JSON file and emit value statistics.

Usage: python <this script> <metadata.json>

The input is a JSON object mapping a book id to a metadata entry (a JSON
object).  Two kinds of output are written to the ``data/`` directory:

* ``data/gutenberg.json`` -- a list of all entries with the ``formaturi``
  key removed and the book id stored under ``id``.
* ``data/stats_<key>.json`` -- for each key in ``STATS_FOR``, an object
  mapping every value seen under that key to its number of occurrences.
"""
import json
import os
import sys
from collections import defaultdict

# Entry keys for which a value-frequency file is generated.
STATS_FOR = ['language', 'rights', 'subject']

# Entry key dropped from the stripped output.
STRIPPED_KEY = 'formaturi'

# Directory (relative to the working directory) receiving all output files.
OUTPUT_DIR = 'data'


def iter_values(raw):
    """Yield the individual values stored under one entry key.

    ``None`` yields nothing, a bare string is treated as one value (rather
    than being iterated character by character), and any other iterable is
    yielded element by element.
    """
    if raw is None:
        return
    if isinstance(raw, str):
        yield raw
        return
    for value in raw:
        yield value


def build_outputs(data, stats_for=STATS_FOR):
    """Compute the stripped entry list and the per-key value counts.

    Args:
        data: Mapping of book id to metadata entry (a dict).
        stats_for: Entry keys whose values should be counted.

    Returns:
        A ``(new_data, stats)`` tuple.  ``new_data`` is a list of entry
        copies without ``formaturi`` and with ``id`` set to the book id
        (overriding any ``id`` key already present in the entry).  ``stats``
        maps each key of ``stats_for`` to a dict of ``value -> count``.
    """
    new_data = []
    stats = {key: defaultdict(int) for key in stats_for}

    for book_id, entry in data.items():
        new_entry = {
            key: value for key, value in entry.items() if key != STRIPPED_KEY
        }
        new_entry['id'] = book_id
        new_data.append(new_entry)

        for stat_key in stats_for:
            # .get(): an entry lacking a stat key contributes nothing
            # instead of aborting the whole run with a KeyError.
            for value in iter_values(entry.get(stat_key)):
                stats[stat_key][value] += 1

    return new_data, stats


def write_json(path, payload):
    """Serialize ``payload`` as JSON into the file at ``path``."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f)


def main(argv=None):
    """Run the script: read ``argv[1]``, write the files under ``data/``."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit('usage: ' + argv[0] + ' <metadata.json>')

    with open(argv[1], 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data, stats = build_outputs(data)

    # Create the output directory up front so the writes below cannot fail
    # merely because it is missing.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for stat, counts in stats.items():
        write_json(os.path.join(OUTPUT_DIR, 'stats_' + stat + '.json'), counts)

    write_json(os.path.join(OUTPUT_DIR, 'gutenberg.json'), new_data)


if __name__ == '__main__':
    main()