#! /usr/bin/env python3
"""Generate JPEG thumbnails from JSON records that embed image data URLs.

Every file found under ``RAW_DIR`` is parsed as JSON.  When its
``image_data_url`` field is a base64 JPEG or PNG data URL, the image is
shrunk to fit within ``THUMB_SIZE`` and written to
``THUMB_DIR/<basename>.jpg``.  Records are processed in a process pool.
"""
import base64
import json
import multiprocessing as mp
import os
from io import BytesIO
from typing import Iterator, Optional

from PIL import Image

RAW_DIR = "data/raw"
THUMB_DIR = "data/thumb"
THUMB_SIZE = (100, 100)

# Data-URL headers whose payload we know how to decode.
_DATA_URL_PREFIXES = (
    "data:image/jpeg;base64,",
    "data:image/png;base64,",
)

# Pixel modes Pillow's JPEG writer accepts; anything else (RGBA, LA, P, ...)
# must be converted before saving or Pillow raises OSError.
_JPEG_MODES = ("L", "RGB", "CMYK")


def data_url_to_bin(image_data_url: Optional[str]) -> Optional[bytes]:
    """Decode a base64 JPEG/PNG data URL into raw image bytes.

    Args:
        image_data_url: The data URL, or a falsy value (``None``/``""``).

    Returns:
        The decoded bytes, or ``None`` when the input is falsy or does not
        start with one of the supported ``data:image/...;base64,`` headers.
    """
    if not image_data_url:
        return None
    for prefix in _DATA_URL_PREFIXES:
        if image_data_url.startswith(prefix):
            return base64.b64decode(image_data_url[len(prefix):])
    return None


def get_files(d: str) -> Iterator[str]:
    """Yield the path of every file under directory *d*, recursively."""
    for root, _dirs, names in os.walk(d):
        for name in names:
            yield os.path.join(root, name)


class Record:
    """One JSON sample file; its content is read lazily by :meth:`load`."""

    def __init__(self, file):
        self.file = file
        self.basename = os.path.basename(self.file)

    def load(self):
        """Parse the JSON file into ``self.data``.

        The file is opened in binary mode so that ``json`` detects the
        encoding itself instead of depending on the platform's locale
        default, and inside ``with`` so the handle is always closed.
        """
        with open(self.file, "rb") as fh:
            self.data = json.load(fh)

    def make_thumb(self, path):
        """Write ``<path>/<basename>.jpg`` as a thumbnail of the record image.

        Does nothing when ``image_data_url`` is not a supported data URL.
        :meth:`load` must have been called first.

        Raises:
            KeyError: if the record has no ``image_data_url`` field.
        """
        raw = data_url_to_bin(self.data["image_data_url"])
        if not raw:
            return
        img = Image.open(BytesIO(raw))
        # PNG sources may carry alpha or a palette, which JPEG cannot store.
        if img.mode not in _JPEG_MODES:
            img = img.convert("RGB")
        img.thumbnail(THUMB_SIZE)
        img.save(os.path.join(path, f"{self.basename}.jpg"))


def load_samples(raw_dir):
    """Yield an unloaded :class:`Record` for every file under *raw_dir*."""
    for file in get_files(raw_dir):
        try:
            rec = Record(file)
            yield rec
        except Exception as e:
            print(f"Error loading {file}: {e}")


def process_one(rec):
    """Pool worker entry point: process *rec*, reporting which file failed.

    The exception is re-raised after being printed so the failure still
    propagates to the parent through ``Pool.map``.
    """
    try:
        do_process_one(rec)
    except Exception as e:
        print(f"Error processing {rec.file}: {e}")
        raise


def do_process_one(rec):
    """Load *rec* from disk and write its thumbnail into ``THUMB_DIR``."""
    rec.load()
    rec.make_thumb(THUMB_DIR)


def _worker_count():
    """Return the pool size: all cores but one, and never fewer than one.

    ``cpu_count() - 1`` is 0 on a single-core machine, which ``mp.Pool``
    rejects with ``ValueError``.
    """
    return max(1, mp.cpu_count() - 1)


def main():
    """Thumbnail every sample under ``RAW_DIR`` using a process pool."""
    samples = list(load_samples(RAW_DIR))
    print(len(samples))
    # Create the output directory up front so workers do not all fail on a
    # fresh checkout where it does not exist yet.
    os.makedirs(THUMB_DIR, exist_ok=True)
    with mp.Pool(_worker_count()) as pool:
        pool.map(process_one, samples)


if __name__ == "__main__":
    main()