#! /usr/bin/env python3
"""Generate JPEG thumbnails for JSON sample records.

Every file found under ``data/raw`` is parsed as JSON; the base64 image held
in its ``image_data_url`` field is decoded, shrunk to fit within 100x100
pixels and written to ``data/thumb/<basename>.jpg``. Records are processed in
parallel by a ``multiprocessing`` pool.
"""
import base64
import json
import multiprocessing as mp
import os
from io import BytesIO

RAW_DIR = "data/raw"
THUMB_DIR = "data/thumb"
THUMB_SIZE = (100, 100)

# The only data-URL flavours this script knows how to decode.
_DATA_URL_PREFIXES = (
    "data:image/jpeg;base64,",
    "data:image/png;base64,",
)

# Image modes that can be written as JPEG without conversion.
_JPEG_MODES = ("L", "RGB", "CMYK")


def data_url_to_bin(image_data_url):
    """Decode a base64 JPEG/PNG data URL into raw bytes.

    Returns the decoded bytes, or ``None`` when *image_data_url* is empty or
    does not start with one of the supported ``data:image/...;base64,``
    prefixes.
    """
    if not image_data_url:
        return None
    for prefix in _DATA_URL_PREFIXES:
        if image_data_url.startswith(prefix):
            return base64.b64decode(image_data_url[len(prefix):])
    return None


def get_files(d):
    """Yield the path of every file below directory *d*, recursively."""
    for root, _dirs, files in os.walk(d):
        for name in files:
            yield os.path.join(root, name)


class Record:
    """One JSON sample file on disk.

    Construction is cheap and performs no I/O; call :meth:`load` to parse the
    file before calling :meth:`make_thumb`.
    """

    def __init__(self, file):
        self.file = file
        self.basename = os.path.basename(self.file)
        # Parsed JSON content; populated by load().
        self.data = None

    def load(self):
        """Parse the JSON file into ``self.data``."""
        # Context manager so the handle is closed promptly; the pool may go
        # through a very large number of files.
        with open(self.file, encoding="utf-8") as fh:
            self.data = json.load(fh)

    def make_thumb(self, path, size=THUMB_SIZE):
        """Write a JPEG thumbnail of the record's image into directory *path*.

        The thumbnail is saved as ``<path>/<basename>.jpg`` and fits within
        *size* (width, height). Nothing is written when the record's
        ``image_data_url`` is empty or not a supported data URL.

        Raises ``KeyError`` if the record has no ``image_data_url`` field.
        """
        raw = data_url_to_bin(self.data["image_data_url"])
        if not raw:
            return

        # Imported here rather than at module level so the helpers that do
        # not touch images remain importable without Pillow installed.
        from PIL import Image

        # exist_ok tolerates several workers creating the directory at once.
        os.makedirs(path, exist_ok=True)
        with Image.open(BytesIO(raw)) as img:
            img.thumbnail(size)
            # PNG input is often RGBA or palette based, which the JPEG writer
            # rejects; flatten those to RGB before saving.
            thumb = img if img.mode in _JPEG_MODES else img.convert("RGB")
            thumb.save(os.path.join(path, f"{self.basename}.jpg"))


def load_samples(raw_dir):
    """Yield an unloaded :class:`Record` for every file below *raw_dir*.

    The files are not parsed here; that happens in the worker processes.
    """
    for file in get_files(raw_dir):
        yield Record(file)


def process_one(rec):
    """Pool entry point: process *rec*, reporting which file failed.

    Any exception is printed together with the file name and then re-raised,
    so the caller still sees the failure.
    """
    try:
        do_process_one(rec)
    except Exception as e:
        print(f"Error processing {rec.file}: {e}")
        raise


def do_process_one(rec):
    """Load *rec* and write its thumbnail into ``THUMB_DIR``."""
    rec.load()
    rec.make_thumb(THUMB_DIR)


def main():
    """Create thumbnails for every sample under ``RAW_DIR`` in parallel."""
    samples = list(load_samples(RAW_DIR))
    print(len(samples))
    if not samples:
        return
    # Leave one core free, but never request an empty pool: on a single-CPU
    # host cpu_count() - 1 is 0, which Pool rejects.
    workers = max(1, mp.cpu_count() - 1)
    with mp.Pool(workers) as pool:
        pool.map(process_one, samples)


if __name__ == "__main__":
    main()