research: Add process.py

This commit is contained in:
Fam Zheng 2025-02-21 21:37:29 +00:00
parent ff74600819
commit 28dac7da3f

66
research/process.py Executable file
View File

@@ -0,0 +1,66 @@
#! /usr/bin/env python3
import json
import os
import base64
from PIL import Image
import multiprocessing as mp
from io import BytesIO
def data_url_to_bin(image_data_url):
    """Decode a base64 JPEG or PNG data URL into raw bytes.

    Returns the decoded payload when ``image_data_url`` starts with
    ``data:image/jpeg;base64,`` or ``data:image/png;base64,``. Returns
    ``None`` for falsy input or for any other prefix.
    """
    if not image_data_url:
        return None
    for prefix in ("data:image/jpeg;base64,", "data:image/png;base64,"):
        if image_data_url.startswith(prefix):
            payload = image_data_url[len(prefix):]
            return base64.b64decode(payload)
    return None
def get_files(d):
    """Yield the path of every file found beneath directory ``d``.

    The tree is traversed with ``os.walk``; each file name is joined onto
    the directory that contains it.
    """
    for parent, _subdirs, names in os.walk(d):
        yield from (os.path.join(parent, name) for name in names)
class Record:
    """One raw sample file and the operations performed on it.

    ``file`` and ``basename`` are set by the constructor; ``data`` only
    exists after :meth:`load` has been called.
    """

    def __init__(self, file):
        # Path of the JSON sample on disk.
        self.file = file
        # File name without its directory; used to name the thumbnail.
        self.basename = os.path.basename(self.file)

    def load(self):
        """Parse the JSON document at ``self.file`` into ``self.data``."""
        # Binary mode lets json.load detect the UTF-8/16/32 encoding itself
        # instead of depending on the locale; ``with`` closes the handle.
        with open(self.file, "rb") as fh:
            self.data = json.load(fh)

    def make_thumb(self, path):
        """Write a JPEG thumbnail of the record's image into ``path``.

        The image is taken from ``self.data["image_data_url"]``. Nothing is
        written when that value is not a JPEG/PNG base64 data URL or decodes
        to no bytes. The thumbnail fits within 100x100 pixels and is saved
        as ``<path>/<basename>.jpg``; ``path`` is created if it is missing.

        Raises:
            KeyError: if the loaded data has no ``image_data_url`` key.
        """
        raw = data_url_to_bin(self.data["image_data_url"])
        if not raw:
            return
        # exist_ok keeps this safe when several workers create it at once.
        os.makedirs(path, exist_ok=True)
        with Image.open(BytesIO(raw)) as img:
            img.thumbnail((100, 100))
            thumb = img
            # PNG sources may be RGBA / LA / P, which the JPEG writer
            # rejects with OSError; flatten those to RGB (alpha is dropped).
            if thumb.mode not in ("RGB", "L", "CMYK"):
                thumb = thumb.convert("RGB")
            thumb.save(f"{path}/{self.basename}.jpg")
def load_samples(raw_dir):
    """Yield a ``Record`` for each file found under ``raw_dir``.

    A file whose ``Record`` cannot be constructed is reported on stdout
    and skipped; iteration continues with the next file.
    """
    for path in get_files(raw_dir):
        try:
            yield Record(path)
        except Exception as err:
            print(f"Error loading {path}: {err}")
def process_one(rec):
    """Process one record, reporting any failure before re-raising it.

    The message names ``rec.file`` so the failing input can be identified
    from the output; the original exception then propagates to the caller.
    """
    try:
        do_process_one(rec)
    except Exception as exc:
        print(f"Error processing {rec.file}: {exc}")
        raise
def do_process_one(rec):
    """Load ``rec`` and write its thumbnail under ``data/thumb``."""
    thumb_dir = "data/thumb"
    rec.load()
    rec.make_thumb(thumb_dir)
def main():
    """Thumbnail every sample under ``data/raw`` using a process pool.

    Prints the number of samples found, then maps ``process_one`` over
    them in parallel worker processes.
    """
    samples = list(load_samples("data/raw"))
    print(len(samples))
    # Leave one core free, but never request zero workers: on a single-CPU
    # host cpu_count() - 1 is 0 and Pool() raises ValueError.
    workers = max(1, mp.cpu_count() - 1)
    with mp.Pool(workers) as pool:
        pool.map(process_one, samples)
# Run the pipeline only when executed as a script. main() starts a
# multiprocessing pool, so the guard also keeps worker processes that
# re-import this module from launching the pipeline again.
if __name__ == "__main__":
    main()