research: Add process.py
This commit is contained in:
parent
ff74600819
commit
28dac7da3f
66
research/process.py
Executable file
66
research/process.py
Executable file
@ -0,0 +1,66 @@
|
|||||||
|
#! /usr/bin/env python3
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import base64
|
||||||
|
from PIL import Image
|
||||||
|
import multiprocessing as mp
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
def data_url_to_bin(image_data_url):
    """Decode a base64 JPEG or PNG data URL into raw bytes.

    Returns the decoded payload when *image_data_url* starts with
    ``data:image/jpeg;base64,`` or ``data:image/png;base64,``.  Any other
    value, including ``None`` and the empty string, yields ``None``.
    """
    if not image_data_url:
        return None
    # Only these two media types are recognised; the prefix is stripped
    # and the remainder is treated as the base64 payload.
    for prefix in ("data:image/jpeg;base64,", "data:image/png;base64,"):
        if image_data_url.startswith(prefix):
            return base64.b64decode(image_data_url[len(prefix):])
    return None
|
||||||
|
|
||||||
|
def get_files(d):
    """Yield the path of every file found under directory *d*, recursively.

    Paths are built by joining each directory visited by ``os.walk`` with
    the file names it reports, in the order ``os.walk`` produces them.
    """
    for parent, _subdirs, names in os.walk(d):
        yield from (os.path.join(parent, name) for name in names)
|
||||||
|
|
||||||
|
class Record:
    """One raw sample stored as a JSON file on disk.

    The constructor only records the path; the JSON content is read
    lazily by :meth:`load` and stored on ``self.data``.
    """

    # Image modes that can be written as JPEG without conversion.
    # NOTE(review): taken from Pillow's documented JPEG support (L, RGB,
    # CMYK) -- confirm against the installed Pillow version.
    _JPEG_MODES = ("L", "RGB", "CMYK")

    def __init__(self, file):
        """Remember the path *file* and its base name."""
        self.file = file
        self.basename = os.path.basename(self.file)

    def load(self):
        """Parse the JSON file at ``self.file`` into ``self.data``.

        Raises whatever ``open`` or ``json.load`` raise for a missing or
        malformed file.
        """
        # Use a context manager so the handle is closed promptly instead
        # of being left for the garbage collector.
        with open(self.file) as fh:
            self.data = json.load(fh)

    def make_thumb(self, path):
        """Write a thumbnail of the record's image into directory *path*.

        The image is taken from ``self.data["image_data_url"]``.  When that
        value is not a recognised JPEG/PNG data URL nothing is written.
        The thumbnail is bounded by 100x100 pixels and saved as
        ``<path>/<basename>.jpg``.  :meth:`load` must have been called first.
        """
        raw = data_url_to_bin(self.data["image_data_url"])
        if not raw:
            return
        img = Image.open(BytesIO(raw))
        # PNG sources may be RGBA, LA or palette images, which cannot be
        # written as JPEG; convert them to RGB (dropping any alpha channel)
        # before resizing so saving does not fail.
        if img.mode not in self._JPEG_MODES:
            img = img.convert("RGB")
        img.thumbnail((100, 100))
        img.save(f"{path}/{self.basename}.jpg")
|
||||||
|
|
||||||
|
def load_samples(raw_dir):
    """Yield a ``Record`` for every file found under *raw_dir*.

    Records are only constructed here, not loaded.  If constructing a
    record raises, the error is printed and that file is skipped.
    """
    for file in get_files(raw_dir):
        try:
            yield Record(file)
        except Exception as e:
            print(f"Error loading {file}: {e}")
|
||||||
|
|
||||||
|
def process_one(rec):
    """Process a single record, reporting any failure before re-raising.

    Delegates to ``do_process_one``.  On error the record's file and the
    exception are printed, then the exception propagates to the caller.
    """
    try:
        do_process_one(rec)
    except Exception as e:
        # Print first so the failing file is identified, then let the
        # original exception continue upward unchanged.
        print(f"Error processing {rec.file}: {e}")
        raise
|
||||||
|
|
||||||
|
def do_process_one(rec, thumb_dir="data/thumb"):
    """Load *rec* and write its thumbnail into *thumb_dir*.

    Args:
        rec: An object providing ``load()`` and ``make_thumb(path)``.
        thumb_dir: Directory passed to ``rec.make_thumb``.  Defaults to
            ``"data/thumb"``, the location this function previously
            hard-coded, so existing single-argument callers are unaffected.
    """
    rec.load()
    rec.make_thumb(thumb_dir)
|
||||||
|
|
||||||
|
def main():
    """Thumbnail every sample under ``data/raw`` using a process pool.

    Collects all records, prints how many were found, then maps
    ``process_one`` over them in worker processes.
    """
    samples = list(load_samples("data/raw"))
    print(len(samples))
    # Leave one CPU free for the rest of the system, but never request
    # fewer than one worker: on a single-CPU host ``cpu_count() - 1`` is 0
    # and ``mp.Pool(0)`` raises ValueError.
    workers = max(1, mp.cpu_count() - 1)
    with mp.Pool(workers) as pool:
        pool.map(process_one, samples)
|
||||||
|
|
||||||
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
Loading…
x
Reference in New Issue
Block a user