themblem/emblem5/scripts/refresh-dataset.py
2025-10-29 21:27:29 +00:00

42 lines
1.2 KiB
Python
Executable File

#!/usr/bin/env python3
import os
import sys
import shutil
import json
import random
def _label_for_sample(sample, md_path):
    """Return 'pos', 'neg', or None for one sample.

    The sample name is checked first (substring match on 'pos', then 'neg').
    Only when the name carries neither marker is the JSON file at *md_path*
    consulted, using its 'labels' entry. None means the sample has no usable
    label and should be skipped.
    """
    if 'pos' in sample:
        return 'pos'
    if 'neg' in sample:
        return 'neg'
    try:
        # Binary mode lets json.load detect the encoding itself instead of
        # depending on the process locale.
        with open(md_path, 'rb') as f:
            md = json.load(f)
    except FileNotFoundError:
        return None
    except ValueError as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        # One corrupt metadata file should not abort the whole refresh.
        print(f'skipping {sample}: unreadable metadata ({err})',
              file=sys.stderr)
        return None
    # A metadata file without a 'labels' entry means "unlabeled", not a crash.
    labels = md.get('labels') if isinstance(md, dict) else None
    if not labels:
        return None
    if 'pos' in labels:
        return 'pos'
    if 'neg' in labels:
        return 'neg'
    return None


def main(dataset_base='/data/dataset', samples_dir='data/samples', limit=1000):
    """Rebuild the pos/neg image dataset from a random subset of samples.

    The 'pos' and 'neg' directories under *dataset_base* are emptied and
    recreated. Then up to *limit* randomly chosen entries of *samples_dir*
    are examined; for each one that has a label and a 'full-sbs.jpg' file,
    that file is copied to '<dataset_base>/<label>/<sample>.jpg' and its
    source path is printed.

    Args:
        dataset_base: Directory that holds the 'pos' and 'neg' output dirs.
        samples_dir: Directory containing one sub-directory per sample.
        limit: Maximum number of samples to examine after shuffling;
            None examines all of them.

    Raises:
        FileNotFoundError: If *samples_dir* does not exist. This is raised
            before the existing dataset is touched.
    """
    # List the samples BEFORE wiping the dataset, so a missing or unreadable
    # samples directory cannot leave us with an emptied dataset.
    all_samples = os.listdir(samples_dir)
    random.shuffle(all_samples)

    for label in ('pos', 'neg'):
        class_dir = os.path.join(dataset_base, label)
        try:
            shutil.rmtree(class_dir)
        except FileNotFoundError:
            # First run: nothing to clear yet.
            pass
        os.makedirs(class_dir, exist_ok=True)

    for sample in all_samples[:limit]:
        md_name = f'{samples_dir}/{sample}/metadata.json'
        pos_or_neg = _label_for_sample(sample, md_name)
        if not pos_or_neg:
            continue
        src = f'{samples_dir}/{sample}/full-sbs.jpg'
        if not os.path.exists(src):
            continue
        print(src)
        newname = f'{sample}.jpg'
        shutil.copy(src, os.path.join(dataset_base, pos_or_neg, newname))
# Script entry point: run the refresh only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()