Graduation_Project/QN/RecipeRetrieval/preprocess/create_dataset_json.py

import json
import os
import random
from collections import defaultdict
from glob import glob

from tqdm import tqdm
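
# The SBU annotation file is expected to be a JSON list of
# [image_relative_path, caption] pairs -- an assumption read off the
# indexing in create_sbu_json below. Hypothetical entry:
#   ["images/0001/12345.jpg", "a dog running on the beach"]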


def create_sbu_json(annot_path, output_dir):
    """Flatten SBU annotations into a list of {'image', 'caption'} records."""
    os.makedirs(output_dir, exist_ok=True)
    with open(annot_path, 'r') as fp:  # close the file instead of leaking the handle
        data = json.load(fp)
    new_data = []
    for d in tqdm(data):
        # d[0] is the image path, d[1] its caption; point the path at images_train/.
        new_data.append({'image': d[0].replace('images', 'images_train'), 'caption': d[1]})
    out_path = os.path.join(output_dir, 'sbu.json')
    with open(out_path, 'w') as file:
        json.dump(new_data, file)


def to_dict_coco(path, iid2captions, iid2split, iid2id):
    """Return the Karpathy split and the caption records for one COCO image."""
    name = os.path.basename(path)
    split = iid2split[name]
    id_ = iid2id[name]
    di = [{'image': path, 'caption': c, 'image_id': id_} for c in iid2captions[name]]
    return split, di
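
# Sketch of the Karpathy dataset_coco.json structure consumed below,
# reduced to the keys this script actually reads (the entry values are
# illustrative, not taken from the real file):
#   {"images": [{"filename": "COCO_val2014_000000391895.jpg",
#                "cocoid": 391895,
#                "split": "train",
#                "sentences": [{"raw": "a man riding a motorcycle"}, ...]}]}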


def create_coco_json(data_dir, output_dir, split=('train', 'val'), output_file='coco.json'):
    """Build a flat caption JSON for the requested Karpathy splits of COCO."""
    with open(f"{data_dir}/karpathy/dataset_coco.json", "r") as fp:
        captions = json.load(fp)
    captions = captions["images"]

    # Index captions, split, and COCO id by image filename.
    iid2captions = defaultdict(list)
    iid2split = dict()
    iid2id = dict()
    for cap in tqdm(captions):
        filename = cap["filename"]
        iid2id[filename] = cap['cocoid']
        iid2split[filename] = cap["split"]
        for c in cap["sentences"]:
            iid2captions[filename].append(c["raw"])

    paths = list(glob(f"{data_dir}/train2014/*.jpg")) + list(glob(f"{data_dir}/val2014/*.jpg"))
    random.shuffle(paths)
    caption_paths = [path for path in tqdm(paths) if os.path.basename(path) in iid2captions]
    if len(paths) == len(caption_paths):
        print("all images have caption annotations")
    else:
        print("not all images have caption annotations")
    print(len(paths), len(caption_paths), len(iid2captions))

    # Keep only images whose Karpathy split is among the requested ones.
    new_data = []
    num = 0
    other_splits = set()
    for path in tqdm(caption_paths):
        s, di = to_dict_coco(path, iid2captions, iid2split, iid2id)
        if s in split:
            num += 1
            new_data.extend(di)
        else:
            other_splits.add(s)
    print(split, num, 'images', ', other_splits:', other_splits)

    os.makedirs(output_dir, exist_ok=True)  # mirror create_sbu_json
    out_path = os.path.join(output_dir, output_file)
    with open(out_path, 'w') as file:
        json.dump(new_data, file)


def to_dict_vg(path, iid2captions):
    """Return the region-caption records for one Visual Genome image."""
    name = os.path.basename(path)
    iid = int(name[:-4])  # strip '.jpg' to recover the numeric image id
    captions = [c["phrase"] for c in iid2captions[iid]]
    return [{'image': path, 'caption': c} for c in captions]
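
# Sketch of region_descriptions.json as read below, reduced to the keys
# this script uses (values are illustrative):
#   [{"regions": [{"image_id": 1, "phrase": "a red car parked on the street"},
#                 ...]},
#    ...]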


def create_vg_json(data_dir, output_dir, split=('train', 'val')):
    """Build a flat caption JSON from Visual Genome region descriptions.

    Note: `split` is currently unused; region descriptions carry no
    train/val split information.
    """
    with open(f"{data_dir}/annotations/region_descriptions.json", "r") as fp:
        captions = json.load(fp)

    # Index region dicts by image id.
    iid2captions = defaultdict(list)
    for cap in tqdm(captions):
        for c in cap["regions"]:
            iid2captions[c["image_id"]].append(c)

    paths = list(glob(f"{data_dir}/images/VG_100K/*.jpg")) + list(
        glob(f"{data_dir}/images/VG_100K_2/*.jpg")
    )
    random.shuffle(paths)
    caption_paths = [
        path for path in paths if int(os.path.basename(path)[:-4]) in iid2captions
    ]
    if len(paths) == len(caption_paths):
        print("all images have caption annotations")
    else:
        print("not all images have caption annotations")
    print(len(paths), len(caption_paths), len(iid2captions))

    new_data = []
    for path in tqdm(caption_paths):
        new_data.extend(to_dict_vg(path, iid2captions))

    os.makedirs(output_dir, exist_ok=True)  # mirror create_sbu_json
    out_path = os.path.join(output_dir, 'vg.json')
    with open(out_path, 'w') as file:
        json.dump(new_data, file)
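

if __name__ == '__main__':
    # Minimal usage sketch. The directory layout below is an assumption for
    # illustration, not something this repo prescribes; adjust to your setup.
    # create_sbu_json('data/sbu/annot.json', 'data/json')                 # hypothetical paths
    # create_coco_json('data/coco', 'data/json', split=('train', 'val'))  # hypothetical paths
    # create_vg_json('data/vg', 'data/json')                              # hypothetical paths
    pass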