85 lines
3.1 KiB
Python
85 lines
3.1 KiB
Python
|
# -----------------------------------------------------------
|
||
|
# Stacked Cross Attention Network implementation based on
|
||
|
# https://arxiv.org/abs/1803.08024.
|
||
|
# "Stacked Cross Attention for Image-Text Matching"
|
||
|
# Kuang-Huei Lee, Xi Chen, Gang Hua, Houdong Hu, Xiaodong He
|
||
|
#
|
||
|
# Written by Kuang-Huei Lee, 2018
|
||
|
# ---------------------------------------------------------------
|
||
|
"""Convert image features from bottom up attention to numpy array"""
|
||
|
import os
|
||
|
import base64
|
||
|
import csv
|
||
|
import sys
|
||
|
import zlib
|
||
|
import json
|
||
|
import argparse
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
|
||
|
# Command-line options for converting bottom-up-attention TSV features
# into one stacked numpy array per dataset split.
parser = argparse.ArgumentParser()
parser.add_argument('--imgid_list', default='../data/coco_precomp/train_ids.txt',
                    help='Path to list of image id')
# nargs='+' collects user-supplied paths into a list. Without it, a single
# "--input_file foo.tsv" yields a plain string, and the conversion loop
# ("for input_file in opt.input_file") would iterate over its characters.
# The default (already a list) is unchanged, so existing behavior is kept.
parser.add_argument('--input_file', nargs='+',
                    default=['../data/bu_data/trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.0'],
                    help='tsv of all image data (output of bottom-up-attention/tools/generate_tsv.py), '
                         'where each columns are: [image_id, image_w, image_h, num_boxes, boxes, features].')
parser.add_argument('--output_dir', default='../data/coco_precomp/',
                    help='Output directory.')
parser.add_argument('--split', default='train',
                    help='train|dev|test')
opt = parser.parse_args()
print(opt)
|
||
|
|
||
|
|
||
|
# Read the ordered list of image ids for this split. "meta" preserves the
# file order (the output array rows follow it); "feature" maps each id to its
# region-feature matrix, filled in while scanning the TSV shards below.
meta = []
feature = {}
# Use a context manager so the id-list file is closed deterministically
# (the original left the handle open for the life of the process).
with open(opt.imgid_list) as imgid_file:
    for line in imgid_file:
        sid = int(line.strip())
        meta.append(sid)
        feature[sid] = None
|
||
|
|
||
|
# The TSV feature columns are huge base64 blobs, far beyond csv's default
# field-size limit. Raise the limit as high as this platform accepts:
# start at sys.maxsize and shrink by a factor of 10 until the C layer
# stops raising OverflowError.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Too large for the platform's C long; retry with a smaller cap.
        maxInt = int(maxInt / 10)
    else:
        break
|
||
|
|
||
|
# Column order of each row in the bottom-up-attention TSV shards
# (output of bottom-up-attention/tools/generate_tsv.py).
FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes', 'features']
|
||
|
|
||
|
if __name__ == '__main__':
    # Pass 1: scan every TSV shard and keep the region-feature matrix for
    # each image id requested in opt.imgid_list.
    for input_file in opt.input_file:
        # Open read-only: the script never writes to the shard, and the
        # original "r+t" mode would fail on read-only files.
        with open(input_file, "r") as tsv_in_file:
            reader = csv.DictReader(tsv_in_file, delimiter='\t', fieldnames=FIELDNAMES)
            for item in reader:
                item['image_id'] = int(item['image_id'])
                item['image_h'] = int(item['image_h'])
                item['image_w'] = int(item['image_w'])
                item['num_boxes'] = int(item['num_boxes'])
                for field in ['boxes', 'features']:
                    # base64.decodestring was removed in Python 3.9;
                    # b64decode is the equivalent replacement.
                    item[field] = np.frombuffer(base64.b64decode(item[field].encode()),
                                                dtype=np.float32).reshape((item['num_boxes'], -1))
                # Only keep features for ids listed in opt.imgid_list.
                if item['image_id'] in feature:
                    feature[item['image_id']] = item['features']

    # Pass 2: pad (or truncate) every image to exactly 36 regions of
    # 2048-dim features, in the order given by the id list.
    data_out = []
    for sid in meta:
        # NOTE(review): np.zeros defaults to float64 while the decoded
        # features are float32, so np.stack upcasts everything to float64.
        # Kept as-is to preserve the saved array's dtype.
        padding_data = np.zeros((36, 2048))
        region_num = np.shape(feature[sid])[0]
        if region_num <= 36:
            padding_data[:region_num, :] = feature[sid]
        else:
            padding_data = feature[sid][:36, :]
        data_out.append(padding_data)

    data_out = np.stack(data_out, axis=0)
    print("Final numpy array shape:", data_out.shape)
    np.save(os.path.join(opt.output_dir, '{}_ims.npy'.format(opt.split)), data_out)
|