1703 lines
67 KiB
Python
1703 lines
67 KiB
Python
|
# -----------------------------------------------------------
|
||
|
# "BCAN++: Cross-modal Retrieval With Bidirectional Correct Attention Network"
|
||
|
# Yang Liu, Hong Liu, Huaqiu Wang, Fanyang Meng, Mengyuan Liu*
|
||
|
#
|
||
|
# ---------------------------------------------------------------
|
||
|
"""Evaluation"""
|
||
|
|
||
|
from __future__ import print_function
|
||
|
|
||
|
import argparse
|
||
|
import logging
|
||
|
import os
|
||
|
|
||
|
import sys
|
||
|
import time
|
||
|
import numpy as np
|
||
|
import torch
|
||
|
from collections import OrderedDict
|
||
|
import time
|
||
|
from torch.autograd import Variable
|
||
|
from transformers import BertTokenizer
|
||
|
|
||
|
from data import get_test_loader
|
||
|
from data2 import get_test_loader2
|
||
|
from lib.datasets import image_caption
|
||
|
from lib.evaluation import eval_cxc
|
||
|
from lib.vse import VSEModel
|
||
|
from model import SCAN
|
||
|
from model2 import SCAN2
|
||
|
from model3 import SCAN3
|
||
|
from vocab import deserialize_vocab
|
||
|
|
||
|
|
||
|
class AverageMeter(object):
    """Keep the most recent value of a quantity and its weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=0):
        """Store `val` as the latest value and fold it in with weight `n`.

        With the default `n=0` the sum and the count are left unchanged, so
        the meter only remembers `val` (see `__str__`).
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # The small constant keeps the division defined while count is 0.
        self.avg = self.sum / (.0001 + self.count)

    def __str__(self):
        """Format the meter for a log line."""
        if self.count != 0:
            # a statistic: latest value followed by the running mean
            return '%.4f (%.4f)' % (self.val, self.avg)
        # a value that should be recorded exactly, e.g. an iteration number
        return str(self.val)
|
||
|
|
||
|
|
||
|
class LogCollector(object):
    """Named collection of meters that can be swapped between train and val."""

    def __init__(self):
        # An ordered mapping keeps the logged variables in a fixed order.
        self.meters = OrderedDict()

    def update(self, k, v, n=0):
        """Forward `(v, n)` to the meter named `k`, creating it on first use."""
        try:
            meter = self.meters[k]
        except KeyError:
            meter = self.meters[k] = AverageMeter()
        meter.update(v, n)

    def __str__(self):
        """Render every meter as `name value`, joined into one log line."""
        return ' '.join(
            name + ' ' + str(meter) for name, meter in self.meters.items())

    def tb_log(self, tb_logger, prefix='', step=None):
        """Send the latest value of every meter to `tb_logger.log_value`."""
        for name in self.meters:
            tb_logger.log_value(prefix + name, self.meters[name].val, step=step)
|
||
|
|
||
|
def encode_data2(model, data_loader, log_step=100, logging=print):
    """Encode all images and captions loadable by `data_loader`.

    The loader is iterated twice: a first pass finds the longest caption
    length (used to size the zero-padded caption array), and a second pass,
    run under `torch.no_grad()`, calls `forward_emb` on every batch and
    copies the outputs into NumPy arrays at the positions given by the
    sample ids the loader yields.

    Args:
        model: network providing `eval()` and
            `forward_emb(images, captions, lengths)`, either directly or
            through a `.module` attribute (e.g. a DataParallel wrapper).
        data_loader: iterable of `(images, captions, lengths, ids)` batches
            whose `dataset` attribute supports `len()`.
        log_step: a progress message is emitted every `log_step` batches.
        logging: callable that receives the progress message.

    Returns:
        Tuple `(img_embs, img_means, cap_embs, cap_lens, cap_means)`.
        `cap_lens` is a list; the other entries are NumPy arrays whose first
        axis has `len(data_loader.dataset)` entries. Every entry is `None`
        when the loader yields no batches.
    """
    batch_time = AverageMeter()
    val_logger = LogCollector()

    # switch to evaluate mode
    model.eval()

    # forward_emb lives on the wrapped network when `model` is a wrapper
    # exposing `.module`; otherwise call it on the model itself.
    encoder = getattr(model, 'module', model)

    # Bind every result up front so an empty loader returns Nones instead of
    # raising on an unbound local.
    img_embs = img_means = cap_embs = cap_lens = cap_means = None

    # First pass: longest caption, as a plain int so it is a valid array
    # dimension even when `lengths` is a tensor.
    max_n_word = 0
    for _, _, lengths, _ in data_loader:
        max_n_word = max(max_n_word, int(max(lengths)))

    n_items = len(data_loader.dataset)

    # Start timing here so the first batch does not include the pre-pass.
    end = time.time()
    with torch.no_grad():
        for i, (images, captions, lengths, ids) in enumerate(data_loader):
            # make sure val logger is used
            model.logger = val_logger
            lengths = lengths.cpu().numpy().tolist()
            images = images.cuda()
            captions = captions.cuda()

            # compute the embeddings
            img_emb, img_mean, cap_emb, cap_len, cap_mean = encoder.forward_emb(
                images, captions, lengths)

            if img_embs is None:
                # Allocate the output arrays once the feature sizes are known.
                img_embs = np.zeros((n_items,) + tuple(img_emb.shape[1:]))
                cap_embs = np.zeros((n_items, max_n_word, cap_emb.size(2)))
                img_means = np.zeros((n_items, img_mean.size(1)))
                cap_lens = [0] * n_items
                cap_means = np.zeros((n_items, cap_mean.size(1)))

            # cache embeddings
            img_embs[ids] = img_emb.data.cpu().numpy().copy()
            img_means[ids] = img_mean.data.cpu().numpy().copy()
            cap_means[ids] = cap_mean.data.cpu().numpy().copy()
            # captions shorter than max_n_word stay zero-padded on the right
            cap_embs[ids, :cap_emb.size(1), :] = cap_emb.data.cpu().numpy().copy()
            for j, nid in enumerate(ids):
                cap_lens[nid] = cap_len[j]

            # measure elapsed time; n=1 so the running average is meaningful
            batch_time.update(time.time() - end, 1)
            end = time.time()

            if i % log_step == 0:
                logging('Test: [{0}/{1}]\t'
                        '{e_log}\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        .format(
                            i, len(data_loader), batch_time=batch_time,
                            e_log=str(model.logger)))
            del images, captions
    return img_embs, img_means, cap_embs, cap_lens, cap_means
|
||
|
|
||
|
def encode_data(model, data_loader, log_step=100, logging=print):
    """Encode all images and captions loadable by `data_loader`.

    The loader is iterated twice: a first pass finds the longest caption
    length (used to size the zero-padded caption array), and a second pass,
    run under `torch.no_grad()`, calls `forward_emb` on every batch and
    copies the outputs into NumPy arrays at the positions given by the
    sample ids the loader yields.

    Args:
        model: network providing `eval()` and
            `forward_emb(images, img_lengths, captions, lengths)`, either
            directly or through a `.module` attribute (e.g. a DataParallel
            wrapper).
        data_loader: iterable of
            `(images, img_lengths, captions, lengths, ids)` batches whose
            `dataset` attribute supports `len()`.
        log_step: a progress message is emitted every `log_step` batches.
        logging: callable that receives the progress message.

    Returns:
        Tuple `(img_embs, img_means, cap_embs, cap_lens, cap_means)`.
        `cap_lens` is a list; the other entries are NumPy arrays whose first
        axis has `len(data_loader.dataset)` entries. Every entry is `None`
        when the loader yields no batches.
    """
    batch_time = AverageMeter()
    val_logger = LogCollector()

    # switch to evaluate mode
    model.eval()

    # forward_emb lives on the wrapped network when `model` is a wrapper
    # exposing `.module`; otherwise call it on the model itself.
    encoder = getattr(model, 'module', model)

    # Bind every result up front so an empty loader returns Nones instead of
    # raising on an unbound local.
    img_embs = img_means = cap_embs = cap_lens = cap_means = None

    # First pass: longest caption, as a plain int so it is a valid array
    # dimension even when `lengths` is a tensor.
    max_n_word = 0
    for _, _, _, lengths, _ in data_loader:
        max_n_word = max(max_n_word, int(max(lengths)))

    n_items = len(data_loader.dataset)

    # Start timing here so the first batch does not include the pre-pass.
    end = time.time()
    with torch.no_grad():
        for i, (images, img_lengths, captions, lengths, ids) in enumerate(data_loader):
            # make sure val logger is used
            model.logger = val_logger
            lengths = lengths.cpu().numpy().tolist()
            images = images.cuda()
            img_lengths = img_lengths.cuda()
            captions = captions.cuda()

            # compute the embeddings
            img_emb, img_mean, cap_emb, cap_len, cap_mean = encoder.forward_emb(
                images, img_lengths, captions, lengths)

            if img_embs is None:
                # Allocate the output arrays once the feature sizes are known.
                img_embs = np.zeros((n_items,) + tuple(img_emb.shape[1:]))
                cap_embs = np.zeros((n_items, max_n_word, cap_emb.size(2)))
                img_means = np.zeros((n_items, img_mean.size(1)))
                cap_lens = [0] * n_items
                cap_means = np.zeros((n_items, cap_mean.size(1)))

            # cache embeddings
            img_embs[ids] = img_emb.data.cpu().numpy().copy()
            img_means[ids] = img_mean.data.cpu().numpy().copy()
            cap_means[ids] = cap_mean.data.cpu().numpy().copy()
            # captions shorter than max_n_word stay zero-padded on the right
            cap_embs[ids, :cap_emb.size(1), :] = cap_emb.data.cpu().numpy().copy()
            for j, nid in enumerate(ids):
                cap_lens[nid] = cap_len[j]

            # measure elapsed time; n=1 so the running average is meaningful
            batch_time.update(time.time() - end, 1)
            end = time.time()

            if i % log_step == 0:
                logging('Test: [{0}/{1}]\t'
                        '{e_log}\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        .format(
                            i, len(data_loader), batch_time=batch_time,
                            e_log=str(model.logger)))
            del images, captions
    return img_embs, img_means, cap_embs, cap_lens, cap_means
|
||
|
|
||
|
def encode_data_vse(model, data_loader, log_step=10, logging=print, backbone=False):
    """Encode all images and captions loadable by `data_loader`.

    Each batch is passed through `model.forward_emb`; the resulting image
    and caption embeddings are copied into NumPy arrays at the positions
    given by the sample ids the loader yields. `model.forward_loss` is also
    called on every batch so that the model's logger records the loss.

    This function does not disable gradient tracking itself; the callers
    visible in this file wrap the call in `torch.no_grad()`.

    Args:
        model: object providing `val_start()`, `forward_emb(...)` and
            `forward_loss(img_emb, cap_emb)`.
        data_loader: iterable of batches whose `dataset` attribute supports
            `len()`. Batches are `(images, image_lengths, captions, lengths,
            ids)` when `backbone` is false and `(images, captions, lengths,
            ids)` when it is true.
        log_step: a progress message is emitted every `log_step` batches.
        logging: callable that receives the progress message.
        backbone: selects the batch layout and whether `image_lengths` is
            forwarded to `forward_emb`.

    Returns:
        Tuple `(img_embs, cap_embs)` of NumPy arrays, or `(None, None)` when
        the loader yields no batches.
    """
    batch_time = AverageMeter()
    val_logger = LogCollector()

    # switch to evaluate mode
    model.val_start()

    end = time.time()

    # np arrays to keep all the embeddings
    img_embs = None
    cap_embs = None

    n_items = len(data_loader.dataset)

    for i, data_i in enumerate(data_loader):
        # make sure val logger is used
        model.logger = val_logger

        # unpack the batch and compute the embeddings
        if not backbone:
            images, image_lengths, captions, lengths, ids = data_i
            img_emb, cap_emb = model.forward_emb(
                images, captions, lengths, image_lengths=image_lengths)
        else:
            images, captions, lengths, ids = data_i
            img_emb, cap_emb = model.forward_emb(images, captions, lengths)

        if img_embs is None:
            # Allocate the output arrays once the feature sizes are known.
            img_embs = np.zeros((n_items,) + tuple(img_emb.shape[1:]))
            cap_embs = np.zeros((n_items, cap_emb.size(1)))

        # cache embeddings
        img_embs[ids] = img_emb.data.cpu().numpy().copy()
        cap_embs[ids, :] = cap_emb.data.cpu().numpy().copy()

        # measure accuracy and record loss
        model.forward_loss(img_emb, cap_emb)

        # measure elapsed time; n=1 so the running average is meaningful
        batch_time.update(time.time() - end, 1)
        end = time.time()

        if i % log_step == 0:
            # len(data_loader) is the batch count; the former
            # `len(dataset) // batch_size + 1` over-counted by one whenever
            # the dataset size was a multiple of the batch size.
            logging('Test: [{0}/{1}]\t'
                    '{e_log}\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    .format(
                        i, len(data_loader), batch_time=batch_time,
                        e_log=str(model.logger)))

        del images, captions
    return img_embs, cap_embs
|
||
|
|
||
|
def compute_sim(images, captions):
    """Return the matrix product of `images` with the transpose of `captions`.

    For 2-D inputs of shape `(n_images, d)` and `(n_captions, d)` the result
    has shape `(n_images, n_captions)` and holds the dot product of every
    image row with every caption row.

    Args:
        images: array-like of image embeddings.
        captions: array-like of caption embeddings.

    Returns:
        NumPy array of similarities.
    """
    # np.transpose accepts any array-like, unlike the unbound
    # np.matrix.transpose method which only accepts ndarray instances.
    return np.matmul(images, np.transpose(captions))
|
||
|
|
||
|
def evalrank_vse(model_path, data_path=None, split='dev', fold5=False):
    """Evaluate a trained VSE model on one split and print the recall metrics.

    The checkpoint at `model_path` supplies both the options (`'opt'`) and
    the weights (`'model'`). All samples of the split are encoded, every
    fifth image embedding is kept, an image-by-caption similarity matrix is
    computed and the retrieval metrics from `i2t_vse` / `t2i_vse` are
    printed.

    Args:
        model_path: path of the checkpoint passed to `torch.load`.
        data_path: when not None, replaces `opt.data_path` from the
            checkpoint.
        split: dataset split handed to `get_test_loader`.
        fold5: accepted for signature compatibility; this function always
            evaluates on the full split and does not read it.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    print(opt)
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    vocab = deserialize_vocab(os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    model = VSEModel(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    opt.batch_size = 64
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.batch_size, 2, opt)

    print('Computing results...')
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    logging.info('train')
    os.makedirs(opt.logger_name, exist_ok=True)
    logger = logging.getLogger(__name__)
    model.val_start()
    with torch.no_grad():
        # compute the encoding for all the validation images and captions
        img_embs, cap_embs = encode_data_vse(
            model, data_loader, opt.log_step, backbone=opt.precomp_enc_type == 'backbone')

    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))
    # keep one image embedding out of every five consecutive rows
    img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])

    start = time.time()
    sims = compute_sim(img_embs, cap_embs)
    end = time.time()
    logger.info("calculate similarity time: {}".format(end - start))

    npts = img_embs.shape[0]
    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t_vse(npts, sims)
    # image retrieval; kept in its own names so the image-to-text report
    # below is not printed with the text-to-image mean rank
    (r1i, r5i, r10i, medri, meanri) = t2i_vse(npts, sims)

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i
    print("rsum: %.1f" % currscore)
    ar = (r1 + r5 + r10) / 3
    ari = (r1i + r5i + r10i) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
          (r1, r5, r10, medr, meanr))
    print("Average t2i Recall: %.1f" % ari)
    print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
          (r1i, r5i, r10i, medri, meanri))
|
||
|
|
||
|
|
||
|
def evalrank_f_c(data_path,dataset):
    """Evaluate the checkpoint(s) for `dataset` through `evalrank_f30k_coco`.

    Two optional switches are read from the process command line:
    `--save_results` (write the similarity matrix next to the checkpoint)
    and `--evaluate_cxc` (for COCO, run the CxC evaluation instead).

    Args:
        data_path: dataset location forwarded to `evalrank_f30k_coco`.
        dataset: `'coco'` or `'f30k'`.

    Raises:
        ValueError: if `dataset` is neither `'coco'` nor `'f30k'`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--save_results', action='store_true')
    parser.add_argument('--evaluate_cxc', action='store_true')
    # parse_known_args: this runs inside a helper, so arguments that belong
    # to the calling script must be tolerated rather than abort the process.
    opt, _ = parser.parse_known_args()
    opt.dataset = dataset
    opt.data_path = data_path

    if opt.dataset == 'coco':
        weights_bases = [
            './runs/coco_model',
        ]
    elif opt.dataset == 'f30k':
        weights_bases = [
            './runs/f30k_model',
        ]
    else:
        raise ValueError('Invalid dataset argument {}'.format(opt.dataset))

    for base in weights_bases:
        model_path = os.path.join(base, 'checkpoint.pth')
        if opt.save_results:  # Save the final results for computing ensemble results
            save_path = os.path.join(base, 'results_{}.npy'.format(opt.dataset))
        else:
            save_path = None

        if opt.dataset == 'coco':
            if not opt.evaluate_cxc:
                # Evaluate on the COCO 'test' split without 5-fold averaging;
                # save_path is forwarded so --save_results is honoured here too
                evalrank_f30k_coco(model_path, data_path=opt.data_path, split='test', fold5=False,
                                   save_path=save_path)
            else:
                # Evaluate COCO-trained models on CxC
                evalrank_f30k_coco(model_path, data_path=opt.data_path, split='testall', fold5=True, cxc=True)
        elif opt.dataset == 'f30k':
            # Evaluate Flickr30K
            evalrank_f30k_coco(model_path, data_path=opt.data_path, split='test', fold5=False, save_path=save_path)
|
||
|
|
||
|
def evalrank_f_c2(data_path,dataset):
    """Evaluate the checkpoint(s) for `dataset` through `evalrank_f30k_coco2`.

    Two optional switches are read from the process command line:
    `--save_results` (write the similarity matrix next to the checkpoint)
    and `--evaluate_cxc` (for COCO, run the CxC evaluation instead).

    Args:
        data_path: dataset location forwarded to `evalrank_f30k_coco2`.
        dataset: `'coco'` or `'f30k'`.

    Raises:
        ValueError: if `dataset` is neither `'coco'` nor `'f30k'`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--save_results', action='store_true')
    parser.add_argument('--evaluate_cxc', action='store_true')
    # parse_known_args: this runs inside a helper, so arguments that belong
    # to the calling script must be tolerated rather than abort the process.
    opt, _ = parser.parse_known_args()
    opt.dataset = dataset
    opt.data_path = data_path

    if opt.dataset == 'coco':
        weights_bases = [
            './runs/coco_model',
        ]
    elif opt.dataset == 'f30k':
        weights_bases = [
            './runs/f30k_model',
        ]
    else:
        raise ValueError('Invalid dataset argument {}'.format(opt.dataset))

    for base in weights_bases:
        model_path = os.path.join(base, 'checkpoint.pth')
        if opt.save_results:  # Save the final results for computing ensemble results
            save_path = os.path.join(base, 'results_{}.npy'.format(opt.dataset))
        else:
            save_path = None

        if opt.dataset == 'coco':
            if not opt.evaluate_cxc:
                # Evaluate on the COCO 'test' split without 5-fold averaging;
                # save_path is forwarded so --save_results is honoured here too
                evalrank_f30k_coco2(model_path, data_path=opt.data_path, split='test', fold5=False,
                                    save_path=save_path)
            else:
                # Evaluate COCO-trained models on CxC
                evalrank_f30k_coco2(model_path, data_path=opt.data_path, split='testall', fold5=True, cxc=True)
        elif opt.dataset == 'f30k':
            # Evaluate Flickr30K
            evalrank_f30k_coco2(model_path, data_path=opt.data_path, split='test', fold5=False, save_path=save_path)
|
||
|
|
||
|
def evalrank_f30k_coco2(model_path, data_path=None, split='dev', fold5=False, save_path=None, cxc=False):
    """Evaluate a trained model on either dev or test.

    If `fold5=True`, 5 fold cross-validation is done (only for MSCOCO).
    Otherwise, the full data is used for evaluation. Results are printed.

    Args:
        model_path: checkpoint passed to `torch.load`; it must hold the
            options under `'opt'` and the weights under `'model'`.
        data_path: when not None, replaces `opt.data_path` from the
            checkpoint.
        split: dataset split handed to `image_caption.get_test_loader`.
        fold5: evaluate five consecutive shards of 5000 captions each and
            print the mean metrics instead of a single full evaluation.
        save_path: when not None (and `fold5`/`cxc` are false), the
            similarity matrix and `npts` are written there with `np.save`.
        cxc: run `eval_cxc` on the embeddings instead of the recall
            evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.workers = 5

    if not hasattr(opt, 'caption_loss'):
        opt.caption_loss = False

    # load vocabulary used by the model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    vocab = tokenizer.vocab
    opt.vocab_size = len(vocab)

    # NOTE(review): hard-coded location; presumably only read by the
    # backbone encoder inside VSEModel -- confirm before relying on it.
    opt.backbone_path = '/tmp/data/weights/original_updown_backbone.pth'
    if data_path is not None:
        opt.data_path = data_path

    # construct model
    model = VSEModel(opt)

    model.make_data_parallel()
    # load model state
    model.load_state_dict(checkpoint['model'])
    model.val_start()

    print('Loading dataset')
    data_loader = image_caption.get_test_loader(split, opt.data_name, tokenizer,
                                                opt.batch_size, opt.workers, opt)

    print('Computing results...')
    with torch.no_grad():
        if opt.precomp_enc_type == 'basic':
            img_embs, cap_embs = encode_data_vse(model, data_loader)
        else:
            img_embs, cap_embs = encode_data_vse(model, data_loader, backbone=True)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if cxc:
        # opt.data_path equals data_path when one was given and otherwise
        # falls back to the checkpoint's value instead of passing None on.
        eval_cxc(img_embs, cap_embs, opt.data_path)
        return

    if not fold5:
        # no cross-validation, full evaluation
        # keep one image embedding out of every five consecutive rows
        img_embs = img_embs[::5]
        start = time.time()
        sims = compute_sim(img_embs, cap_embs)
        # stop the clock before saving so only the similarity is timed
        end = time.time()
        npts = img_embs.shape[0]

        if save_path is not None:
            np.save(save_path, {'npts': npts, 'sims': sims})
            print('Save the similarity into {}'.format(save_path))

        print("calculate similarity time: {}".format(end - start))

        r, _ = i2t_vse(npts, sims, return_ranks=True)
        ri, _ = t2i_vse(npts, sims, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            img_embs_shard = img_embs[i * 5000:(i + 1) * 5000:5]
            cap_embs_shard = cap_embs[i * 5000:(i + 1) * 5000]
            start = time.time()
            sims = compute_sim(img_embs_shard, cap_embs_shard)
            end = time.time()
            print("calculate similarity time: {}".format(end - start))

            npts = img_embs_shard.shape[0]
            r, _ = i2t_vse(npts, sims, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, _ = t2i_vse(npts, sims, return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)

            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            # row layout: 5 i2t values, 5 t2i values, then ar, ari, rsum
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[12]))
        print("Average i2t Recall: %.1f" % mean_metrics[10])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[11])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[5:10])
|
||
|
|
||
|
|
||
|
def evalrank_f30k_coco(model_path, data_path=None, split='dev', fold5=False, save_path=None, cxc=False):
    """Evaluate a trained model on either dev or test.

    If `fold5=True`, 5 fold cross-validation is done (only for MSCOCO).
    Otherwise, the full data is used for evaluation. The options stored in
    the checkpoint and all results are printed.

    Args:
        model_path: checkpoint passed to `torch.load`; it must hold the
            options under `'opt'` and the weights under `'model'`.
        data_path: when not None, replaces `opt.data_path` from the
            checkpoint.
        split: dataset split handed to `image_caption.get_test_loader`.
        fold5: evaluate five consecutive shards of 5000 captions each and
            print the mean metrics instead of a single full evaluation.
        save_path: when not None (and `fold5`/`cxc` are false), the
            similarity matrix and `npts` are written there with `np.save`.
        cxc: run `eval_cxc` on the embeddings instead of the recall
            evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.workers = 5

    if not hasattr(opt, 'caption_loss'):
        opt.caption_loss = False

    # load vocabulary used by the model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    vocab = tokenizer.vocab
    opt.vocab_size = len(vocab)

    # NOTE(review): hard-coded location; presumably only read by the
    # backbone encoder inside VSEModel -- confirm before relying on it.
    opt.backbone_path = '/tmp/data/weights/original_updown_backbone.pth'
    if data_path is not None:
        opt.data_path = data_path

    print(opt)

    # construct model
    model = VSEModel(opt)

    model.make_data_parallel()
    # load model state
    model.load_state_dict(checkpoint['model'])
    model.val_start()

    print('Loading dataset')
    data_loader = image_caption.get_test_loader(split, opt.data_name, tokenizer,
                                                opt.batch_size, opt.workers, opt)

    print('Computing results...')
    with torch.no_grad():
        if opt.precomp_enc_type == 'basic':
            img_embs, cap_embs = encode_data_vse(model, data_loader)
        else:
            img_embs, cap_embs = encode_data_vse(model, data_loader, backbone=True)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if cxc:
        # opt.data_path equals data_path when one was given and otherwise
        # falls back to the checkpoint's value instead of passing None on.
        eval_cxc(img_embs, cap_embs, opt.data_path)
        return

    if not fold5:
        # no cross-validation, full evaluation
        # keep one image embedding out of every five consecutive rows
        img_embs = img_embs[::5]
        start = time.time()
        sims = compute_sim(img_embs, cap_embs)
        # stop the clock before saving so only the similarity is timed
        end = time.time()
        npts = img_embs.shape[0]

        if save_path is not None:
            np.save(save_path, {'npts': npts, 'sims': sims})
            print('Save the similarity into {}'.format(save_path))

        print("calculate similarity time: {}".format(end - start))

        r, _ = i2t_vse(npts, sims, return_ranks=True)
        ri, _ = t2i_vse(npts, sims, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            img_embs_shard = img_embs[i * 5000:(i + 1) * 5000:5]
            cap_embs_shard = cap_embs[i * 5000:(i + 1) * 5000]
            start = time.time()
            sims = compute_sim(img_embs_shard, cap_embs_shard)
            end = time.time()
            print("calculate similarity time: {}".format(end - start))

            npts = img_embs_shard.shape[0]
            r, _ = i2t_vse(npts, sims, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, _ = t2i_vse(npts, sims, return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)

            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            # row layout: 5 i2t values, 5 t2i values, then ar, ari, rsum
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[12]))
        print("Average i2t Recall: %.1f" % mean_metrics[10])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[11])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[5:10])
|
||
|
|
||
|
def _recall_summary(r, ri):
    """Return ``(ar, ari, rsum)`` computed from the i2t and t2i metric tuples.

    Only the first three entries of each tuple (R@1, R@5, R@10) are used.
    """
    ar = (r[0] + r[1] + r[2]) / 3
    ari = (ri[0] + ri[1] + ri[2]) / 3
    rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
    return ar, ari, rsum


def _ensemble_sims(members, start, stop, shard_size):
    """Sum the similarity matrices of all ensemble members for one data slice.

    Each member is ``(model, opt, img_embs, img_means, cap_embs, cap_lens,
    cap_means)``.  Image-side arrays are taken with stride 5 over
    ``[start:stop]`` (the fold5 code path relies on the same layout: image
    rows are repeated once per caption); caption-side arrays are taken
    contiguously over the same range.
    """
    total = None
    for model, opt, img_embs, img_means, cap_embs, cap_lens, cap_means in members:
        sims = shard_xattn(model,
                           img_embs[start:stop:5], img_means[start:stop:5],
                           cap_embs[start:stop], cap_lens[start:stop],
                           cap_means[start:stop], opt, shard_size=shard_size)
        total = sims if total is None else total + sims
    return total


def _evalrank_impl(model_path, data_path, split, fold5, primary, secondary=None,
                   secondary_path=None, data_name=None, batch_size=64, workers=2):
    """Shared implementation behind all public ``evalrank*`` entry points.

    Args:
        model_path: checkpoint of the primary model.
        data_path: optional override for ``opt.data_path``.
        split: dataset split handed to the loader factory.
        fold5: if True, evaluate five consecutive folds of 5000 captions and
            report per-fold and mean metrics; otherwise evaluate all data.
        primary: ``(model_cls, loader_fn, encode_fn)`` for the primary model.
        secondary: optional second ``(model_cls, loader_fn, encode_fn)``; when
            given, its similarities are added to the primary ones.
        secondary_path: checkpoint of the secondary model.
        data_name: dataset name given to the loaders; defaults to
            ``opt.data_name`` of the primary checkpoint.
        batch_size: value written to ``opt.batch_size`` before building the
            loaders; ``None`` keeps the value stored in the checkpoint.
        workers: worker count handed to the loader factory.

    Side effects: prints metrics and writes ``ranks.pth.tar`` to the current
    directory (ranks of the full run, or of fold 0 when ``fold5`` is True).
    """
    # load model(s) and options
    specs = [primary]
    checkpoints = [torch.load(model_path)]
    if secondary is not None:
        specs.append(secondary)
        checkpoints.append(torch.load(secondary_path))
    opt = checkpoints[0]['opt']
    for ckpt in checkpoints:
        print(ckpt['opt'])
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    vocab = deserialize_vocab(os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    word2idx = vocab.word2idx
    opt.vocab_size = len(vocab)

    models = []
    for (model_cls, _, _), ckpt in zip(specs, checkpoints):
        model = torch.nn.DataParallel(model_cls(word2idx, ckpt['opt']))
        model.cuda()
        models.append(model)
    # load model state
    for model, ckpt in zip(models, checkpoints):
        model.load_state_dict(ckpt['model'])

    print('Loading dataset')
    if batch_size is not None:
        opt.batch_size = batch_size
    loader_name = opt.data_name if data_name is None else data_name
    # Every loader is built from the primary options, as in the original code.
    loaders = [loader_fn(split, loader_name, vocab, opt.batch_size, workers, opt)
               for _, loader_fn, _ in specs]

    print('Computing results...')
    members = []
    for (_, _, encode_fn), ckpt, model, loader in zip(specs, checkpoints, models, loaders):
        members.append((model, ckpt['opt']) + tuple(encode_fn(model, loader)))

    _, _, img_embs, img_means, cap_embs, cap_lens, _ = members[0]
    print(img_embs.shape, cap_embs.shape)
    print(img_means.shape)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        images = img_embs[::5]
        print(images.shape)
        start = time.time()
        # img_means is subsampled together with img_embs inside
        # _ensemble_sims so that both describe the same image rows.
        sims = _ensemble_sims(members, 0, None, shard_size=500)
        print(sims.shape)
        end = time.time()
        print("calculate similarity time:", end - start)

        r, rt = i2t(images, cap_embs, cap_lens, sims, return_ranks=True)
        ri, rti = t2i(images, cap_embs, cap_lens, sims, return_ranks=True)
        ar, ari, rsum = _recall_summary(r, ri)
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
        # Debug aid: top-10 retrieved caption indices for the first images.
        for ele in sims[:10]:
            print(np.argsort(ele)[::-1][:10])
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for fold in range(5):
            lo, hi = fold * 5000, (fold + 1) * 5000
            start = time.time()
            sims = _ensemble_sims(members, lo, hi, shard_size=128)
            end = time.time()
            print("calculate similarity time:", end - start)

            images = img_embs[lo:hi:5]
            r, rt0 = i2t(images, cap_embs[lo:hi], cap_lens[lo:hi], sims, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(images, cap_embs[lo:hi], cap_lens[lo:hi], sims, return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)

            if fold == 0:
                # only the ranks of the first fold are persisted
                rt, rti = rt0, rti0
            ar, ari, rsum = _recall_summary(r, ri)
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        # Row layout: r[0:5], ri[5:10], ar[10], ari[11], rsum[12].
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % mean_metrics[12])
        print("Average i2t Recall: %.1f" % mean_metrics[10])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[11])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')


def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained SCAN model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    _evalrank_impl(model_path, data_path, split, fold5,
                   primary=(SCAN, get_test_loader, encode_data))


def evalrank_fanhua(model_path, data_path=None, split='dev', fold5=False):
    """
    Same as `evalrank`, but the test loader is always built for the
    "f30k_precomp" dataset regardless of the dataset stored in the checkpoint
    (presumably a cross-dataset generalization test -- confirm with the author).
    """
    _evalrank_impl(model_path, data_path, split, fold5,
                   primary=(SCAN, get_test_loader, encode_data),
                   data_name="f30k_precomp")


def evalrank2(model_path, data_path=None, split='dev', fold5=False,
              second_model_path=None):
    """
    Evaluate the sum of similarities of a SCAN model (`model_path`) and a
    SCAN2 model (`second_model_path`). If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.

    `second_model_path` defaults to the checkpoint that used to be hard-coded.
    """
    if second_model_path is None:
        second_model_path = "../bcan_gpo/runs/bigru_adam_bcan_mean/model_best.pth.tar"
    _evalrank_impl(model_path, data_path, split, fold5,
                   primary=(SCAN, get_test_loader, encode_data),
                   secondary=(SCAN2, get_test_loader2, encode_data2),
                   secondary_path=second_model_path, workers=0)


def evalrank_avgpool(model_path, data_path=None, split='dev', fold5=False,
                     second_model_path=None):
    """
    Ensemble evaluation of a SCAN model with a SCAN2 model; identical to
    `evalrank2` except for the default second checkpoint.
    """
    if second_model_path is None:
        second_model_path = "../bcan_gpo/runs/bigru_adam_bcan_mean2/model_best.pth.tar"
    _evalrank_impl(model_path, data_path, split, fold5,
                   primary=(SCAN, get_test_loader, encode_data),
                   secondary=(SCAN2, get_test_loader2, encode_data2),
                   secondary_path=second_model_path, workers=0)


def evalrank_maxpool(model_path, data_path=None, split='dev', fold5=False,
                     second_model_path=None):
    """
    Ensemble evaluation of a SCAN model with a SCAN3 model; the second model
    is encoded with `encode_data2` on a `get_test_loader2` loader.
    """
    if second_model_path is None:
        second_model_path = "../bcan_gpo/runs/bigru_bcan_adam_max_36/model_best.pth.tar"
    _evalrank_impl(model_path, data_path, split, fold5,
                   primary=(SCAN, get_test_loader, encode_data),
                   secondary=(SCAN3, get_test_loader2, encode_data2),
                   secondary_path=second_model_path, workers=0)


def evalrank3(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a single SCAN2 model using `get_test_loader2` / `encode_data2`.
    The batch size stored in the checkpoint options is used unchanged.
    If `fold5=True`, 5 fold cross-validation is done (only for MSCOCO).
    Otherwise, the full data is used for evaluation.
    """
    _evalrank_impl(model_path, data_path, split, fold5,
                   primary=(SCAN2, get_test_loader2, encode_data2),
                   batch_size=None, workers=0)
|
||
|
|
||
|
def shard_xattn(model, images, img_means, captions, caplens, cap_means, opt, shard_size=128):
    """
    Compute the pairwise image-caption similarity matrix with sharding.

    The inputs are processed in tiles of at most `shard_size` images by
    `shard_size` captions so that only one tile is on the device at a time.

    Args:
        model: wrapper exposing `model.module.forward_sim(im, im_m, s, l, s_m)`.
        images, img_means: numpy arrays indexed by image along axis 0.
        captions, cap_means: numpy arrays indexed by caption along axis 0.
        caplens: per-caption lengths, sliceable along the caption axis.
        opt: unused; kept for interface compatibility.
        shard_size: maximum number of images / captions per tile.

    Returns:
        A `(len(images), len(captions))` numpy array of similarities.
    """
    n_images, n_captions = len(images), len(captions)

    # Place inputs on the device that holds the model; fall back to CUDA when
    # available if the model does not expose any parameter.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    sims = np.zeros((n_images, n_captions))
    # Inference only: no_grad replaces the removed `volatile=True` flag.
    with torch.no_grad():
        for i, im_start in enumerate(range(0, n_images, shard_size)):
            im_end = min(im_start + shard_size, n_images)
            im = torch.from_numpy(images[im_start:im_end]).float().to(device)
            im_m = torch.from_numpy(img_means[im_start:im_end]).float().to(device)
            for j, cap_start in enumerate(range(0, n_captions, shard_size)):
                sys.stdout.write('\r>> shard_xattn_t2i batch (%d,%d)' % (i, j))
                cap_end = min(cap_start + shard_size, n_captions)
                s_m = torch.from_numpy(cap_means[cap_start:cap_end]).float().to(device)
                s = torch.from_numpy(captions[cap_start:cap_end]).float().to(device)
                lengths = caplens[cap_start:cap_end]
                sim = model.module.forward_sim(im, im_m, s, lengths, s_m)
                sims[im_start:im_end, cap_start:cap_end] = sim.detach().cpu().numpy()
    sys.stdout.write('\n')
    return sims
|
||
|
|
||
|
|
||
|
def i2t(images, captions, caplens, sims, npts=None, return_ranks=False):
    """
    Images->Text (Image Annotation)

    Row `index` of `sims` is scored against its five ground-truth captions,
    which are columns `5 * index` .. `5 * index + 4`.

    Args:
        images: array whose first dimension is the number of images N.
        captions: unused; kept for interface compatibility.
        caplens: unused; kept for interface compatibility.
        sims: (N, 5N) matrix of image-caption similarities.
        npts: number of images to score; defaults to `images.shape[0]`.
        return_ranks: if True, also return `(ranks, top1)`.

    Returns:
        `(r1, r5, r10, medr, meanr)`, optionally followed by `(ranks, top1)`
        where `ranks[k]` is the 0-based rank of the best ground-truth caption
        of image k and `top1[k]` the index of its highest-scoring caption.
    """
    if npts is None:
        npts = images.shape[0]
    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    for index in range(npts):
        inds = np.argsort(sims[index])[::-1]
        # Inverse permutation: positions[c] is the rank of caption c.
        positions = np.empty_like(inds)
        positions[inds] = np.arange(inds.size)
        ranks[index] = positions[5 * index:5 * index + 5].min()
        top1[index] = inds[0]

    # Compute metrics
    r1 = 100.0 * np.count_nonzero(ranks < 1) / len(ranks)
    r5 = 100.0 * np.count_nonzero(ranks < 5) / len(ranks)
    r10 = 100.0 * np.count_nonzero(ranks < 10) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, medr, meanr), (ranks, top1)
    return (r1, r5, r10, medr, meanr)
|
||
|
|
||
|
|
||
|
def t2i(images, captions, caplens, sims, npts=None, return_ranks=False):
    """
    Text->Images (Image Search)

    Caption `c` (column `c` of `sims`) is scored against its ground-truth
    image `c // 5`.

    Args:
        images: array whose first dimension is the number of images N.
        captions: unused; kept for interface compatibility.
        caplens: unused; kept for interface compatibility.
        sims: (N, 5N) matrix of image-caption similarities.
        npts: number of images whose captions are scored; defaults to
            `images.shape[0]`.
        return_ranks: if True, also return `(ranks, top1)`.

    Returns:
        `(r1, r5, r10, medr, meanr)`, optionally followed by `(ranks, top1)`
        where `ranks[c]` is the 0-based rank of the ground-truth image for
        caption c and `top1[c]` the index of its highest-scoring image.
    """
    if npts is None:
        npts = images.shape[0]
    n_caps = 5 * npts
    ranks = np.zeros(n_caps)
    top1 = np.zeros(n_caps)

    # --> (5N(caption), N(image))
    sims = sims.T

    for cap in range(n_caps):
        inds = np.argsort(sims[cap])[::-1]
        ranks[cap] = np.where(inds == cap // 5)[0][0]
        top1[cap] = inds[0]

    # Compute metrics
    r1 = 100.0 * np.count_nonzero(ranks < 1) / len(ranks)
    r5 = 100.0 * np.count_nonzero(ranks < 5) / len(ranks)
    r10 = 100.0 * np.count_nonzero(ranks < 10) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, medr, meanr), (ranks, top1)
    return (r1, r5, r10, medr, meanr)
|
||
|
|
||
|
def i2t_shuffle(images, captions, caplens, sims, npts=None, return_ranks=False,
                caps_per_image=20):
    """
    Images->Text (Image Annotation)

    For every image, sorts all captions by decreasing similarity and records
    the best (smallest) 0-based position reached by any of the image's own
    captions. Captions ``caps_per_image * k`` to
    ``caps_per_image * k + caps_per_image - 1`` are treated as belonging to
    image ``k``.

    Args:
        images: object whose ``shape[0]`` gives the number of images N; nothing
            else is read from it.
        captions: unused, kept for interface compatibility.
        caplens: unused, kept for interface compatibility.
        sims: similarity array indexed as ``sims[image][caption]``.
        npts: ignored; the value is always replaced by ``images.shape[0]``.
        return_ranks: when true, also return the per-image ranks and the index
            of the best-scoring caption for each image.
        caps_per_image: number of consecutive captions that describe one
            image (default 20, the previously hard-coded value).

    Returns:
        ``(r1, r5, r10, medr, meanr)`` where ``r1``/``r5``/``r10`` are the
        percentage of images whose best caption is ranked below 1/5/10
        (0-based) and ``medr``/``meanr`` are the 1-based median and mean rank.
        With ``return_ranks`` the result is ``(metrics, (ranks, top1))``.

    Raises:
        ValueError: if ``caps_per_image`` is smaller than 1.
    """
    if caps_per_image < 1:
        raise ValueError("caps_per_image must be >= 1")

    # The npts argument is deliberately not consulted (historical behaviour).
    npts = images.shape[0]
    ranks = np.zeros(npts)
    top1 = np.zeros(npts)

    for index in range(npts):
        # Caption indices ordered from most to least similar to this image.
        order = np.argsort(sims[index])[::-1]

        # Score: best position among this image's ground-truth captions.
        first = caps_per_image * index
        ranks[index] = min(
            np.where(order == cap)[0][0]
            for cap in range(first, first + caps_per_image)
        )
        top1[index] = order[0]

    # Compute metrics
    total = len(ranks)
    r1 = 100.0 * np.count_nonzero(ranks < 1) / total
    r5 = 100.0 * np.count_nonzero(ranks < 5) / total
    r10 = 100.0 * np.count_nonzero(ranks < 10) / total
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    metrics = (r1, r5, r10, medr, meanr)
    if return_ranks:
        return metrics, (ranks, top1)
    return metrics
|
||
|
|
||
|
|
||
|
def t2i_shuffle(images, captions, caplens, sims, npts=None, return_ranks=False,
                caps_per_image=20):
    """
    Text->Images (Image Search)

    For every caption, sorts all images by decreasing similarity and records
    the 0-based position of the caption's own image. Caption number ``c`` is
    treated as belonging to image ``c // caps_per_image``.

    Args:
        images: object whose ``shape[0]`` gives the number of images N; nothing
            else is read from it.
        captions: unused, kept for interface compatibility.
        caplens: unused, kept for interface compatibility.
        sims: similarity array indexed as ``sims[image, caption]``; must
            support ``.T``. The first ``caps_per_image * N`` captions are
            scored.
        npts: ignored; the value is always replaced by ``images.shape[0]``.
        return_ranks: when true, also return the per-caption ranks and the
            index of the best-scoring image for each caption.
        caps_per_image: number of consecutive captions that describe one
            image (default 20, the previously hard-coded value).

    Returns:
        ``(r1, r5, r10, medr, meanr)`` where ``r1``/``r5``/``r10`` are the
        percentage of captions whose image is ranked below 1/5/10 (0-based)
        and ``medr``/``meanr`` are the 1-based median and mean rank. With
        ``return_ranks`` the result is ``(metrics, (ranks, top1))``.

    Raises:
        ValueError: if ``caps_per_image`` is smaller than 1.
    """
    if caps_per_image < 1:
        raise ValueError("caps_per_image must be >= 1")

    # The npts argument is deliberately not consulted (historical behaviour).
    npts = images.shape[0]
    n_caps = caps_per_image * npts
    ranks = np.zeros(n_caps)
    top1 = np.zeros(n_caps)

    # --> (caption, image): one row per caption query
    sims = sims.T

    for cap in range(n_caps):
        # Image indices ordered from most to least similar to this caption.
        order = np.argsort(sims[cap])[::-1]
        ranks[cap] = np.where(order == cap // caps_per_image)[0][0]
        top1[cap] = order[0]

    # Compute metrics
    total = len(ranks)
    r1 = 100.0 * np.count_nonzero(ranks < 1) / total
    r5 = 100.0 * np.count_nonzero(ranks < 5) / total
    r10 = 100.0 * np.count_nonzero(ranks < 10) / total
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    metrics = (r1, r5, r10, medr, meanr)
    if return_ranks:
        return metrics, (ranks, top1)
    return metrics
|
||
|
|
||
|
def i2t_vse(npts, sims, return_ranks=False, mode='coco'):
    """
    Images->Text (Image Annotation)

    For each of the first ``npts`` images, sorts all captions by decreasing
    similarity and records the best (smallest) 0-based position reached by
    any of the image's own captions.

    Args:
        npts: number of image queries to score.
        sims: similarity array indexed as ``sims[image][caption]``.
        return_ranks: when true, also return the per-image ranks and the index
            of the best-scoring caption for each image.
        mode: ``'coco'`` treats captions ``5 * k`` .. ``5 * k + 4`` as the
            matches of image ``k``; any other value treats caption ``k`` as
            the single match of image ``k``.

    Returns:
        ``(r1, r5, r10, medr, meanr)`` where ``r1``/``r5``/``r10`` are the
        percentage of images whose best caption is ranked below 1/5/10
        (0-based) and ``medr``/``meanr`` are the 1-based median and mean rank.
        With ``return_ranks`` the result is ``(metrics, (ranks, top1))``.
    """
    # Both modes share one code path; they only differ in how many
    # consecutive captions belong to each image.
    group = 5 if mode == 'coco' else 1

    ranks = np.zeros(npts)
    top1 = np.zeros(npts)

    for index in range(npts):
        # Caption indices ordered from most to least similar to this image.
        order = np.argsort(sims[index])[::-1]
        first = group * index
        ranks[index] = min(
            np.where(order == cap)[0][0]
            for cap in range(first, first + group)
        )
        top1[index] = order[0]

    # Compute metrics
    total = len(ranks)
    r1 = 100.0 * np.count_nonzero(ranks < 1) / total
    r5 = 100.0 * np.count_nonzero(ranks < 5) / total
    r10 = 100.0 * np.count_nonzero(ranks < 10) / total
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    metrics = (r1, r5, r10, medr, meanr)
    if return_ranks:
        return metrics, (ranks, top1)
    return metrics
|
||
|
|
||
|
|
||
|
def t2i_vse(npts, sims, return_ranks=False, mode='coco'):
    """
    Text->Images (Image Search)

    For every scored caption, sorts all images by decreasing similarity and
    records the 0-based position of the caption's own image.

    Args:
        npts: number of images; determines how many captions are scored
            (``5 * npts`` in ``'coco'`` mode, ``npts`` otherwise).
        sims: similarity array indexed as ``sims[image, caption]``; must
            support ``.T``.
        return_ranks: when true, also return the per-caption ranks and the
            index of the best-scoring image for each caption.
        mode: ``'coco'`` treats caption ``c`` as belonging to image
            ``c // 5``; any other value treats caption ``c`` as belonging to
            image ``c``.

    Returns:
        ``(r1, r5, r10, medr, meanr)`` where ``r1``/``r5``/``r10`` are the
        percentage of captions whose image is ranked below 1/5/10 (0-based)
        and ``medr``/``meanr`` are the 1-based median and mean rank. With
        ``return_ranks`` the result is ``(metrics, (ranks, top1))``.
    """
    # Both modes share one code path; they only differ in how many
    # consecutive captions belong to each image.
    group = 5 if mode == 'coco' else 1
    n_caps = group * npts

    ranks = np.zeros(n_caps)
    top1 = np.zeros(n_caps)

    # --> (caption, image): one row per caption query
    sims = sims.T

    for cap in range(n_caps):
        # Image indices ordered from most to least similar to this caption.
        order = np.argsort(sims[cap])[::-1]
        ranks[cap] = np.where(order == cap // group)[0][0]
        top1[cap] = order[0]

    # Compute metrics
    total = len(ranks)
    r1 = 100.0 * np.count_nonzero(ranks < 1) / total
    r5 = 100.0 * np.count_nonzero(ranks < 5) / total
    r10 = 100.0 * np.count_nonzero(ranks < 10) / total
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    metrics = (r1, r5, r10, medr, meanr)
    if return_ranks:
        return metrics, (ranks, top1)
    return metrics
|