diff --git a/eval.py b/eval.py
index 3e9359a09..27a2d0d3f 100644
--- a/eval.py
+++ b/eval.py
@@ -7,6 +7,11 @@
 from nltk.translate.bleu_score import corpus_bleu
 import torch.nn.functional as F
 from tqdm import tqdm
+# score evaluation packages from the COCO caption API
+from evalfunc.bleu.bleu import Bleu
+from evalfunc.rouge.rouge import Rouge
+from evalfunc.cider.cider import Cider
+from evalfunc.meteor.meteor import Meteor
 
 # Parameters
 data_folder = '/media/ssd/caption data'  # folder with data files saved by create_input_files.py
@@ -168,12 +173,30 @@ def evaluate(beam_size):
 
         assert len(references) == len(hypotheses)
 
-    # Calculate BLEU-4 scores
-    bleu4 = corpus_bleu(references, hypotheses)
-
-    return bleu4
+    # Calculate BLEU, CIDEr, METEOR, and ROUGE-L scores
+    scorers = [
+        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
+        (Cider(), "CIDEr"),
+        (Meteor(), "METEOR"),
+        (Rouge(), "ROUGE_L")
+    ]
+
+    hypo = [[' '.join(hypo)] for hypo in [[str(x) for x in hypo] for hypo in hypotheses]]
+    ref = [[' '.join(reft) for reft in reftmp] for reftmp in [[[str(x) for x in reft] for reft in reftmp] for reftmp in references]]
+
+    score = []
+    method = []
+    for scorer, method_i in scorers:
+        score_i, scores_i = scorer.compute_score(ref, hypo)
+        score.extend(score_i) if isinstance(score_i, list) else score.append(score_i)
+        method.extend(method_i) if isinstance(method_i, list) else method.append(method_i)
+    score_dict = dict(zip(method, score))
+
+    return score_dict
 
 
 if __name__ == '__main__':
     beam_size = 1
-    print("\nBLEU-4 score @ beam size of %d is %.4f." % (beam_size, evaluate(beam_size)))
+    score_dict = evaluate(beam_size)
+    for method, score in score_dict.items():
+        print("%s score @ beam size of %d is %.4f" % (method, beam_size, score))
diff --git a/evalfunc/bleu/LICENSE b/evalfunc/bleu/LICENSE
new file mode 100644
index 000000000..9ccf67790
--- /dev/null
+++ b/evalfunc/bleu/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
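Note on the eval.py hunk above: the COCO-caption scorers consume whitespace-tokenized caption strings, so evaluate() first flattens its word-index sequences into space-joined strings. Each hypothesis becomes a one-element list holding a single string, and each reference entry becomes a list of such strings (one per ground-truth caption). A minimal sketch of that reshaping with hypothetical toy token IDs (not the real beam-search output):

    # Toy word-index sequences, stand-ins for the sequences decoded in evaluate().
    hypotheses = [[3, 17, 42], [5, 9]]
    references = [[[3, 17, 42], [3, 17, 8]], [[5, 9, 2]]]

    # Equivalent to the comprehensions in the hunk above, written out step by step.
    hypo = [[' '.join(str(t) for t in h)] for h in hypotheses]
    ref = [[' '.join(str(t) for t in r) for r in refs] for refs in references]
    # hypo -> [['3 17 42'], ['5 9']]
    # ref  -> [['3 17 42', '3 17 8'], ['5 9 2']]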
diff --git a/evalfunc/bleu/__init__.py b/evalfunc/bleu/__init__.py new file mode 100644 index 000000000..3f7d85bba --- /dev/null +++ b/evalfunc/bleu/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/evalfunc/bleu/__pycache__/__init__.cpython-36.pyc b/evalfunc/bleu/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 000000000..4d926fd03 Binary files /dev/null and b/evalfunc/bleu/__pycache__/__init__.cpython-36.pyc differ diff --git a/evalfunc/bleu/__pycache__/bleu.cpython-36.pyc b/evalfunc/bleu/__pycache__/bleu.cpython-36.pyc new file mode 100644 index 000000000..37fda478b Binary files /dev/null and b/evalfunc/bleu/__pycache__/bleu.cpython-36.pyc differ diff --git a/evalfunc/bleu/__pycache__/bleu_scorer.cpython-36.pyc b/evalfunc/bleu/__pycache__/bleu_scorer.cpython-36.pyc new file mode 100644 index 000000000..7dd40009f Binary files /dev/null and b/evalfunc/bleu/__pycache__/bleu_scorer.cpython-36.pyc differ diff --git a/evalfunc/bleu/bleu.py b/evalfunc/bleu/bleu.py new file mode 100644 index 000000000..c1bd0b4c6 --- /dev/null +++ b/evalfunc/bleu/bleu.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# +# File Name : bleu.py +# +# Description : Wrapper for BLEU scorer. +# +# Creation Date : 06-01-2015 +# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT +# Authors : Hao Fang and Tsung-Yi Lin + +from .bleu_scorer import BleuScorer + + +class Bleu: + def __init__(self, n=4): + # default compute Blue score up to 4 + self._n = n + self._hypo_for_image = {} + self.ref_for_image = {} + + def compute_score(self, gts, res): + + bleu_scorer = BleuScorer(n=self._n) + for i in range(len(res)): + hypo = res[i] + ref = gts[i] + + # Sanity check. + assert(type(hypo) is list) + assert(len(hypo) == 1) + assert(type(ref) is list) + assert(len(ref) >= 1) + + bleu_scorer += (hypo[0], ref) + + #score, scores = bleu_scorer.compute_score(option='shortest') + score, scores = bleu_scorer.compute_score(option='closest', verbose=1) + #score, scores = bleu_scorer.compute_score(option='average', verbose=1) + + # return (bleu, bleu_info) + return score, scores + + def method(self): + return "Bleu" diff --git a/evalfunc/bleu/bleu_scorer.py b/evalfunc/bleu/bleu_scorer.py new file mode 100644 index 000000000..ec8220847 --- /dev/null +++ b/evalfunc/bleu/bleu_scorer.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python + +# bleu_scorer.py +# David Chiang + +# Copyright (c) 2004-2006 University of Maryland. All rights +# reserved. Do not redistribute without permission from the +# author. Not for commercial use. + +# Modified by: +# Hao Fang +# Tsung-Yi Lin + +'''Provides: +cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). +cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). +''' + +import copy +import sys, math, re +from collections import defaultdict + +def precook(s, n=4, out=False): + """Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. 
This is optional: cook_refs and cook_test + can take string arguments as well.""" + words = s.split() + counts = defaultdict(int) + for k in range(1,n+1): + for i in range(len(words)-k+1): + ngram = tuple(words[i:i+k]) + counts[ngram] += 1 + return (len(words), counts) + +def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them.''' + + reflen = [] + maxcounts = {} + for ref in refs: + rl, counts = precook(ref, n) + reflen.append(rl) + for (ngram,count) in counts.items(): + maxcounts[ngram] = max(maxcounts.get(ngram,0), count) + + # Calculate effective reference sentence length. + if eff == "shortest": + reflen = min(reflen) + elif eff == "average": + reflen = float(sum(reflen))/len(reflen) + + ## lhuang: N.B.: leave reflen computaiton to the very end!! + + ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) + + return (reflen, maxcounts) + +def cook_test(test, xxx_todo_changeme, eff=None, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it.''' + (reflen, refmaxcounts) = xxx_todo_changeme + testlen, counts = precook(test, n, True) + + result = {} + + # Calculate effective reference sentence length. + + if eff == "closest": + result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] + else: ## i.e., "average" or "shortest" or None + result["reflen"] = reflen + + result["testlen"] = testlen + + result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)] + + result['correct'] = [0]*n + for (ngram, count) in counts.items(): + result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) + + return result + +class BleuScorer(object): + """Bleu scorer. + """ + + __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" + # special_reflen is used in oracle (proportional effective ref len for a node). 
+ + def copy(self): + ''' copy the refs.''' + new = BleuScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + new._score = None + return new + + def __init__(self, test=None, refs=None, n=4, special_reflen=None): + ''' singular instance ''' + + self.n = n + self.crefs = [] + self.ctest = [] + self.cook_append(test, refs) + self.special_reflen = special_reflen + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + cooked_test = cook_test(test, self.crefs[-1]) + self.ctest.append(cooked_test) ## N.B.: -1 + else: + self.ctest.append(None) # lens of crefs and ctest have to match + + self._score = None ## need to recompute + + def ratio(self, option=None): + self.compute_score(option=option) + return self._ratio + + def score_ratio(self, option=None): + '''return (bleu, len_ratio) pair''' + return (self.fscore(option=option), self.ratio(option=option)) + + def score_ratio_str(self, option=None): + return "%.4f (%.2f)" % self.score_ratio(option) + + def reflen(self, option=None): + self.compute_score(option=option) + return self._reflen + + def testlen(self, option=None): + self.compute_score(option=option) + return self._testlen + + def retest(self, new_test): + if type(new_test) is str: + new_test = [new_test] + assert len(new_test) == len(self.crefs), new_test + self.ctest = [] + for t, rs in zip(new_test, self.crefs): + self.ctest.append(cook_test(t, rs)) + self._score = None + + return self + + def rescore(self, new_test): + ''' replace test(s) with new test(s), and returns the new score.''' + + return self.retest(new_test).compute_score() + + def size(self): + assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + ## avoid creating new BleuScorer instances + self.cook_append(other[0], other[1]) + else: + assert self.compatible(other), "incompatible BLEUs." 
+ self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + self._score = None ## need to recompute + + return self + + def compatible(self, other): + return isinstance(other, BleuScorer) and self.n == other.n + + def single_reflen(self, option="average"): + return self._single_reflen(self.crefs[0][0], option) + + def _single_reflen(self, reflens, option=None, testlen=None): + + if option == "shortest": + reflen = min(reflens) + elif option == "average": + reflen = float(sum(reflens))/len(reflens) + elif option == "closest": + reflen = min((abs(l-testlen), l) for l in reflens)[1] + else: + assert False, "unsupported reflen option %s" % option + + return reflen + + def recompute_score(self, option=None, verbose=0): + self._score = None + return self.compute_score(option, verbose) + + def compute_score(self, option=None, verbose=0): + n = self.n + small = 1e-9 + tiny = 1e-15 ## so that if guess is 0 still return 0 + bleu_list = [[] for _ in range(n)] + + if self._score is not None: + return self._score + + if option is None: + option = "average" if len(self.crefs) == 1 else "closest" + + self._testlen = 0 + self._reflen = 0 + totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} + + # for each sentence + for comps in self.ctest: + testlen = comps['testlen'] + self._testlen += testlen + + if self.special_reflen is None: ## need computation + reflen = self._single_reflen(comps['reflen'], option, testlen) + else: + reflen = self.special_reflen + + self._reflen += reflen + + for key in ['guess','correct']: + for k in range(n): + totalcomps[key][k] += comps[key][k] + + # append per image bleu score + bleu = 1. + for k in range(n): + bleu *= (float(comps['correct'][k]) + tiny) \ + /(float(comps['guess'][k]) + small) + bleu_list[k].append(bleu ** (1./(k+1))) + ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleu_list[k][-1] *= math.exp(1 - 1/ratio) + + # if verbose > 1: + # print(comps, reflen) + + totalcomps['reflen'] = self._reflen + totalcomps['testlen'] = self._testlen + + bleus = [] + bleu = 1. 
+ for k in range(n): + bleu *= float(totalcomps['correct'][k] + tiny) \ + / (totalcomps['guess'][k] + small) + bleus.append(bleu ** (1./(k+1))) + ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleus[k] *= math.exp(1 - 1/ratio) + + # if verbose > 0: + # print(totalcomps) + # print("ratio:", ratio) + + self._score = bleus + return self._score, bleu_list diff --git a/evalfunc/cider/__init__.py b/evalfunc/cider/__init__.py new file mode 100644 index 000000000..3f7d85bba --- /dev/null +++ b/evalfunc/cider/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/evalfunc/cider/__pycache__/__init__.cpython-36.pyc b/evalfunc/cider/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 000000000..e94722c72 Binary files /dev/null and b/evalfunc/cider/__pycache__/__init__.cpython-36.pyc differ diff --git a/evalfunc/cider/__pycache__/cider.cpython-36.pyc b/evalfunc/cider/__pycache__/cider.cpython-36.pyc new file mode 100644 index 000000000..1e828105d Binary files /dev/null and b/evalfunc/cider/__pycache__/cider.cpython-36.pyc differ diff --git a/evalfunc/cider/__pycache__/cider_scorer.cpython-36.pyc b/evalfunc/cider/__pycache__/cider_scorer.cpython-36.pyc new file mode 100644 index 000000000..4d72af8dd Binary files /dev/null and b/evalfunc/cider/__pycache__/cider_scorer.cpython-36.pyc differ diff --git a/evalfunc/cider/cider.py b/evalfunc/cider/cider.py new file mode 100644 index 000000000..e339e1753 --- /dev/null +++ b/evalfunc/cider/cider.py @@ -0,0 +1,51 @@ +# Filename: cider.py +# +# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric +# by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) +# +# Creation Date: Sun Feb 8 14:16:54 2015 +# +# Authors: Ramakrishna Vedantam and Tsung-Yi Lin + +from evalfunc.cider.cider_scorer import CiderScorer +import pdb + +class Cider: + """ + Main Class to compute the CIDEr metric + + """ + def __init__(self, test=None, refs=None, n=4, sigma=6.0): + # set cider to sum over 1 to 4-grams + self._n = n + # set the standard deviation parameter for gaussian penalty + self._sigma = sigma + + def compute_score(self, gts, res): + """ + Main function to compute CIDEr score + :param hypo_for_image (dict) : dictionary with key and value + ref_for_image (dict) : dictionary with key and value + :return: cider (float) : computed CIDEr score for the corpus + """ + + cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) + + for i in range(len(res)): + hypo = res[i] + ref = gts[i] + + # Sanity check. + assert(type(hypo) is list) + assert(len(hypo) == 1) + assert(type(ref) is list) + assert(len(ref) > 0) + + cider_scorer += (hypo[0], ref) + + (score, scores) = cider_scorer.compute_score() + + return score, scores + + def method(self): + return "CIDEr" \ No newline at end of file diff --git a/evalfunc/cider/cider_scorer.py b/evalfunc/cider/cider_scorer.py new file mode 100644 index 000000000..33579d91d --- /dev/null +++ b/evalfunc/cider/cider_scorer.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python +# Tsung-Yi Lin +# Ramakrishna Vedantam + +import copy +from collections import defaultdict +import numpy as np +import pdb +import math + +def precook(s, n=4, out=False): + """ + Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. This is optional: cook_refs and cook_test + can take string arguments as well. 
+ :param s: string : sentence to be converted into ngrams + :param n: int : number of ngrams for which representation is calculated + :return: term frequency vector for occuring ngrams + """ + words = s.split() + counts = defaultdict(int) + for k in range(1,n+1): + for i in range(len(words)-k+1): + ngram = tuple(words[i:i+k]) + counts[ngram] += 1 + return counts + +def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them. + :param refs: list of string : reference sentences for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (list of dict) + ''' + return [precook(ref, n) for ref in refs] + +def cook_test(test, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it. + :param test: list of string : hypothesis sentence for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (dict) + ''' + return precook(test, n, True) + +class CiderScorer(object): + """CIDEr scorer. + """ + + def copy(self): + ''' copy the refs.''' + new = CiderScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + return new + + def __init__(self, test=None, refs=None, n=4, sigma=6.0): + ''' singular instance ''' + self.n = n + self.sigma = sigma + self.crefs = [] + self.ctest = [] + self.document_frequency = defaultdict(float) + self.cook_append(test, refs) + self.ref_len = None + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + self.ctest.append(cook_test(test)) ## N.B.: -1 + else: + self.ctest.append(None) # lens of crefs and ctest have to match + + def size(self): + assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + ## avoid creating new CiderScorer instances + self.cook_append(other[0], other[1]) + else: + self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + + return self + def compute_doc_freq(self): + ''' + Compute term frequency for reference data. + This will be used to compute idf (inverse document frequency later) + The term frequency is stored in the object + :return: None + ''' + for refs in self.crefs: + # refs, k ref captions of one image + for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): + self.document_frequency[ngram] += 1 + # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) + + def compute_cider(self): + def counts2vec(cnts): + """ + Function maps counts of ngram to vector of tfidf weights. + The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. + The n-th entry of array denotes length of n-grams. 
+ :param cnts: + :return: vec (array of dict), norm (array of float), length (int) + """ + vec = [defaultdict(float) for _ in range(self.n)] + length = 0 + norm = [0.0 for _ in range(self.n)] + for (ngram,term_freq) in cnts.items(): + # give word count 1 if it doesn't appear in reference corpus + df = np.log(max(1.0, self.document_frequency[ngram])) + # ngram index + n = len(ngram)-1 + # tf (term_freq) * idf (precomputed idf) for n-grams + vec[n][ngram] = float(term_freq)*(self.ref_len - df) + # compute norm for the vector. the norm will be used for computing similarity + norm[n] += pow(vec[n][ngram], 2) + + if n == 1: + length += term_freq + norm = [np.sqrt(n) for n in norm] + return vec, norm, length + + def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): + ''' + Compute the cosine similarity of two vectors. + :param vec_hyp: array of dictionary for vector corresponding to hypothesis + :param vec_ref: array of dictionary for vector corresponding to reference + :param norm_hyp: array of float for vector corresponding to hypothesis + :param norm_ref: array of float for vector corresponding to reference + :param length_hyp: int containing length of hypothesis + :param length_ref: int containing length of reference + :return: array of score for each n-grams cosine similarity + ''' + delta = float(length_hyp - length_ref) + # measure consine similarity + val = np.array([0.0 for _ in range(self.n)]) + for n in range(self.n): + # ngram + for (ngram,count) in vec_hyp[n].items(): + # vrama91 : added clipping + val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] + + if (norm_hyp[n] != 0) and (norm_ref[n] != 0): + val[n] /= (norm_hyp[n]*norm_ref[n]) + + assert(not math.isnan(val[n])) + # vrama91: added a length based gaussian penalty + val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) + return val + + # compute log reference length + self.ref_len = np.log(float(len(self.crefs))) + + scores = [] + for test, refs in zip(self.ctest, self.crefs): + # compute vector for test captions + vec, norm, length = counts2vec(test) + # compute vector for ref captions + score = np.array([0.0 for _ in range(self.n)]) + for ref in refs: + vec_ref, norm_ref, length_ref = counts2vec(ref) + score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) + # change by vrama91 - mean of ngram scores, instead of sum + score_avg = np.mean(score) + # divide by number of references + score_avg /= len(refs) + # multiply score by 10 + score_avg *= 10.0 + # append score of an image to the score list + scores.append(score_avg) + return scores + + def compute_score(self, option=None, verbose=0): + # compute idf + self.compute_doc_freq() + # assert to check document frequency + assert(len(self.ctest) >= max(self.document_frequency.values())) + # compute cider score + score = self.compute_cider() + # debug + # print score + return np.mean(np.array(score)), np.array(score) \ No newline at end of file diff --git a/evalfunc/meteor/__init__.py b/evalfunc/meteor/__init__.py new file mode 100644 index 000000000..3f7d85bba --- /dev/null +++ b/evalfunc/meteor/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/evalfunc/meteor/__pycache__/__init__.cpython-36.pyc b/evalfunc/meteor/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 000000000..d2cc8d83d Binary files /dev/null and b/evalfunc/meteor/__pycache__/__init__.cpython-36.pyc differ diff --git a/evalfunc/meteor/__pycache__/meteor.cpython-36.pyc b/evalfunc/meteor/__pycache__/meteor.cpython-36.pyc new file mode 100644 index 
000000000..ec273dbf7 Binary files /dev/null and b/evalfunc/meteor/__pycache__/meteor.cpython-36.pyc differ diff --git a/evalfunc/meteor/meteor-1.5.jar b/evalfunc/meteor/meteor-1.5.jar new file mode 100644 index 000000000..a833bc017 Binary files /dev/null and b/evalfunc/meteor/meteor-1.5.jar differ diff --git a/evalfunc/meteor/meteor.py b/evalfunc/meteor/meteor.py new file mode 100644 index 000000000..7d19ec494 --- /dev/null +++ b/evalfunc/meteor/meteor.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +# Python wrapper for METEOR implementation, by Xinlei Chen +# Acknowledge Michael Denkowski for the generous discussion and help + +import os +import sys +import subprocess +import threading + +# Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. +METEOR_JAR = 'meteor-1.5.jar' + + +# print METEOR_JAR + +class Meteor: + + def __init__(self): + self.env = os.environ + self.env['LC_ALL'] = 'en_US.UTF_8' + self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ + '-', '-', '-stdio', '-l', 'en', '-norm'] + self.meteor_p = subprocess.Popen(self.meteor_cmd, \ + cwd=os.path.dirname(os.path.abspath(__file__)), \ + stdin=subprocess.PIPE, \ + stdout=subprocess.PIPE, \ + stderr=subprocess.PIPE, + env=self.env, universal_newlines=True, bufsize=1) + # Used to guarantee thread safety + self.lock = threading.Lock() + + def compute_score(self, gts, res): + + scores = [] + + eval_line = 'EVAL' + self.lock.acquire() + for i in range(len(res)): + assert (len(res[i]) == 1) + stat = self._stat(res[i][0], gts[i]) + eval_line += ' ||| {}'.format(stat) + + # Send to METEOR + self.meteor_p.stdin.write(eval_line + '\n') + + # Collect segment scores + for i in range(len(res)): + score = float(self.meteor_p.stdout.readline().strip()) + scores.append(score) + + # Final score + final_score = float(self.meteor_p.stdout.readline().strip()) + self.lock.release() + + return final_score, scores + + def method(self): + return "METEOR" + + def _stat(self, hypothesis_str, reference_list): + # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words + hypothesis_str = hypothesis_str.replace('|||', '').replace(' ', ' ') + score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) + self.meteor_p.stdin.write(score_line + '\n') + return self.meteor_p.stdout.readline().strip() + + def __del__(self): + self.lock.acquire() + self.meteor_p.stdin.close() + self.meteor_p.kill() + self.meteor_p.wait() + self.lock.release() \ No newline at end of file diff --git a/evalfunc/rouge/__init__.py b/evalfunc/rouge/__init__.py new file mode 100644 index 000000000..43a773e12 --- /dev/null +++ b/evalfunc/rouge/__init__.py @@ -0,0 +1 @@ +__author__ = 'vrama91' diff --git a/evalfunc/rouge/__pycache__/__init__.cpython-36.pyc b/evalfunc/rouge/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 000000000..f2d1854c5 Binary files /dev/null and b/evalfunc/rouge/__pycache__/__init__.cpython-36.pyc differ diff --git a/evalfunc/rouge/__pycache__/rouge.cpython-36.pyc b/evalfunc/rouge/__pycache__/rouge.cpython-36.pyc new file mode 100644 index 000000000..e53412784 Binary files /dev/null and b/evalfunc/rouge/__pycache__/rouge.cpython-36.pyc differ diff --git a/evalfunc/rouge/rouge.py b/evalfunc/rouge/rouge.py new file mode 100644 index 000000000..3b75a91b2 --- /dev/null +++ b/evalfunc/rouge/rouge.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python + +# + +# File Name : rouge.py + +# + +# Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) + +# + +# Creation Date : 2015-01-07 
06:03 + +# Author : Ramakrishna Vedantam + + +import numpy as np + +import pdb + + +def my_lcs(string, sub): + """ + + Calculates longest common subsequence for a pair of tokenized strings + + :param string : list of str : tokens from a string split using whitespace + + :param sub : list of str : shorter string, also split using whitespace + + :returns: length (list of int): length of the longest common subsequence between the two strings + + + + Note: my_lcs only gives length of the longest common subsequence, not the actual LCS + + """ + + if (len(string) < len(sub)): + sub, string = string, sub + + lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)] + + for j in range(1, len(sub) + 1): + + for i in range(1, len(string) + 1): + + if (string[i - 1] == sub[j - 1]): + + lengths[i][j] = lengths[i - 1][j - 1] + 1 + + else: + + lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) + + return lengths[len(string)][len(sub)] + + +class Rouge(): + ''' + + Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set + + + + ''' + + def __init__(self): + + # vrama91: updated the value below based on discussion with Hovey + + self.beta = 1.2 + + def calc_score(self, candidate, refs): + + """ + + Compute ROUGE-L score given one candidate and references for an image + + :param candidate: str : candidate sentence to be evaluated + + :param refs: list of str : COCO reference sentences for the particular image to be evaluated + + :returns score: int (ROUGE-L score for the candidate evaluated against references) + + """ + + + assert (len(candidate) == 1) + + assert (len(refs) > 0) + + prec = [] + + rec = [] + + # split into tokens + + token_c = candidate[0].split(" ") + + for reference in refs: + # split into tokens + hh =1 + + token_r = reference.split(" ") + + # compute the longest common subsequence + + lcs = my_lcs(token_r, token_c) + + prec.append(lcs / float(len(token_c))) + + rec.append(lcs / float(len(token_r))) + + prec_max = max(prec) + + rec_max = max(rec) + + if (prec_max != 0 and rec_max != 0): + + score = ((1 + self.beta ** 2) * prec_max * rec_max) / float(rec_max + self.beta ** 2 * prec_max) + + else: + + score = 0.0 + + return score + + def compute_score(self, references, hypotheses): + + """ + + Computes Rouge-L score given a set of reference and candidate sentences for the dataset + + Invoked by evaluate_captions.py + + :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values + + :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values + + :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) + + """ + + # assert (gts.keys() == res.keys()) + # + # imgIds = gts.keys() + + score = [] + + for i in range(len(hypotheses)): + hypo = hypotheses[i] + ref = references[i] + + score.append(self.calc_score(hypo, ref)) + + # Sanity check. + + assert (type(hypo) is list) + + assert (len(hypo) == 1) + + assert (type(ref) is list) + + assert (len(ref) > 0) + + average_score = np.mean(np.array(score)) + + return average_score, np.array(score) + + def method(self): + + return "Rouge" \ No newline at end of file
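For completeness, a minimal standalone sketch of driving the scorers added in this diff outside eval.py, using hypothetical toy captions rather than model output. Meteor is omitted here because it shells out to meteor-1.5.jar and needs a Java runtime on the PATH; the call order (references first, then hypotheses) matches the compute_score(ref, hypo) calls in eval.py.

    from evalfunc.bleu.bleu import Bleu
    from evalfunc.cider.cider import Cider
    from evalfunc.rouge.rouge import Rouge

    # One hypothesis (single-string list) and its reference captions per image.
    hypo = [['a dog runs across the grass'], ['a man rides a brown horse']]
    ref = [['a dog is running on the grass', 'the dog runs across a green field'],
           ['a man is riding a horse', 'a person rides a brown horse']]

    scorers = [(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
               (Cider(), 'CIDEr'),
               (Rouge(), 'ROUGE_L')]
    for scorer, names in scorers:
        score, _ = scorer.compute_score(ref, hypo)
        if isinstance(names, list):  # Bleu returns one score per n-gram order
            for name, value in zip(names, score):
                print('%s: %.4f' % (name, value))
        else:
            print('%s: %.4f' % (names, score))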