diff --git a/eval.py b/eval.py
index 3e9359a09..27a2d0d3f 100644
--- a/eval.py
+++ b/eval.py
@@ -7,6 +7,11 @@
 from nltk.translate.bleu_score import corpus_bleu
 import torch.nn.functional as F
 from tqdm import tqdm
+# score evaluation packages from the COCO caption API
+from evalfunc.bleu.bleu import Bleu
+from evalfunc.rouge.rouge import Rouge
+from evalfunc.cider.cider import Cider
+from evalfunc.meteor.meteor import Meteor
 
 # Parameters
 data_folder = '/media/ssd/caption data'  # folder with data files saved by create_input_files.py
@@ -168,12 +173,30 @@ def evaluate(beam_size):
 
         assert len(references) == len(hypotheses)
 
-    # Calculate BLEU-4 scores
-    bleu4 = corpus_bleu(references, hypotheses)
-
-    return bleu4
+    # Calculate BLEU, CIDEr, METEOR, and ROUGE-L scores
+    scorers = [
+        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
+        (Cider(), "CIDEr"),
+        (Meteor(), "METEOR"),
+        (Rouge(), "ROUGE_L")
+    ]
+
+    hypo = [[' '.join(hypo)] for hypo in [[str(x) for x in hypo] for hypo in hypotheses]]
+    ref = [[' '.join(reft) for reft in reftmp] for reftmp in [[[str(x) for x in reft] for reft in reftmp] for reftmp in references]]
+
+    score = []
+    method = []
+    for scorer, method_i in scorers:
+        score_i, scores_i = scorer.compute_score(ref, hypo)
+        score.extend(score_i) if isinstance(score_i, list) else score.append(score_i)
+        method.extend(method_i) if isinstance(method_i, list) else method.append(method_i)
+    score_dict = dict(zip(method, score))
+
+    return score_dict
 
 
 if __name__ == '__main__':
     beam_size = 1
-    print("\nBLEU-4 score @ beam size of %d is %.4f." % (beam_size, evaluate(beam_size)))
+    score_dict = evaluate(beam_size)
+    for method, score in score_dict.items():
+        print("%s score @ beam size of %d is %.4f" % (method, beam_size, score))
diff --git a/evalfunc/bleu/LICENSE b/evalfunc/bleu/LICENSE
new file mode 100644
index 000000000..9ccf67790
--- /dev/null
+++ b/evalfunc/bleu/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
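Note on the eval.py hunk above: the COCO-caption scorers consume whitespace-tokenized caption strings, so evaluate() first flattens its word-index sequences into space-joined strings. Each hypothesis becomes a one-element list holding a single string, and each reference entry becomes a list of such strings (one per ground-truth caption). A minimal sketch of that reshaping with hypothetical toy token IDs (not the real beam-search output):

    # Toy word-index sequences, stand-ins for the sequences decoded in evaluate().
    hypotheses = [[3, 17, 42], [5, 9]]
    references = [[[3, 17, 42], [3, 17, 8]], [[5, 9, 2]]]

    # Equivalent to the comprehensions in the hunk above, written out step by step.
    hypo = [[' '.join(str(t) for t in h)] for h in hypotheses]
    ref = [[' '.join(str(t) for t in r) for r in refs] for refs in references]
    # hypo -> [['3 17 42'], ['5 9']]
    # ref  -> [['3 17 42', '3 17 8'], ['5 9 2']]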
diff --git a/evalfunc/bleu/__init__.py b/evalfunc/bleu/__init__.py new file mode 100644 index 000000000..3f7d85bba --- /dev/null +++ b/evalfunc/bleu/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/evalfunc/bleu/__pycache__/__init__.cpython-36.pyc b/evalfunc/bleu/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 000000000..4d926fd03 Binary files /dev/null and b/evalfunc/bleu/__pycache__/__init__.cpython-36.pyc differ diff --git a/evalfunc/bleu/__pycache__/bleu.cpython-36.pyc b/evalfunc/bleu/__pycache__/bleu.cpython-36.pyc new file mode 100644 index 000000000..37fda478b Binary files /dev/null and b/evalfunc/bleu/__pycache__/bleu.cpython-36.pyc differ diff --git a/evalfunc/bleu/__pycache__/bleu_scorer.cpython-36.pyc b/evalfunc/bleu/__pycache__/bleu_scorer.cpython-36.pyc new file mode 100644 index 000000000..7dd40009f Binary files /dev/null and b/evalfunc/bleu/__pycache__/bleu_scorer.cpython-36.pyc differ diff --git a/evalfunc/bleu/bleu.py b/evalfunc/bleu/bleu.py new file mode 100644 index 000000000..c1bd0b4c6 --- /dev/null +++ b/evalfunc/bleu/bleu.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# +# File Name : bleu.py +# +# Description : Wrapper for BLEU scorer. +# +# Creation Date : 06-01-2015 +# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT +# Authors : Hao Fang and Tsung-Yi Lin + +from .bleu_scorer import BleuScorer + + +class Bleu: + def __init__(self, n=4): + # default compute Blue score up to 4 + self._n = n + self._hypo_for_image = {} + self.ref_for_image = {} + + def compute_score(self, gts, res): + + bleu_scorer = BleuScorer(n=self._n) + for i in range(len(res)): + hypo = res[i] + ref = gts[i] + + # Sanity check. + assert(type(hypo) is list) + assert(len(hypo) == 1) + assert(type(ref) is list) + assert(len(ref) >= 1) + + bleu_scorer += (hypo[0], ref) + + #score, scores = bleu_scorer.compute_score(option='shortest') + score, scores = bleu_scorer.compute_score(option='closest', verbose=1) + #score, scores = bleu_scorer.compute_score(option='average', verbose=1) + + # return (bleu, bleu_info) + return score, scores + + def method(self): + return "Bleu" diff --git a/evalfunc/bleu/bleu_scorer.py b/evalfunc/bleu/bleu_scorer.py new file mode 100644 index 000000000..ec8220847 --- /dev/null +++ b/evalfunc/bleu/bleu_scorer.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python + +# bleu_scorer.py +# David Chiang + +# Copyright (c) 2004-2006 University of Maryland. All rights +# reserved. Do not redistribute without permission from the +# author. Not for commercial use. + +# Modified by: +# Hao Fang +# Tsung-Yi Lin + +'''Provides: +cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). +cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). +''' + +import copy +import sys, math, re +from collections import defaultdict + +def precook(s, n=4, out=False): + """Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. 
This is optional: cook_refs and cook_test + can take string arguments as well.""" + words = s.split() + counts = defaultdict(int) + for k in range(1,n+1): + for i in range(len(words)-k+1): + ngram = tuple(words[i:i+k]) + counts[ngram] += 1 + return (len(words), counts) + +def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them.''' + + reflen = [] + maxcounts = {} + for ref in refs: + rl, counts = precook(ref, n) + reflen.append(rl) + for (ngram,count) in counts.items(): + maxcounts[ngram] = max(maxcounts.get(ngram,0), count) + + # Calculate effective reference sentence length. + if eff == "shortest": + reflen = min(reflen) + elif eff == "average": + reflen = float(sum(reflen))/len(reflen) + + ## lhuang: N.B.: leave reflen computaiton to the very end!! + + ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) + + return (reflen, maxcounts) + +def cook_test(test, xxx_todo_changeme, eff=None, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it.''' + (reflen, refmaxcounts) = xxx_todo_changeme + testlen, counts = precook(test, n, True) + + result = {} + + # Calculate effective reference sentence length. + + if eff == "closest": + result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] + else: ## i.e., "average" or "shortest" or None + result["reflen"] = reflen + + result["testlen"] = testlen + + result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)] + + result['correct'] = [0]*n + for (ngram, count) in counts.items(): + result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) + + return result + +class BleuScorer(object): + """Bleu scorer. + """ + + __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" + # special_reflen is used in oracle (proportional effective ref len for a node). 
+ + def copy(self): + ''' copy the refs.''' + new = BleuScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + new._score = None + return new + + def __init__(self, test=None, refs=None, n=4, special_reflen=None): + ''' singular instance ''' + + self.n = n + self.crefs = [] + self.ctest = [] + self.cook_append(test, refs) + self.special_reflen = special_reflen + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + cooked_test = cook_test(test, self.crefs[-1]) + self.ctest.append(cooked_test) ## N.B.: -1 + else: + self.ctest.append(None) # lens of crefs and ctest have to match + + self._score = None ## need to recompute + + def ratio(self, option=None): + self.compute_score(option=option) + return self._ratio + + def score_ratio(self, option=None): + '''return (bleu, len_ratio) pair''' + return (self.fscore(option=option), self.ratio(option=option)) + + def score_ratio_str(self, option=None): + return "%.4f (%.2f)" % self.score_ratio(option) + + def reflen(self, option=None): + self.compute_score(option=option) + return self._reflen + + def testlen(self, option=None): + self.compute_score(option=option) + return self._testlen + + def retest(self, new_test): + if type(new_test) is str: + new_test = [new_test] + assert len(new_test) == len(self.crefs), new_test + self.ctest = [] + for t, rs in zip(new_test, self.crefs): + self.ctest.append(cook_test(t, rs)) + self._score = None + + return self + + def rescore(self, new_test): + ''' replace test(s) with new test(s), and returns the new score.''' + + return self.retest(new_test).compute_score() + + def size(self): + assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + ## avoid creating new BleuScorer instances + self.cook_append(other[0], other[1]) + else: + assert self.compatible(other), "incompatible BLEUs." 
+ self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + self._score = None ## need to recompute + + return self + + def compatible(self, other): + return isinstance(other, BleuScorer) and self.n == other.n + + def single_reflen(self, option="average"): + return self._single_reflen(self.crefs[0][0], option) + + def _single_reflen(self, reflens, option=None, testlen=None): + + if option == "shortest": + reflen = min(reflens) + elif option == "average": + reflen = float(sum(reflens))/len(reflens) + elif option == "closest": + reflen = min((abs(l-testlen), l) for l in reflens)[1] + else: + assert False, "unsupported reflen option %s" % option + + return reflen + + def recompute_score(self, option=None, verbose=0): + self._score = None + return self.compute_score(option, verbose) + + def compute_score(self, option=None, verbose=0): + n = self.n + small = 1e-9 + tiny = 1e-15 ## so that if guess is 0 still return 0 + bleu_list = [[] for _ in range(n)] + + if self._score is not None: + return self._score + + if option is None: + option = "average" if len(self.crefs) == 1 else "closest" + + self._testlen = 0 + self._reflen = 0 + totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} + + # for each sentence + for comps in self.ctest: + testlen = comps['testlen'] + self._testlen += testlen + + if self.special_reflen is None: ## need computation + reflen = self._single_reflen(comps['reflen'], option, testlen) + else: + reflen = self.special_reflen + + self._reflen += reflen + + for key in ['guess','correct']: + for k in range(n): + totalcomps[key][k] += comps[key][k] + + # append per image bleu score + bleu = 1. + for k in range(n): + bleu *= (float(comps['correct'][k]) + tiny) \ + /(float(comps['guess'][k]) + small) + bleu_list[k].append(bleu ** (1./(k+1))) + ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleu_list[k][-1] *= math.exp(1 - 1/ratio) + + # if verbose > 1: + # print(comps, reflen) + + totalcomps['reflen'] = self._reflen + totalcomps['testlen'] = self._testlen + + bleus = [] + bleu = 1. 
+ for k in range(n): + bleu *= float(totalcomps['correct'][k] + tiny) \ + / (totalcomps['guess'][k] + small) + bleus.append(bleu ** (1./(k+1))) + ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleus[k] *= math.exp(1 - 1/ratio) + + # if verbose > 0: + # print(totalcomps) + # print("ratio:", ratio) + + self._score = bleus + return self._score, bleu_list diff --git a/evalfunc/cider/__init__.py b/evalfunc/cider/__init__.py new file mode 100644 index 000000000..3f7d85bba --- /dev/null +++ b/evalfunc/cider/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/evalfunc/cider/__pycache__/__init__.cpython-36.pyc b/evalfunc/cider/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 000000000..e94722c72 Binary files /dev/null and b/evalfunc/cider/__pycache__/__init__.cpython-36.pyc differ diff --git a/evalfunc/cider/__pycache__/cider.cpython-36.pyc b/evalfunc/cider/__pycache__/cider.cpython-36.pyc new file mode 100644 index 000000000..1e828105d Binary files /dev/null and b/evalfunc/cider/__pycache__/cider.cpython-36.pyc differ diff --git a/evalfunc/cider/__pycache__/cider_scorer.cpython-36.pyc b/evalfunc/cider/__pycache__/cider_scorer.cpython-36.pyc new file mode 100644 index 000000000..4d72af8dd Binary files /dev/null and b/evalfunc/cider/__pycache__/cider_scorer.cpython-36.pyc differ diff --git a/evalfunc/cider/cider.py b/evalfunc/cider/cider.py new file mode 100644 index 000000000..e339e1753 --- /dev/null +++ b/evalfunc/cider/cider.py @@ -0,0 +1,51 @@ +# Filename: cider.py +# +# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric +# by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) +# +# Creation Date: Sun Feb 8 14:16:54 2015 +# +# Authors: Ramakrishna Vedantam and Tsung-Yi Lin + +from evalfunc.cider.cider_scorer import CiderScorer +import pdb + +class Cider: + """ + Main Class to compute the CIDEr metric + + """ + def __init__(self, test=None, refs=None, n=4, sigma=6.0): + # set cider to sum over 1 to 4-grams + self._n = n + # set the standard deviation parameter for gaussian penalty + self._sigma = sigma + + def compute_score(self, gts, res): + """ + Main function to compute CIDEr score + :param hypo_for_image (dict) : dictionary with key and value + ref_for_image (dict) : dictionary with key and value + :return: cider (float) : computed CIDEr score for the corpus + """ + + cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) + + for i in range(len(res)): + hypo = res[i] + ref = gts[i] + + # Sanity check. + assert(type(hypo) is list) + assert(len(hypo) == 1) + assert(type(ref) is list) + assert(len(ref) > 0) + + cider_scorer += (hypo[0], ref) + + (score, scores) = cider_scorer.compute_score() + + return score, scores + + def method(self): + return "CIDEr" \ No newline at end of file diff --git a/evalfunc/cider/cider_scorer.py b/evalfunc/cider/cider_scorer.py new file mode 100644 index 000000000..33579d91d --- /dev/null +++ b/evalfunc/cider/cider_scorer.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python +# Tsung-Yi Lin +# Ramakrishna Vedantam + +import copy +from collections import defaultdict +import numpy as np +import pdb +import math + +def precook(s, n=4, out=False): + """ + Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. This is optional: cook_refs and cook_test + can take string arguments as well. 
+ :param s: string : sentence to be converted into ngrams + :param n: int : number of ngrams for which representation is calculated + :return: term frequency vector for occuring ngrams + """ + words = s.split() + counts = defaultdict(int) + for k in range(1,n+1): + for i in range(len(words)-k+1): + ngram = tuple(words[i:i+k]) + counts[ngram] += 1 + return counts + +def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them. + :param refs: list of string : reference sentences for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (list of dict) + ''' + return [precook(ref, n) for ref in refs] + +def cook_test(test, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it. + :param test: list of string : hypothesis sentence for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (dict) + ''' + return precook(test, n, True) + +class CiderScorer(object): + """CIDEr scorer. + """ + + def copy(self): + ''' copy the refs.''' + new = CiderScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + return new + + def __init__(self, test=None, refs=None, n=4, sigma=6.0): + ''' singular instance ''' + self.n = n + self.sigma = sigma + self.crefs = [] + self.ctest = [] + self.document_frequency = defaultdict(float) + self.cook_append(test, refs) + self.ref_len = None + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + self.ctest.append(cook_test(test)) ## N.B.: -1 + else: + self.ctest.append(None) # lens of crefs and ctest have to match + + def size(self): + assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + ## avoid creating new CiderScorer instances + self.cook_append(other[0], other[1]) + else: + self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + + return self + def compute_doc_freq(self): + ''' + Compute term frequency for reference data. + This will be used to compute idf (inverse document frequency later) + The term frequency is stored in the object + :return: None + ''' + for refs in self.crefs: + # refs, k ref captions of one image + for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): + self.document_frequency[ngram] += 1 + # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) + + def compute_cider(self): + def counts2vec(cnts): + """ + Function maps counts of ngram to vector of tfidf weights. + The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. + The n-th entry of array denotes length of n-grams. 
+ :param cnts: + :return: vec (array of dict), norm (array of float), length (int) + """ + vec = [defaultdict(float) for _ in range(self.n)] + length = 0 + norm = [0.0 for _ in range(self.n)] + for (ngram,term_freq) in cnts.items(): + # give word count 1 if it doesn't appear in reference corpus + df = np.log(max(1.0, self.document_frequency[ngram])) + # ngram index + n = len(ngram)-1 + # tf (term_freq) * idf (precomputed idf) for n-grams + vec[n][ngram] = float(term_freq)*(self.ref_len - df) + # compute norm for the vector. the norm will be used for computing similarity + norm[n] += pow(vec[n][ngram], 2) + + if n == 1: + length += term_freq + norm = [np.sqrt(n) for n in norm] + return vec, norm, length + + def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): + ''' + Compute the cosine similarity of two vectors. + :param vec_hyp: array of dictionary for vector corresponding to hypothesis + :param vec_ref: array of dictionary for vector corresponding to reference + :param norm_hyp: array of float for vector corresponding to hypothesis + :param norm_ref: array of float for vector corresponding to reference + :param length_hyp: int containing length of hypothesis + :param length_ref: int containing length of reference + :return: array of score for each n-grams cosine similarity + ''' + delta = float(length_hyp - length_ref) + # measure consine similarity + val = np.array([0.0 for _ in range(self.n)]) + for n in range(self.n): + # ngram + for (ngram,count) in vec_hyp[n].items(): + # vrama91 : added clipping + val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] + + if (norm_hyp[n] != 0) and (norm_ref[n] != 0): + val[n] /= (norm_hyp[n]*norm_ref[n]) + + assert(not math.isnan(val[n])) + # vrama91: added a length based gaussian penalty + val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) + return val + + # compute log reference length + self.ref_len = np.log(float(len(self.crefs))) + + scores = [] + for test, refs in zip(self.ctest, self.crefs): + # compute vector for test captions + vec, norm, length = counts2vec(test) + # compute vector for ref captions + score = np.array([0.0 for _ in range(self.n)]) + for ref in refs: + vec_ref, norm_ref, length_ref = counts2vec(ref) + score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) + # change by vrama91 - mean of ngram scores, instead of sum + score_avg = np.mean(score) + # divide by number of references + score_avg /= len(refs) + # multiply score by 10 + score_avg *= 10.0 + # append score of an image to the score list + scores.append(score_avg) + return scores + + def compute_score(self, option=None, verbose=0): + # compute idf + self.compute_doc_freq() + # assert to check document frequency + assert(len(self.ctest) >= max(self.document_frequency.values())) + # compute cider score + score = self.compute_cider() + # debug + # print score + return np.mean(np.array(score)), np.array(score) \ No newline at end of file diff --git a/evalfunc/meteor/__init__.py b/evalfunc/meteor/__init__.py new file mode 100644 index 000000000..3f7d85bba --- /dev/null +++ b/evalfunc/meteor/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/evalfunc/meteor/__pycache__/__init__.cpython-36.pyc b/evalfunc/meteor/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 000000000..d2cc8d83d Binary files /dev/null and b/evalfunc/meteor/__pycache__/__init__.cpython-36.pyc differ diff --git a/evalfunc/meteor/__pycache__/meteor.cpython-36.pyc b/evalfunc/meteor/__pycache__/meteor.cpython-36.pyc new file mode 100644 index 
000000000..ec273dbf7 Binary files /dev/null and b/evalfunc/meteor/__pycache__/meteor.cpython-36.pyc differ diff --git a/evalfunc/meteor/meteor-1.5.jar b/evalfunc/meteor/meteor-1.5.jar new file mode 100644 index 000000000..a833bc017 Binary files /dev/null and b/evalfunc/meteor/meteor-1.5.jar differ diff --git a/evalfunc/meteor/meteor.py b/evalfunc/meteor/meteor.py new file mode 100644 index 000000000..7d19ec494 --- /dev/null +++ b/evalfunc/meteor/meteor.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +# Python wrapper for METEOR implementation, by Xinlei Chen +# Acknowledge Michael Denkowski for the generous discussion and help + +import os +import sys +import subprocess +import threading + +# Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. +METEOR_JAR = 'meteor-1.5.jar' + + +# print METEOR_JAR + +class Meteor: + + def __init__(self): + self.env = os.environ + self.env['LC_ALL'] = 'en_US.UTF_8' + self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ + '-', '-', '-stdio', '-l', 'en', '-norm'] + self.meteor_p = subprocess.Popen(self.meteor_cmd, \ + cwd=os.path.dirname(os.path.abspath(__file__)), \ + stdin=subprocess.PIPE, \ + stdout=subprocess.PIPE, \ + stderr=subprocess.PIPE, + env=self.env, universal_newlines=True, bufsize=1) + # Used to guarantee thread safety + self.lock = threading.Lock() + + def compute_score(self, gts, res): + + scores = [] + + eval_line = 'EVAL' + self.lock.acquire() + for i in range(len(res)): + assert (len(res[i]) == 1) + stat = self._stat(res[i][0], gts[i]) + eval_line += ' ||| {}'.format(stat) + + # Send to METEOR + self.meteor_p.stdin.write(eval_line + '\n') + + # Collect segment scores + for i in range(len(res)): + score = float(self.meteor_p.stdout.readline().strip()) + scores.append(score) + + # Final score + final_score = float(self.meteor_p.stdout.readline().strip()) + self.lock.release() + + return final_score, scores + + def method(self): + return "METEOR" + + def _stat(self, hypothesis_str, reference_list): + # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words + hypothesis_str = hypothesis_str.replace('|||', '').replace(' ', ' ') + score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) + self.meteor_p.stdin.write(score_line + '\n') + return self.meteor_p.stdout.readline().strip() + + def __del__(self): + self.lock.acquire() + self.meteor_p.stdin.close() + self.meteor_p.kill() + self.meteor_p.wait() + self.lock.release() \ No newline at end of file diff --git a/evalfunc/rouge/__init__.py b/evalfunc/rouge/__init__.py new file mode 100644 index 000000000..43a773e12 --- /dev/null +++ b/evalfunc/rouge/__init__.py @@ -0,0 +1 @@ +__author__ = 'vrama91' diff --git a/evalfunc/rouge/__pycache__/__init__.cpython-36.pyc b/evalfunc/rouge/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 000000000..f2d1854c5 Binary files /dev/null and b/evalfunc/rouge/__pycache__/__init__.cpython-36.pyc differ diff --git a/evalfunc/rouge/__pycache__/rouge.cpython-36.pyc b/evalfunc/rouge/__pycache__/rouge.cpython-36.pyc new file mode 100644 index 000000000..e53412784 Binary files /dev/null and b/evalfunc/rouge/__pycache__/rouge.cpython-36.pyc differ diff --git a/evalfunc/rouge/rouge.py b/evalfunc/rouge/rouge.py new file mode 100644 index 000000000..3b75a91b2 --- /dev/null +++ b/evalfunc/rouge/rouge.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python + +# + +# File Name : rouge.py + +# + +# Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) + +# + +# Creation Date : 2015-01-07 
06:03 + +# Author : Ramakrishna Vedantam + + +import numpy as np + +import pdb + + +def my_lcs(string, sub): + """ + + Calculates longest common subsequence for a pair of tokenized strings + + :param string : list of str : tokens from a string split using whitespace + + :param sub : list of str : shorter string, also split using whitespace + + :returns: length (list of int): length of the longest common subsequence between the two strings + + + + Note: my_lcs only gives length of the longest common subsequence, not the actual LCS + + """ + + if (len(string) < len(sub)): + sub, string = string, sub + + lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)] + + for j in range(1, len(sub) + 1): + + for i in range(1, len(string) + 1): + + if (string[i - 1] == sub[j - 1]): + + lengths[i][j] = lengths[i - 1][j - 1] + 1 + + else: + + lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) + + return lengths[len(string)][len(sub)] + + +class Rouge(): + ''' + + Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set + + + + ''' + + def __init__(self): + + # vrama91: updated the value below based on discussion with Hovey + + self.beta = 1.2 + + def calc_score(self, candidate, refs): + + """ + + Compute ROUGE-L score given one candidate and references for an image + + :param candidate: str : candidate sentence to be evaluated + + :param refs: list of str : COCO reference sentences for the particular image to be evaluated + + :returns score: int (ROUGE-L score for the candidate evaluated against references) + + """ + + + assert (len(candidate) == 1) + + assert (len(refs) > 0) + + prec = [] + + rec = [] + + # split into tokens + + token_c = candidate[0].split(" ") + + for reference in refs: + # split into tokens + hh =1 + + token_r = reference.split(" ") + + # compute the longest common subsequence + + lcs = my_lcs(token_r, token_c) + + prec.append(lcs / float(len(token_c))) + + rec.append(lcs / float(len(token_r))) + + prec_max = max(prec) + + rec_max = max(rec) + + if (prec_max != 0 and rec_max != 0): + + score = ((1 + self.beta ** 2) * prec_max * rec_max) / float(rec_max + self.beta ** 2 * prec_max) + + else: + + score = 0.0 + + return score + + def compute_score(self, references, hypotheses): + + """ + + Computes Rouge-L score given a set of reference and candidate sentences for the dataset + + Invoked by evaluate_captions.py + + :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values + + :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values + + :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) + + """ + + # assert (gts.keys() == res.keys()) + # + # imgIds = gts.keys() + + score = [] + + for i in range(len(hypotheses)): + hypo = hypotheses[i] + ref = references[i] + + score.append(self.calc_score(hypo, ref)) + + # Sanity check. + + assert (type(hypo) is list) + + assert (len(hypo) == 1) + + assert (type(ref) is list) + + assert (len(ref) > 0) + + average_score = np.mean(np.array(score)) + + return average_score, np.array(score) + + def method(self): + + return "Rouge" \ No newline at end of file
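For completeness, a minimal standalone sketch of driving the scorers added in this diff outside eval.py, using hypothetical toy captions rather than model output. Meteor is omitted here because it shells out to meteor-1.5.jar and needs a Java runtime on the PATH; the call order (references first, then hypotheses) matches the compute_score(ref, hypo) calls in eval.py.

    from evalfunc.bleu.bleu import Bleu
    from evalfunc.cider.cider import Cider
    from evalfunc.rouge.rouge import Rouge

    # One hypothesis (single-string list) and its reference captions per image.
    hypo = [['a dog runs across the grass'], ['a man rides a brown horse']]
    ref = [['a dog is running on the grass', 'the dog runs across a green field'],
           ['a man is riding a horse', 'a person rides a brown horse']]

    scorers = [(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
               (Cider(), 'CIDEr'),
               (Rouge(), 'ROUGE_L')]
    for scorer, names in scorers:
        score, _ = scorer.compute_score(ref, hypo)
        if isinstance(names, list):  # Bleu returns one score per n-gram order
            for name, value in zip(names, score):
                print('%s: %.4f' % (name, value))
        else:
            print('%s: %.4f' % (names, score))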