Split on commas after removing commas between digits

tpeng · tpeng · commit f487f9aeec23 · 2013-11-04T16:09:26.000+08:00
diff --git a/webstruct/tokenize.py b/webstruct/tokenize.py
@@ -25,31 +25,35 @@ class WordTokenizer(object):
 
         >>> s = '''population of 100,000'''
         >>> WordTokenizer().tokenize(s)
-        ['population', 'of', '100,000']
+        ['population', 'of', '100000']
+
+        >>> s = '''unit 6,'''
+        >>> WordTokenizer().tokenize(s)
+        ['unit', '6', ',']
 
     """
     def tokenize(self, text):
-        #starting quotes
+        # starting quotes
         text = re.sub(r'^\"', r'``', text)
         text = re.sub(r'(``)', r' \1 ', text)
         text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
 
-        #punctuation
-        text = re.sub(r'(?<!\d)([,])', r' \1 ', text)     # CHANGED :
+        # punctuation
+        text = re.sub(r'(?<=\d)([,])(?=\d)', '', text)      # remove ',' in digits
         text = re.sub(r'\.\.\.', r' ... ', text)
-        text = re.sub(r'[;#$%&]', r' \g<0> ', text)         # CHANGED @
+        text = re.sub(r'[;#$%&,]', r' \g<0> ', text)         # CHANGED @
 
 
         text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
         text = re.sub(r'[?!]', r' \g<0> ', text)
 
         text = re.sub(r"([^'])' ", r"\1 ' ", text)
 
-        #parens, brackets, etc.
+        # parens, brackets, etc.
         text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
         text = re.sub(r'--', r' -- ', text)
 
-        #add extra space to make things easier
+        # add extra space to make things easier
         text = " " + text + " "
 
         #ending quotes