Skip to content

Commit f487f9a

Browse files
committed
Split on commas after removing commas between digits
1 parent 4962dec commit f487f9a

File tree

1 file changed

+11
-7
lines changed

1 file changed

+11
-7
lines changed

webstruct/tokenize.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,31 +25,35 @@ class WordTokenizer(object):
2525
2626
>>> s = '''population of 100,000'''
2727
>>> WordTokenizer().tokenize(s)
28-
['population', 'of', '100,000']
28+
['population', 'of', '100000']
29+
30+
>>> s = '''unit 6,'''
31+
>>> WordTokenizer().tokenize(s)
32+
['unit', '6', ',']
2933
3034
"""
3135
def tokenize(self, text):
32-
#starting quotes
36+
# starting quotes
3337
text = re.sub(r'^\"', r'``', text)
3438
text = re.sub(r'(``)', r' \1 ', text)
3539
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
3640

37-
#punctuation
38-
text = re.sub(r'(?<!\d)([,])', r' \1 ', text) # CHANGED :
41+
# punctuation
42+
text = re.sub(r'(?<=\d)([,])(?=\d)', '', text) # remove ',' in digits
3943
text = re.sub(r'\.\.\.', r' ... ', text)
40-
text = re.sub(r'[;#$%&]', r' \g<0> ', text) # CHANGED @
44+
text = re.sub(r'[;#$%&,]', r' \g<0> ', text) # CHANGED @
4145

4246

4347
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
4448
text = re.sub(r'[?!]', r' \g<0> ', text)
4549

4650
text = re.sub(r"([^'])' ", r"\1 ' ", text)
4751

48-
#parens, brackets, etc.
52+
# parens, brackets, etc.
4953
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
5054
text = re.sub(r'--', r' -- ', text)
5155

52-
#add extra space to make things easier
56+
# add extra space to make things easier
5357
text = " " + text + " "
5458

5559
#ending quotes

0 commit comments

Comments
 (0)