@@ -25,31 +25,35 @@ class WordTokenizer(object):
25
25
26
26
>>> s = '''population of 100,000'''
27
27
>>> WordTokenizer().tokenize(s)
28
- ['population', 'of', '100,000']
28
+ ['population', 'of', '100000']
29
+
30
+ >>> s = '''unit 6,'''
31
+ >>> WordTokenizer().tokenize(s)
32
+ ['unit', '6', ',']
29
33
30
34
"""
31
35
def tokenize (self , text ):
32
- #starting quotes
36
+ # starting quotes
33
37
text = re .sub (r'^\"' , r'``' , text )
34
38
text = re .sub (r'(``)' , r' \1 ' , text )
35
39
text = re .sub (r'([ (\[{<])"' , r'\1 `` ' , text )
36
40
37
- #punctuation
38
- text = re .sub (r'(?<! \d)([,])' , r' \1 ' , text ) # CHANGED :
41
+ # punctuation
42
+ text = re .sub (r'(?<= \d)([,])(?=\d) ' , ' ' , text ) # remove ',' in digits
39
43
text = re .sub (r'\.\.\.' , r' ... ' , text )
40
- text = re .sub (r'[;#$%&]' , r' \g<0> ' , text ) # CHANGED @
44
+ text = re .sub (r'[;#$%&, ]' , r' \g<0> ' , text ) # CHANGED @
41
45
42
46
43
47
text = re .sub (r'([^\.])(\.)([\]\)}>"\']*)\s*$' , r'\1 \2\3 ' , text )
44
48
text = re .sub (r'[?!]' , r' \g<0> ' , text )
45
49
46
50
text = re .sub (r"([^'])' " , r"\1 ' " , text )
47
51
48
- #parens, brackets, etc.
52
+ # parens, brackets, etc.
49
53
text = re .sub (r'[\]\[\(\)\{\}\<\>]' , r' \g<0> ' , text )
50
54
text = re .sub (r'--' , r' -- ' , text )
51
55
52
- #add extra space to make things easier
56
+ # add extra space to make things easier
53
57
text = " " + text + " "
54
58
55
59
# ending quotes
0 commit comments