6
6
"hash/fnv"
7
7
"html/template"
8
8
"sort"
9
- "strconv"
10
- "strings"
11
9
tmpl "text/template"
12
10
"unicode/utf8"
13
11
@@ -18,16 +16,16 @@ import (
18
16
"github.com/rs/zerolog/log"
19
17
)
20
18
21
- type CSVKey struct {
22
- filename string
23
- lineKey string
24
- }
25
-
26
19
type JaccardCSV struct {
27
20
csvLine model.Entry
28
21
lineKey string
29
22
}
30
23
24
// fileuri is the URI of a CSV file as rendered from the URI template.
// It is the outer key of MaskEngine.csvEntryByKey.
type fileuri string

// linekey is the exact-match key computed for a CSV line or for an entry.
// It is the inner key of MaskEngine.csvEntryByKey.
type linekey string
28
+
31
29
type MaskEngine struct {
32
30
seeder model.Seeder
33
31
templateURI * template.Template
@@ -36,8 +34,7 @@ type MaskEngine struct {
36
34
temJaccardCSV * tmlmask.Engine // template to compute key for a csv entry
37
35
temJaccardEntry * tmlmask.Engine // template to compute key for json entry
38
36
expected string
39
- csvAllreadyRead map [string ][]model.Dictionary
40
- csvEntryByKey map [CSVKey ][]model.Entry
37
+ csvEntryByKey map [fileuri ]map [linekey ][]model.Entry
41
38
header bool
42
39
sep rune
43
40
comment rune
@@ -115,8 +112,7 @@ func NewMask(conf model.FindInCSVType, seed int64, seeder model.Seeder) (MaskEng
115
112
temJaccardCSV ,
116
113
temJaccardEntry ,
117
114
expected ,
118
- map [string ][]model.Dictionary {},
119
- map [CSVKey ][]model.Entry {},
115
+ map [fileuri ]map [linekey ][]model.Entry {},
120
116
conf .Header ,
121
117
sep , comment , conf .FieldsPerRecord , conf .TrimSpace ,
122
118
}, err
@@ -133,10 +129,10 @@ func (me *MaskEngine) Mask(e model.Entry, context ...model.Dictionary) (model.En
133
129
if err := me .templateURI .Execute (& filenameBuffer , context [0 ].UnpackUnordered ()); err != nil {
134
130
return nil , err
135
131
}
136
- filename := filenameBuffer .String ()
132
+ filename := fileuri ( filenameBuffer .String () )
137
133
138
134
// Get ExactMatch results
139
- exactMatchFinded , exactMatchResult , err := me .ExactMatch (filename , context )
135
+ exactMatchFinded , exactMatchResult , err := me .exactMatch (filename , context )
140
136
if err != nil {
141
137
return nil , err
142
138
}
@@ -154,7 +150,7 @@ func (me *MaskEngine) Mask(e model.Entry, context ...model.Dictionary) (model.En
154
150
}
155
151
156
152
// getJaccardMatchResults calculates Jaccard similarity for the given CSV filename and exact match results.
157
- func (me * MaskEngine ) getJaccardMatchResults (filename string , exactMatchResults []model.Entry , context []model.Dictionary ) ([]model.Entry , error ) {
153
+ func (me * MaskEngine ) getJaccardMatchResults (filename fileuri , exactMatchResults []model.Entry , context []model.Dictionary ) ([]model.Entry , error ) {
158
154
var jaccardEntryBuffer bytes.Buffer
159
155
if err := me .temJaccardEntry .Execute (& jaccardEntryBuffer , context [0 ].UnpackUnordered ()); err != nil {
160
156
return nil , err
@@ -163,19 +159,14 @@ func (me *MaskEngine) getJaccardMatchResults(filename string, exactMatchResults
163
159
164
160
// If no exactMatch config
165
161
if len (exactMatchResults ) < 1 {
166
- var csvList []model.Dictionary
167
- if _ , ok := me .csvAllreadyRead [filename ]; ! ok {
168
- var err error
169
- csvList , err = me .readCSV (filename )
170
- if err != nil {
171
- return nil , err
172
- }
173
- } else {
174
- csvList = me .csvAllreadyRead [filename ]
162
+ csvList , err := me .readCSV (filename )
163
+ if err != nil {
164
+ return nil , err
175
165
}
176
166
177
167
var records []JaccardCSV
178
- for _ , record := range csvList {
168
+ for i := 0 ; i < csvList .Len (); i ++ {
169
+ record := csvList .Get (i )
179
170
lineKey , err := me .computeCSVLineKey (record , false )
180
171
if err != nil {
181
172
return nil , err
@@ -198,46 +189,45 @@ func (me *MaskEngine) getJaccardMatchResults(filename string, exactMatchResults
198
189
return sortBySimilarity (jaccardEntryString , records ), nil
199
190
}
200
191
201
- func (me * MaskEngine ) ExactMatch (filename string , context []model.Dictionary ) (bool , []model.Entry , error ) {
192
+ func (me * MaskEngine ) exactMatch (filename fileuri , context []model.Dictionary ) (bool , []model.Entry , error ) {
202
193
if me .temExactMatchEntry != nil && me .temExactMatchCSV != nil {
203
- var csvList []model.Dictionary
204
- if _ , ok := me .csvAllreadyRead [filename ]; ! ok {
205
- var err error
206
- csvList , err = me .readCSV (filename )
207
- if err != nil {
208
- return false , nil , err
209
- }
194
+ csvList , err := me .readCSV (filename )
195
+ if err != nil {
196
+ return false , nil , err
210
197
}
211
198
212
199
var exactEntryBuffer bytes.Buffer
213
200
if err := me .temExactMatchEntry .Execute (& exactEntryBuffer , context [0 ].UnpackUnordered ()); err != nil {
214
201
return false , nil , err
215
202
}
216
- exactEntryString := exactEntryBuffer .String ()
217
- err : = me .getExactMatchCsvResult (filename , csvList )
203
+ exactEntryString := linekey ( exactEntryBuffer .String () )
204
+ err = me .getExactMatchCsvResult (filename , csvList )
218
205
if err != nil {
219
206
return false , []model.Entry {}, err
220
207
}
221
208
222
- results := me .csvEntryByKey [CSVKey {
223
- filename : filename ,
224
- lineKey : exactEntryString ,
225
- }]
226
- if len (results ) < 1 {
227
- return false , results , nil
228
- }
229
- return true , results , nil
209
+ results := me .readCsvEntryByKey (filename , exactEntryString )
210
+
211
+ return len (results ) > 0 , results , nil
230
212
}
231
213
return true , []model.Entry {}, nil
232
214
}
233
215
234
- func (me * MaskEngine ) readCSV (filename string ) ([]model.Dictionary , error ) {
235
- recordsFromFile , err := uri .ReadCsv (filename , me .sep , me .comment , me .fieldsPerRecord , me .trimSpaces )
216
+ func (me * MaskEngine ) readCsvEntryByKey (filename fileuri , exactEntryString linekey ) []model.Entry {
217
+ cache , cacheExists := me .csvEntryByKey [filename ]
218
+ if ! cacheExists {
219
+ panic ("csv file is not cached, please report the bug on GitHub CGI-FR" )
220
+ }
221
+
222
+ return cache [exactEntryString ]
223
+ }
224
+
225
+ func (me * MaskEngine ) readCSV (filename fileuri ) (uri.DictRecords , error ) {
226
+ recordsFromFile , err := uri .ReadCsvAsDicts (string (filename ), me .sep , me .comment , me .fieldsPerRecord , me .trimSpaces , me .header )
236
227
if err != nil {
237
228
return nil , err
238
229
}
239
- csvList := me .createEntriesFromCSVLines (recordsFromFile )
240
- return csvList , nil
230
+ return recordsFromFile , nil
241
231
}
242
232
243
233
func (me * MaskEngine ) computeCSVLineKey (record model.Dictionary , exactMatch bool ) (string , error ) {
@@ -258,60 +248,32 @@ func (me *MaskEngine) computeCSVLineKey(record model.Dictionary, exactMatch bool
258
248
return output .String (), nil
259
249
}
260
250
261
- func (me * MaskEngine ) getExactMatchCsvResult (filename string , csvList []model.Dictionary ) error {
262
- for _ , record := range csvList {
263
- lineKey , err := me .computeCSVLineKey (record , true )
264
- if err != nil {
265
- return err
266
- }
251
+ func (me * MaskEngine ) getExactMatchCsvResult (filename fileuri , csvList uri.DictRecords ) error {
252
+ _ , cacheExists := me .csvEntryByKey [filename ]
253
+ if ! cacheExists {
254
+ cache := map [linekey ][]model.Entry {}
267
255
268
- key := CSVKey {
269
- filename : filename ,
270
- lineKey : lineKey ,
271
- }
256
+ for i := 0 ; i < csvList .Len (); i ++ {
257
+ record := csvList .Get (i )
258
+ lineKey , err := me .computeCSVLineKey (record , true )
259
+ if err != nil {
260
+ return err
261
+ }
272
262
273
- if records , ok := me .csvEntryByKey [key ]; ok {
274
- records = append (records , record )
275
- me .csvEntryByKey [key ] = records
276
- } else {
277
- me .csvEntryByKey [key ] = []model.Entry {record }
263
+ if records , ok := cache [linekey (lineKey )]; ok {
264
+ records = append (records , record )
265
+ cache [linekey (lineKey )] = records
266
+ } else {
267
+ cache [linekey (lineKey )] = []model.Entry {record }
268
+ }
278
269
}
270
+
271
+ me .csvEntryByKey [filename ] = cache
279
272
}
280
273
281
274
return nil
282
275
}
283
276
284
- func (me * MaskEngine ) createEntriesFromCSVLines (records uri.CSVRecords ) []model.Dictionary {
285
- results := []model.Dictionary {}
286
-
287
- for i := 0 ; i < records .Len (); i ++ {
288
- record := records .Get (i )
289
- if me .header {
290
- obj := model .NewDictionary ()
291
- headers := records .Get (0 )
292
- for i , header := range headers {
293
- if me .trimSpaces {
294
- obj .Set (strings .TrimSpace (header ), strings .TrimSpace (record [i ]))
295
- } else {
296
- obj .Set (header , record [i ])
297
- }
298
- }
299
- results = append (results , obj )
300
- } else {
301
- obj := model .NewDictionary ()
302
- for i , value := range record {
303
- if me .trimSpaces {
304
- obj .Set (strconv .Itoa (i ), strings .TrimSpace (value ))
305
- } else {
306
- obj .Set (strconv .Itoa (i ), value )
307
- }
308
- }
309
- results = append (results , obj )
310
- }
311
- }
312
- return results
313
- }
314
-
315
277
// Get the number of results expected by the config; by default, treated as at-least-one
316
278
func (me * MaskEngine ) getExpectedResult (results []model.Entry ) (model.Entry , error ) {
317
279
resultCount := len (results )
@@ -337,7 +299,7 @@ func (me *MaskEngine) getExpectedResult(results []model.Entry) (model.Entry, err
337
299
}
338
300
339
301
// JaccardSimilarity calculates the Jaccard similarity between two strings.
340
- func JaccardSimilarity (s1 , s2 string ) float64 {
302
+ func jaccardSimilarity (s1 , s2 string ) float64 {
341
303
if s1 == s2 {
342
304
return 1.0
343
305
}
@@ -398,7 +360,7 @@ func sortBySimilarity(jaccardEntryString string, list []JaccardCSV) []model.Entr
398
360
var entriesWithSimilarity []EntryWithSimilarity
399
361
400
362
for _ , record := range list {
401
- similarity := JaccardSimilarity (jaccardEntryString , record .lineKey )
363
+ similarity := jaccardSimilarity (jaccardEntryString , record .lineKey )
402
364
entriesWithSimilarity = append (entriesWithSimilarity , EntryWithSimilarity {Key : record .lineKey , Entry : record .csvLine , Similarity : similarity })
403
365
}
404
366
0 commit comments