Skip to content

Commit 01ea0aa

Browse files
authored
fix(findincsv): memory optimization (#405)
* fix(findincsv): externalize csv as dict caching in uri pkg * fix(findincsv): cache csv by key * fix(findincsv): entries by key memory optim
1 parent e4a5271 commit 01ea0aa

File tree

4 files changed

+124
-97
lines changed

4 files changed

+124
-97
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ Types of changes
1414
- `Fixed` for any bug fixes.
1515
- `Security` in case of vulnerabilities.
1616

17+
## [1.30.2]
18+
19+
- `Fixed` mask `findInCSV` memory usage
20+
1721
## [1.30.1]
1822

1923
- `Fixed` mask `command` split command line on space protected by quote

pkg/findincsv/findincsv.go

Lines changed: 56 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ import (
66
"hash/fnv"
77
"html/template"
88
"sort"
9-
"strconv"
10-
"strings"
119
tmpl "text/template"
1210
"unicode/utf8"
1311

@@ -18,16 +16,16 @@ import (
1816
"github.com/rs/zerolog/log"
1917
)
2018

21-
type CSVKey struct {
22-
filename string
23-
lineKey string
24-
}
25-
2619
type JaccardCSV struct {
2720
csvLine model.Entry
2821
lineKey string
2922
}
3023

24+
type (
25+
fileuri string
26+
linekey string
27+
)
28+
3129
type MaskEngine struct {
3230
seeder model.Seeder
3331
templateURI *template.Template
@@ -36,8 +34,7 @@ type MaskEngine struct {
3634
temJaccardCSV *tmlmask.Engine // template to compute key for a csv entry
3735
temJaccardEntry *tmlmask.Engine // template to compute key for json entry
3836
expected string
39-
csvAllreadyRead map[string][]model.Dictionary
40-
csvEntryByKey map[CSVKey][]model.Entry
37+
csvEntryByKey map[fileuri]map[linekey][]model.Entry
4138
header bool
4239
sep rune
4340
comment rune
@@ -115,8 +112,7 @@ func NewMask(conf model.FindInCSVType, seed int64, seeder model.Seeder) (MaskEng
115112
temJaccardCSV,
116113
temJaccardEntry,
117114
expected,
118-
map[string][]model.Dictionary{},
119-
map[CSVKey][]model.Entry{},
115+
map[fileuri]map[linekey][]model.Entry{},
120116
conf.Header,
121117
sep, comment, conf.FieldsPerRecord, conf.TrimSpace,
122118
}, err
@@ -133,10 +129,10 @@ func (me *MaskEngine) Mask(e model.Entry, context ...model.Dictionary) (model.En
133129
if err := me.templateURI.Execute(&filenameBuffer, context[0].UnpackUnordered()); err != nil {
134130
return nil, err
135131
}
136-
filename := filenameBuffer.String()
132+
filename := fileuri(filenameBuffer.String())
137133

138134
// Get ExactMatch results
139-
exactMatchFinded, exactMatchResult, err := me.ExactMatch(filename, context)
135+
exactMatchFinded, exactMatchResult, err := me.exactMatch(filename, context)
140136
if err != nil {
141137
return nil, err
142138
}
@@ -154,7 +150,7 @@ func (me *MaskEngine) Mask(e model.Entry, context ...model.Dictionary) (model.En
154150
}
155151

156152
// getJaccardMatchResults calculates Jaccard similarity for the given CSV filename and exact match results.
157-
func (me *MaskEngine) getJaccardMatchResults(filename string, exactMatchResults []model.Entry, context []model.Dictionary) ([]model.Entry, error) {
153+
func (me *MaskEngine) getJaccardMatchResults(filename fileuri, exactMatchResults []model.Entry, context []model.Dictionary) ([]model.Entry, error) {
158154
var jaccardEntryBuffer bytes.Buffer
159155
if err := me.temJaccardEntry.Execute(&jaccardEntryBuffer, context[0].UnpackUnordered()); err != nil {
160156
return nil, err
@@ -163,19 +159,14 @@ func (me *MaskEngine) getJaccardMatchResults(filename string, exactMatchResults
163159

164160
// If no exactMatch config
165161
if len(exactMatchResults) < 1 {
166-
var csvList []model.Dictionary
167-
if _, ok := me.csvAllreadyRead[filename]; !ok {
168-
var err error
169-
csvList, err = me.readCSV(filename)
170-
if err != nil {
171-
return nil, err
172-
}
173-
} else {
174-
csvList = me.csvAllreadyRead[filename]
162+
csvList, err := me.readCSV(filename)
163+
if err != nil {
164+
return nil, err
175165
}
176166

177167
var records []JaccardCSV
178-
for _, record := range csvList {
168+
for i := 0; i < csvList.Len(); i++ {
169+
record := csvList.Get(i)
179170
lineKey, err := me.computeCSVLineKey(record, false)
180171
if err != nil {
181172
return nil, err
@@ -198,46 +189,45 @@ func (me *MaskEngine) getJaccardMatchResults(filename string, exactMatchResults
198189
return sortBySimilarity(jaccardEntryString, records), nil
199190
}
200191

201-
func (me *MaskEngine) ExactMatch(filename string, context []model.Dictionary) (bool, []model.Entry, error) {
192+
func (me *MaskEngine) exactMatch(filename fileuri, context []model.Dictionary) (bool, []model.Entry, error) {
202193
if me.temExactMatchEntry != nil && me.temExactMatchCSV != nil {
203-
var csvList []model.Dictionary
204-
if _, ok := me.csvAllreadyRead[filename]; !ok {
205-
var err error
206-
csvList, err = me.readCSV(filename)
207-
if err != nil {
208-
return false, nil, err
209-
}
194+
csvList, err := me.readCSV(filename)
195+
if err != nil {
196+
return false, nil, err
210197
}
211198

212199
var exactEntryBuffer bytes.Buffer
213200
if err := me.temExactMatchEntry.Execute(&exactEntryBuffer, context[0].UnpackUnordered()); err != nil {
214201
return false, nil, err
215202
}
216-
exactEntryString := exactEntryBuffer.String()
217-
err := me.getExactMatchCsvResult(filename, csvList)
203+
exactEntryString := linekey(exactEntryBuffer.String())
204+
err = me.getExactMatchCsvResult(filename, csvList)
218205
if err != nil {
219206
return false, []model.Entry{}, err
220207
}
221208

222-
results := me.csvEntryByKey[CSVKey{
223-
filename: filename,
224-
lineKey: exactEntryString,
225-
}]
226-
if len(results) < 1 {
227-
return false, results, nil
228-
}
229-
return true, results, nil
209+
results := me.readCsvEntryByKey(filename, exactEntryString)
210+
211+
return len(results) > 0, results, nil
230212
}
231213
return true, []model.Entry{}, nil
232214
}
233215

234-
func (me *MaskEngine) readCSV(filename string) ([]model.Dictionary, error) {
235-
recordsFromFile, err := uri.ReadCsv(filename, me.sep, me.comment, me.fieldsPerRecord, me.trimSpaces)
216+
func (me *MaskEngine) readCsvEntryByKey(filename fileuri, exactEntryString linekey) []model.Entry {
217+
cache, cacheExists := me.csvEntryByKey[filename]
218+
if !cacheExists {
219+
panic("csv file is not cached, please report the bug on GitHub CGI-FR")
220+
}
221+
222+
return cache[exactEntryString]
223+
}
224+
225+
func (me *MaskEngine) readCSV(filename fileuri) (uri.DictRecords, error) {
226+
recordsFromFile, err := uri.ReadCsvAsDicts(string(filename), me.sep, me.comment, me.fieldsPerRecord, me.trimSpaces, me.header)
236227
if err != nil {
237228
return nil, err
238229
}
239-
csvList := me.createEntriesFromCSVLines(recordsFromFile)
240-
return csvList, nil
230+
return recordsFromFile, nil
241231
}
242232

243233
func (me *MaskEngine) computeCSVLineKey(record model.Dictionary, exactMatch bool) (string, error) {
@@ -258,60 +248,32 @@ func (me *MaskEngine) computeCSVLineKey(record model.Dictionary, exactMatch bool
258248
return output.String(), nil
259249
}
260250

261-
func (me *MaskEngine) getExactMatchCsvResult(filename string, csvList []model.Dictionary) error {
262-
for _, record := range csvList {
263-
lineKey, err := me.computeCSVLineKey(record, true)
264-
if err != nil {
265-
return err
266-
}
251+
func (me *MaskEngine) getExactMatchCsvResult(filename fileuri, csvList uri.DictRecords) error {
252+
_, cacheExists := me.csvEntryByKey[filename]
253+
if !cacheExists {
254+
cache := map[linekey][]model.Entry{}
267255

268-
key := CSVKey{
269-
filename: filename,
270-
lineKey: lineKey,
271-
}
256+
for i := 0; i < csvList.Len(); i++ {
257+
record := csvList.Get(i)
258+
lineKey, err := me.computeCSVLineKey(record, true)
259+
if err != nil {
260+
return err
261+
}
272262

273-
if records, ok := me.csvEntryByKey[key]; ok {
274-
records = append(records, record)
275-
me.csvEntryByKey[key] = records
276-
} else {
277-
me.csvEntryByKey[key] = []model.Entry{record}
263+
if records, ok := cache[linekey(lineKey)]; ok {
264+
records = append(records, record)
265+
cache[linekey(lineKey)] = records
266+
} else {
267+
cache[linekey(lineKey)] = []model.Entry{record}
268+
}
278269
}
270+
271+
me.csvEntryByKey[filename] = cache
279272
}
280273

281274
return nil
282275
}
283276

284-
func (me *MaskEngine) createEntriesFromCSVLines(records uri.CSVRecords) []model.Dictionary {
285-
results := []model.Dictionary{}
286-
287-
for i := 0; i < records.Len(); i++ {
288-
record := records.Get(i)
289-
if me.header {
290-
obj := model.NewDictionary()
291-
headers := records.Get(0)
292-
for i, header := range headers {
293-
if me.trimSpaces {
294-
obj.Set(strings.TrimSpace(header), strings.TrimSpace(record[i]))
295-
} else {
296-
obj.Set(header, record[i])
297-
}
298-
}
299-
results = append(results, obj)
300-
} else {
301-
obj := model.NewDictionary()
302-
for i, value := range record {
303-
if me.trimSpaces {
304-
obj.Set(strconv.Itoa(i), strings.TrimSpace(value))
305-
} else {
306-
obj.Set(strconv.Itoa(i), value)
307-
}
308-
}
309-
results = append(results, obj)
310-
}
311-
}
312-
return results
313-
}
314-
315277
// Get numbers of result waited in expected config, by default return as at-least-one
316278
func (me *MaskEngine) getExpectedResult(results []model.Entry) (model.Entry, error) {
317279
resultCount := len(results)
@@ -337,7 +299,7 @@ func (me *MaskEngine) getExpectedResult(results []model.Entry) (model.Entry, err
337299
}
338300

339301
// JaccardSimilarity calculates the Jaccard similarity between two strings.
340-
func JaccardSimilarity(s1, s2 string) float64 {
302+
func jaccardSimilarity(s1, s2 string) float64 {
341303
if s1 == s2 {
342304
return 1.0
343305
}
@@ -398,7 +360,7 @@ func sortBySimilarity(jaccardEntryString string, list []JaccardCSV) []model.Entr
398360
var entriesWithSimilarity []EntryWithSimilarity
399361

400362
for _, record := range list {
401-
similarity := JaccardSimilarity(jaccardEntryString, record.lineKey)
363+
similarity := jaccardSimilarity(jaccardEntryString, record.lineKey)
402364
entriesWithSimilarity = append(entriesWithSimilarity, EntryWithSimilarity{Key: record.lineKey, Entry: record.csvLine, Similarity: similarity})
403365
}
404366

pkg/findincsv/findincsv_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ func TestJaccardSimilarityShouldReturnWaitedNumber(t *testing.T) {
465465
entry := "1 Boulevard Albert Einstain"
466466
csv := "1 Boulevard Albert Einstein"
467467

468-
waited := JaccardSimilarity(entry, csv)
468+
waited := jaccardSimilarity(entry, csv)
469469
expect := 0.8518518518518519
470470
assert.Equal(t, expect, waited)
471471
}

pkg/uri/uri.go

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"net/url"
2727
"os"
2828
"strconv"
29+
"strings"
2930

3031
"github.com/cgi-fr/pimo/pkg/maskingdata"
3132
"github.com/cgi-fr/pimo/pkg/model"
@@ -42,6 +43,7 @@ type Records[T model.Entry | []string] interface {
4243
type (
4344
CSVRecords Records[[]string]
4445
EntryRecords Records[model.Entry]
46+
DictRecords Records[model.Dictionary]
4547
)
4648

4749
type records[T model.Entry | []string] struct {
@@ -63,10 +65,69 @@ func (r records[T]) Collect() []T {
6365
}
6466

6567
var (
66-
cacheCSV map[string]records[[]string] = map[string]records[[]string]{}
67-
cacheEntry map[string]records[model.Entry] = map[string]records[model.Entry]{}
68+
cacheCSV map[string]records[[]string] = map[string]records[[]string]{}
69+
cacheEntry map[string]records[model.Entry] = map[string]records[model.Entry]{}
70+
cacheDict map[string]records[model.Dictionary] = map[string]records[model.Dictionary]{}
6871
)
6972

73+
func ReadCsvAsDicts(uri string, sep rune, comment rune, fieldsPerRecord int, trimLeadingSpaces bool, hasHeaders bool) (DictRecords, error) {
74+
if records, present := cacheDict[uri]; present {
75+
return records, nil
76+
}
77+
78+
csvRecords, err := ReadCsv(uri, sep, comment, fieldsPerRecord, trimLeadingSpaces)
79+
if err != nil {
80+
return nil, err
81+
}
82+
83+
headers := []string{}
84+
if hasHeaders {
85+
headerRecord := csvRecords.Get(0)
86+
for _, header := range headerRecord {
87+
if trimLeadingSpaces {
88+
headers = append(headers, strings.TrimSpace(header))
89+
} else {
90+
headers = append(headers, header)
91+
}
92+
}
93+
}
94+
95+
i := 0
96+
if hasHeaders {
97+
i = 1
98+
}
99+
100+
results := make([]model.Dictionary, csvRecords.Len())
101+
for ; i < csvRecords.Len(); i++ {
102+
record := csvRecords.Get(i)
103+
obj := model.NewDictionary()
104+
if hasHeaders {
105+
for i, header := range headers {
106+
if trimLeadingSpaces {
107+
obj.Set(header, strings.TrimSpace(record[i]))
108+
} else {
109+
obj.Set(header, record[i])
110+
}
111+
}
112+
} else {
113+
for i, value := range record {
114+
if trimLeadingSpaces {
115+
obj.Set(strconv.Itoa(i), strings.TrimSpace(value))
116+
} else {
117+
obj.Set(strconv.Itoa(i), value)
118+
}
119+
}
120+
}
121+
results[i] = obj
122+
}
123+
124+
if hasHeaders {
125+
results = results[1:]
126+
}
127+
128+
return records[model.Dictionary]{results}, nil
129+
}
130+
70131
func ReadCsv(uri string, sep rune, comment rune, fieldsPerRecord int, trimLeadingSpaces bool) (CSVRecords, error) {
71132
if records, present := cacheCSV[uri]; present {
72133
return records, nil

0 commit comments

Comments
 (0)