Skip to content

Commit a2863b9

Browse files
importers: clean tags before saving (#12811)
* sysdig parsers: stop using spaces in tags * add clean_tags method * sysdig: clean tags * add migration to clean invalid characters * api edgescan: use unsaved_tags * edgescan test case fix * force parsers to use unsaved_tags * fix tag=None cleaning and validation * fix []!=None * restore reimport tag behaviour * finetune * rename upgrade notes
1 parent 0948ad3 commit a2863b9

File tree

11 files changed

+195
-20
lines changed

11 files changed

+195
-20
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
title: 'Upgrading to DefectDojo Version 2.48.2'
3+
toc_hide: true
4+
weight: -20250602
5+
description: Tag invalid character cleanup
6+
---
7+
8+
## Tag Formatting Update
9+
In [2.46.0](../2.46.md) tag validation was added to disallow commas, spaces and quotes in tags. Some parsers were still creating tags with invalid characters. This is fixed in this release, and this release will run another data migration that cleans existing tags: commas become hyphens '`-`', spaces become underscores '`_`', and single/double quotes are removed.

dojo/db_migrations/0235_clean_tags.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Generated by Django 5.0.8 on 2024-09-12 18:22
2+
3+
import logging
4+
from django.db import migrations
5+
from django.db.models import Q
6+
7+
logger = logging.getLogger(__name__)
8+
9+
# Only apply the process to models that _could_ have tags
10+
model_names = [
11+
"Product",
12+
"Endpoint",
13+
"Engagement",
14+
"Test",
15+
"Finding",
16+
"Finding_Template",
17+
"App_Analysis",
18+
"Objects_Product",
19+
]
20+
21+
22+
def clean_tag_value(tag: str) -> str:
    """
    Return *tag* with the characters disallowed by tag validation normalized.

    Translation applied in a single pass:
    - comma  -> hyphen
    - space  -> underscore
    - single and double quotes -> removed entirely
    """
    # str.translate performs all substitutions in one C-level pass; mapping a
    # character to None deletes it.
    translation = str.maketrans({",": "-", " ": "_", '"': None, "'": None})
    return tag.translate(translation)
30+
31+
32+
def clean_all_tag_fields(apps, schema_editor):
    """
    Cleans tag values for all models in the `model_names` list, removing unwanted characters.
    Updates both 'tags' and 'inherited_tags' fields where applicable.

    NOTE(review): only the `tags` relation is rewritten below; `inherited_tags`
    is not touched here despite the mention above — confirm whether that is
    handled elsewhere (e.g. recomputed from product tags).
    """
    # Per-model summary: {model name: (instances cleaned, {old tag: new tag})}
    updated_count = {}
    for model_name in model_names:
        # Use the historical model from the migration state, never a direct import
        TaggedModel = apps.get_model("dojo", model_name)
        unique_tags_per_model = {}
        count_per_model = 0
        # Only fetch the objects with tags that contain a character in violation
        queryset = (
            TaggedModel.objects.filter(
                Q(**{"tags__name__icontains": ","})
                | Q(**{"tags__name__icontains": " "})
                | Q(**{"tags__name__icontains": '"'})
                | Q(**{"tags__name__icontains": "'"})
            )
            .distinct()
            .prefetch_related("tags")
        )
        # Iterate over each instance to clean the tags. The iterator is used here
        # to prevent loading the entire queryset into memory at once. Instead, we
        # will only process 500 objects at a time
        for instance in queryset.iterator(chunk_size=500):
            # Get the current list of tags to work with
            raw_tags = instance.tags.all()
            # Clean each tag here while preserving the original value
            cleaned_tags = {tag.name: clean_tag_value(tag.name) for tag in raw_tags}
            # Quick check to avoid writing things without impact
            if cleaned_tags:
                # clear=True replaces the whole tag set with the cleaned values;
                # duplicates produced by cleaning collapse naturally on set()
                instance.tags.set(list(cleaned_tags.values()), clear=True)
                count_per_model += 1
                # Update the running list of cleaned tags with the changes on this model
                unique_tags_per_model.update(cleaned_tags)
            # Add a quick logging statement every 100 objects cleaned
            if count_per_model > 0 and count_per_model % 100 == 0:
                logger.info(
                    f"{TaggedModel.__name__}.tags: cleaned {count_per_model} tags..."
                )
        # Update the final count of the tags cleaned for the given model
        if count_per_model:
            updated_count[f"{TaggedModel.__name__}"] = (
                count_per_model,
                unique_tags_per_model,
            )
    """
    Write a helpful statement about what tags were changed for each model in the list.
    It looks something like this:

    Product: 1 instances cleaned
        "quoted string with spaces" -> quoted_string_with_spaces
        "quoted with spaces, and also commas!" -> quoted_with_spaces-_and_also_commas!
        "quoted,comma,tag" -> quoted-comma-tag
    Engagement: 1 instances cleaned
        "quoted string with spaces" -> quoted_string_with_spaces
        "quoted with spaces, and also commas!" -> quoted_with_spaces-_and_also_commas!
        "quoted,comma,tag" -> quoted-comma-tag
    Test: 1 instances cleaned
        "quoted string with spaces" -> quoted_string_with_spaces
        "quoted with spaces, and also commas!" -> quoted_with_spaces-_and_also_commas!
        "quoted,comma,tag" -> quoted-comma-tag
    Finding: 1 instances cleaned
        "quoted string with spaces" -> quoted_string_with_spaces
        "quoted with spaces, and also commas!" -> quoted_with_spaces-_and_also_commas!
        "quoted,comma,tag" -> quoted-comma-tag
    """
    # Only log entries whose value actually changed, so the summary stays short
    for key, (count, tags) in updated_count.items():
        logger.info(f"{key}: {count} instances cleaned")
        for old, new in tags.items():
            if old != new:
                logger.info(f" {old} -> {new}")
104+
105+
106+
def cannot_turn_back_time(apps, schema_editor):
    """
    Reverse migration: intentionally a no-op.

    The pre-clean tag values are not recorded anywhere, so there is nothing
    to restore when this migration is reversed.
    """
113+
114+
115+
class Migration(migrations.Migration):
    # Runs after the latest schema migration so the tag cleanup operates on
    # the up-to-date table layout.
    dependencies = [
        ('dojo', '0234_alter_system_settings_maximum_password_length_and_more'),
    ]

    operations = [
        # Data migration: forward pass rewrites invalid tag characters;
        # the reverse pass is a deliberate no-op (originals are not kept).
        migrations.RunPython(clean_all_tag_fields, cannot_turn_back_time),
    ]
123+

dojo/importers/default_importer.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
Test_Import,
1717
)
1818
from dojo.notifications.helper import create_notification
19+
from dojo.validators import clean_tags
1920

2021
logger = logging.getLogger(__name__)
2122
deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
@@ -194,6 +195,9 @@ def process_findings(
194195
unsaved_finding.date = self.scan_date.date()
195196
if self.service is not None:
196197
unsaved_finding.service = self.service
198+
199+
# Force parsers to use unsaved_tags (stored in below after saving)
200+
unsaved_finding.tags = None
197201
unsaved_finding.save(dedupe_option=False)
198202
finding = unsaved_finding
199203
# Determine how the finding should be grouped
@@ -205,9 +209,8 @@ def process_findings(
205209
self.process_request_response_pairs(finding)
206210
# Process any endpoints on the endpoint, or added on the form
207211
self.process_endpoints(finding, self.endpoints_to_add)
208-
# Process any tags
209-
if finding.unsaved_tags:
210-
finding.tags = finding.unsaved_tags
212+
# Parsers must use unsaved_tags to store tags, so we can clean them
213+
finding.tags = clean_tags(finding.unsaved_tags)
211214
# Process any files
212215
self.process_files(finding)
213216
# Process vulnerability IDs

dojo/importers/default_reimporter.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Test,
1616
Test_Import,
1717
)
18+
from dojo.validators import clean_tags
1819

1920
logger = logging.getLogger(__name__)
2021
deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
@@ -596,6 +597,8 @@ def process_finding_that_was_not_matched(
596597
# Save it. Don't dedupe before endpoints are added.
597598
unsaved_finding.save(dedupe_option=False)
598599
finding = unsaved_finding
600+
# Force parsers to use unsaved_tags (stored in finding_post_processing function below)
601+
finding.tags = None
599602
logger.debug(
600603
"Reimport created new finding as no existing finding match: "
601604
f"{finding.id}: {finding.title} "
@@ -624,9 +627,9 @@ def finding_post_processing(
624627
self.endpoint_manager.chunk_endpoints_and_disperse(finding, finding_from_report.unsaved_endpoints)
625628
if len(self.endpoints_to_add) > 0:
626629
self.endpoint_manager.chunk_endpoints_and_disperse(finding, self.endpoints_to_add)
627-
# Update finding tags
628-
if finding_from_report.unsaved_tags:
629-
finding.tags = finding_from_report.unsaved_tags
630+
# Parsers must use unsaved_tags to store tags, so we can clean them
631+
if finding.unsaved_tags:
632+
finding.tags = clean_tags(finding.unsaved_tags)
630633
# Process any files
631634
if finding_from_report.unsaved_files:
632635
finding.unsaved_files = finding_from_report.unsaved_files

dojo/tools/api_edgescan/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def make_finding(self, test, vulnerability):
6262
finding.mitigation = vulnerability["remediation"]
6363
finding.active = vulnerability["status"] == "open"
6464
if vulnerability["asset_tags"]:
65-
finding.tags = vulnerability["asset_tags"].split(",")
65+
finding.unsaved_tags = vulnerability["asset_tags"].split(",")
6666
finding.unique_id_from_tool = vulnerability["id"]
6767

6868
finding.unsaved_endpoints = [

dojo/tools/sysdig_cli/parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from dojo.models import Finding
99
from dojo.tools.sysdig_common.sysdig_data import SysdigData
10+
from dojo.validators import clean_tags
1011

1112

1213
class SysdigCLIParser:
@@ -136,7 +137,7 @@ def parse_csv(self, arr_data, test):
136137
# Set some finding tags
137138
tags = []
138139
if row.vulnerability_id != "":
139-
tags.append("VulnId: " + row.vulnerability_id)
140+
tags.append(clean_tags("VulnId:" + row.vulnerability_id))
140141
finding.tags = tags
141142
finding.dynamic_finding = False
142143
finding.static_finding = True

dojo/tools/sysdig_reports/parser.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from dojo.models import Finding
99
from dojo.tools.sysdig_common.sysdig_data import SysdigData
10+
from dojo.validators import clean_tags
1011

1112

1213
class SysdigReportsParser:
@@ -154,20 +155,20 @@ def parse_csv(self, arr_data, test):
154155
# Set some finding tags
155156
tags = []
156157
if row.k8s_cluster_name != "":
157-
tags.append("Cluster: " + row.k8s_cluster_name)
158+
tags.append(clean_tags("Cluster:" + row.k8s_cluster_name))
158159
if row.k8s_namespace_name != "":
159-
tags.append("Namespace: " + row.k8s_namespace_name)
160+
tags.append(clean_tags("Namespace:" + row.k8s_namespace_name))
160161
if row.k8s_workload_name != "":
161-
tags.append("WorkloadName: " + row.k8s_workload_name)
162+
tags.append(clean_tags("WorkloadName:" + row.k8s_workload_name))
162163
if row.package_name != "":
163-
tags.append("PackageName: " + row.package_name)
164+
tags.append(clean_tags("PackageName:" + row.package_name))
164165
if row.package_version != "":
165-
tags.append("PackageVersion: " + row.package_version)
166+
tags.append(clean_tags("PackageVersion:" + row.package_version))
166167
if row.k8s_cluster_name != "":
167-
tags.append("InUse: " + str(row.in_use))
168+
tags.append(clean_tags("InUse:" + str(row.in_use)))
168169
if row.vulnerability_id != "":
169-
tags.append("VulnId: " + row.vulnerability_id)
170-
finding.tags = tags
170+
tags.append(clean_tags("VulnId:" + row.vulnerability_id))
171+
finding.unsaved_tags = tags
171172
if row.k8s_cluster_name != "":
172173
finding.dynamic_finding = True
173174
finding.static_finding = False

dojo/validators.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,15 @@
88

99
logger = logging.getLogger(__name__)
1010

11+
TAG_PATTERN = re.compile(r'[ ,\'"]') # Matches spaces, commas, single quotes, double quotes
12+
1113

1214
def tag_validator(value: str | list[str], exception_class: Callable = ValidationError) -> None:
13-
TAG_PATTERN = re.compile(r'[ ,\'"]')
1415
error_messages = []
1516

17+
if not value:
18+
return
19+
1620
if isinstance(value, list):
1721
error_messages.extend(f"Invalid tag: '{tag}'. Tags should not contain spaces, commas, or quotes." for tag in value if TAG_PATTERN.search(tag))
1822
elif isinstance(value, str):
@@ -26,6 +30,23 @@ def tag_validator(value: str | list[str], exception_class: Callable = Validation
2630
raise exception_class(error_messages)
2731

2832

33+
def clean_tags(value: str | list[str], exception_class: Callable = ValidationError) -> str | list[str]:
    """
    Normalize tag values by substituting every disallowed character
    (spaces, commas, single/double quotes) with an underscore.

    Accepts a single tag string or a list of tag strings. Falsy input
    (None, "", []) is returned untouched. Any other type raises
    *exception_class*.
    """
    if not value:
        # Nothing to clean: pass None / "" / [] straight through
        return value

    if isinstance(value, str):
        # Replace ALL occurrences of problematic characters in the tag
        return TAG_PATTERN.sub("_", value)

    if isinstance(value, list):
        # Replace ALL occurrences of problematic characters in each tag
        return [TAG_PATTERN.sub("_", item) for item in value]

    msg = f"Value must be a string or list of strings: {value} - {type(value)}."
    raise exception_class(msg)
48+
49+
2950
def cvss3_validator(value: str | list[str], exception_class: Callable = ValidationError) -> None:
3051
logger.error("cvss3_validator called with value: %s", value)
3152
cvss_vectors = cvss.parser.parse_cvss_from_text(value)

unittests/test_import_reimport.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,6 +1902,10 @@ def import_scan_ui(self, engagement, payload):
19021902
def reimport_scan_ui(self, test, payload):
19031903
response = self.client_ui.post(reverse("re_import_scan_results", args=(test, )), payload)
19041904
self.assertEqual(302, response.status_code, response.content[:1000])
1905+
# If the response URL contains 're_import_scan_results', it means the import failed
1906+
if "re_import_scan_results" in response.url:
1907+
return {"test": test} # Return the original test ID
1908+
# Otherwise, extract the new test ID from the successful redirect URL
19051909
test = Test.objects.get(id=response.url.split("/")[-1])
19061910
return {"test": test.id}
19071911

unittests/tools/test_api_edgescan_parser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def test_parse_file_with_one_vuln_has_one_findings(self):
5353
self.assertEqual(finding.description, "Description Text")
5454
self.assertEqual(finding.mitigation, "Remediation Text")
5555
self.assertEqual(finding.active, True)
56-
self.assertEqual(finding.tags, ["APPROVED", "Demo-Asset", "ABC Corporate", "test"])
56+
self.assertEqual(finding.unsaved_tags, ["APPROVED", "Demo-Asset", "ABC Corporate", "test"])
5757
self.assertEqual(finding.unique_id_from_tool, 21581)
5858
self.assertEqual(1, len(finding.unsaved_endpoints))
5959
self.assertEqual(finding.unsaved_endpoints[0].host, "192.168.1.1")
@@ -77,7 +77,7 @@ def test_parse_file_with_multiple_vuln_has_multiple_finding(self):
7777
self.assertEqual(finding_1.description, "Description Text")
7878
self.assertEqual(finding_1.mitigation, "Remediation Text")
7979
self.assertEqual(finding_1.active, True)
80-
self.assertEqual(finding_1.tags, ["APPROVED", "Demo-Asset"])
80+
self.assertEqual(finding_1.unsaved_tags, ["APPROVED", "Demo-Asset"])
8181
self.assertEqual(finding_1.unique_id_from_tool, 21581)
8282
self.assertEqual(1, len(finding_1.unsaved_endpoints))
8383
self.assertEqual(finding_1.unsaved_endpoints[0].host, "test.example.com")
@@ -93,7 +93,7 @@ def test_parse_file_with_multiple_vuln_has_multiple_finding(self):
9393
self.assertEqual(finding_2.description, "Description Text 2")
9494
self.assertEqual(finding_2.mitigation, "Remediation Text 2")
9595
self.assertEqual(finding_2.active, False)
96-
self.assertEqual(finding_2.tags, [])
96+
self.assertEqual(finding_2.unsaved_tags, None)
9797
self.assertEqual(finding_2.unique_id_from_tool, 21583)
9898
self.assertEqual(1, len(finding_2.unsaved_endpoints))
9999
self.assertEqual(finding_2.unsaved_endpoints[0].host, "example.test.com")

0 commit comments

Comments
 (0)