Skip to content

Commit a2863b9

Browse files
importers: clean tags before saving (#12811)
* sysdig parsers: stop using spaces in tags * add clean_tags method * sysdig: clean tags * add migration to clean invalid characters * api edgescan: use unsaved_tags * edgescan test case fix * force parsers to use unsaved_tags * fix tag=None cleaning and validation * fix []!=None * restore reimport tag behaviour * finetune * rename upgrade notes
1 parent 0948ad3 commit a2863b9

File tree

11 files changed

+195
-20
lines changed

11 files changed

+195
-20
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
title: 'Upgrading to DefectDojo Version 2.48.2'
3+
toc_hide: true
4+
weight: -20250602
5+
description: Tag invalid character cleanup
6+
---
7+
8+
## Tag Formatting Update
9+
In [2.46.0](../2.46.md) tag validation was added to disallow commas, spaces and quotes in tags. Some parsers were still creating tags with invalid characters. This is fixed in this release, and this release will run another data migration that cleans existing tags: commas become hyphens '`-`', spaces become underscores '`_`', and single/double quotes are removed.

dojo/db_migrations/0235_clean_tags.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Generated by Django 5.0.8 on 2024-09-12 18:22
2+
3+
import logging
4+
from django.db import migrations
5+
from django.db.models import Q
6+
7+
logger = logging.getLogger(__name__)
8+
9+
# Only apply the process to models that _could_ have tags
10+
model_names = [
11+
"Product",
12+
"Endpoint",
13+
"Engagement",
14+
"Test",
15+
"Finding",
16+
"Finding_Template",
17+
"App_Analysis",
18+
"Objects_Product",
19+
]
20+
21+
22+
def clean_tag_value(tag: str) -> str:
    """
    Return *tag* with the characters disallowed by tag validation normalized.

    Translation applied in a single pass:
    - comma  -> hyphen
    - space  -> underscore
    - single and double quotes -> removed entirely
    """
    # str.translate performs all substitutions in one C-level pass; mapping a
    # character to None deletes it.
    translation = str.maketrans({",": "-", " ": "_", '"': None, "'": None})
    return tag.translate(translation)
30+
31+
32+
def clean_all_tag_fields(apps, schema_editor):
    """
    Cleans tag values for all models in the `model_names` list, removing unwanted characters.
    Updates both 'tags' and 'inherited_tags' fields where applicable.

    NOTE(review): only the `tags` relation is rewritten below; `inherited_tags`
    is not touched here despite the mention above — confirm whether that is
    handled elsewhere (e.g. recomputed from product tags).
    """
    # Per-model summary: {model name: (instances cleaned, {old tag: new tag})}
    updated_count = {}
    for model_name in model_names:
        # Use the historical model from the migration state, never a direct import
        TaggedModel = apps.get_model("dojo", model_name)
        unique_tags_per_model = {}
        count_per_model = 0
        # Only fetch the objects with tags that contain a character in violation
        queryset = (
            TaggedModel.objects.filter(
                Q(**{"tags__name__icontains": ","})
                | Q(**{"tags__name__icontains": " "})
                | Q(**{"tags__name__icontains": '"'})
                | Q(**{"tags__name__icontains": "'"})
            )
            .distinct()
            .prefetch_related("tags")
        )
        # Iterate over each instance to clean the tags. The iterator is used here
        # to prevent loading the entire queryset into memory at once. Instead, we
        # will only process 500 objects at a time
        for instance in queryset.iterator(chunk_size=500):
            # Get the current list of tags to work with
            raw_tags = instance.tags.all()
            # Clean each tag here while preserving the original value
            cleaned_tags = {tag.name: clean_tag_value(tag.name) for tag in raw_tags}
            # Quick check to avoid writing things without impact
            if cleaned_tags:
                # clear=True replaces the whole tag set with the cleaned values;
                # duplicates produced by cleaning collapse naturally on set()
                instance.tags.set(list(cleaned_tags.values()), clear=True)
                count_per_model += 1
                # Update the running list of cleaned tags with the changes on this model
                unique_tags_per_model.update(cleaned_tags)
            # Add a quick logging statement every 100 objects cleaned
            if count_per_model > 0 and count_per_model % 100 == 0:
                logger.info(
                    f"{TaggedModel.__name__}.tags: cleaned {count_per_model} tags..."
                )
        # Update the final count of the tags cleaned for the given model
        if count_per_model:
            updated_count[f"{TaggedModel.__name__}"] = (
                count_per_model,
                unique_tags_per_model,
            )
    """
    Write a helpful statement about what tags were changed for each model in the list.
    It looks something like this:

    Product: 1 instances cleaned
        "quoted string with spaces" -> quoted_string_with_spaces
        "quoted with spaces, and also commas!" -> quoted_with_spaces-_and_also_commas!
        "quoted,comma,tag" -> quoted-comma-tag
    Engagement: 1 instances cleaned
        "quoted string with spaces" -> quoted_string_with_spaces
        "quoted with spaces, and also commas!" -> quoted_with_spaces-_and_also_commas!
        "quoted,comma,tag" -> quoted-comma-tag
    Test: 1 instances cleaned
        "quoted string with spaces" -> quoted_string_with_spaces
        "quoted with spaces, and also commas!" -> quoted_with_spaces-_and_also_commas!
        "quoted,comma,tag" -> quoted-comma-tag
    Finding: 1 instances cleaned
        "quoted string with spaces" -> quoted_string_with_spaces
        "quoted with spaces, and also commas!" -> quoted_with_spaces-_and_also_commas!
        "quoted,comma,tag" -> quoted-comma-tag
    """
    # Only log entries whose value actually changed, so the summary stays short
    for key, (count, tags) in updated_count.items():
        logger.info(f"{key}: {count} instances cleaned")
        for old, new in tags.items():
            if old != new:
                logger.info(f" {old} -> {new}")
104+
105+
106+
def cannot_turn_back_time(apps, schema_editor):
    """
    Reverse migration: intentionally a no-op.

    The pre-clean tag values are not recorded anywhere, so there is nothing
    to restore when this migration is reversed.
    """
113+
114+
115+
class Migration(migrations.Migration):
    # Runs after the latest schema migration so the tag cleanup operates on
    # the up-to-date table layout.
    dependencies = [
        ('dojo', '0234_alter_system_settings_maximum_password_length_and_more'),
    ]

    operations = [
        # Data migration: forward pass rewrites invalid tag characters;
        # the reverse pass is a deliberate no-op (originals are not kept).
        migrations.RunPython(clean_all_tag_fields, cannot_turn_back_time),
    ]
123+

dojo/importers/default_importer.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
Test_Import,
1717
)
1818
from dojo.notifications.helper import create_notification
19+
from dojo.validators import clean_tags
1920

2021
logger = logging.getLogger(__name__)
2122
deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
@@ -194,6 +195,9 @@ def process_findings(
194195
unsaved_finding.date = self.scan_date.date()
195196
if self.service is not None:
196197
unsaved_finding.service = self.service
198+
199+
# Force parsers to use unsaved_tags (stored in below after saving)
200+
unsaved_finding.tags = None
197201
unsaved_finding.save(dedupe_option=False)
198202
finding = unsaved_finding
199203
# Determine how the finding should be grouped
@@ -205,9 +209,8 @@ def process_findings(
205209
self.process_request_response_pairs(finding)
206210
# Process any endpoints on the endpoint, or added on the form
207211
self.process_endpoints(finding, self.endpoints_to_add)
208-
# Process any tags
209-
if finding.unsaved_tags:
210-
finding.tags = finding.unsaved_tags
212+
# Parsers must use unsaved_tags to store tags, so we can clean them
213+
finding.tags = clean_tags(finding.unsaved_tags)
211214
# Process any files
212215
self.process_files(finding)
213216
# Process vulnerability IDs

dojo/importers/default_reimporter.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Test,
1616
Test_Import,
1717
)
18+
from dojo.validators import clean_tags
1819

1920
logger = logging.getLogger(__name__)
2021
deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
@@ -596,6 +597,8 @@ def process_finding_that_was_not_matched(
596597
# Save it. Don't dedupe before endpoints are added.
597598
unsaved_finding.save(dedupe_option=False)
598599
finding = unsaved_finding
600+
# Force parsers to use unsaved_tags (stored in finding_post_processing function below)
601+
finding.tags = None
599602
logger.debug(
600603
"Reimport created new finding as no existing finding match: "
601604
f"{finding.id}: {finding.title} "
@@ -624,9 +627,9 @@ def finding_post_processing(
624627
self.endpoint_manager.chunk_endpoints_and_disperse(finding, finding_from_report.unsaved_endpoints)
625628
if len(self.endpoints_to_add) > 0:
626629
self.endpoint_manager.chunk_endpoints_and_disperse(finding, self.endpoints_to_add)
627-
# Update finding tags
628-
if finding_from_report.unsaved_tags:
629-
finding.tags = finding_from_report.unsaved_tags
630+
# Parsers must use unsaved_tags to store tags, so we can clean them
631+
if finding.unsaved_tags:
632+
finding.tags = clean_tags(finding.unsaved_tags)
630633
# Process any files
631634
if finding_from_report.unsaved_files:
632635
finding.unsaved_files = finding_from_report.unsaved_files

dojo/tools/api_edgescan/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def make_finding(self, test, vulnerability):
6262
finding.mitigation = vulnerability["remediation"]
6363
finding.active = vulnerability["status"] == "open"
6464
if vulnerability["asset_tags"]:
65-
finding.tags = vulnerability["asset_tags"].split(",")
65+
finding.unsaved_tags = vulnerability["asset_tags"].split(",")
6666
finding.unique_id_from_tool = vulnerability["id"]
6767

6868
finding.unsaved_endpoints = [

dojo/tools/sysdig_cli/parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from dojo.models import Finding
99
from dojo.tools.sysdig_common.sysdig_data import SysdigData
10+
from dojo.validators import clean_tags
1011

1112

1213
class SysdigCLIParser:
@@ -136,7 +137,7 @@ def parse_csv(self, arr_data, test):
136137
# Set some finding tags
137138
tags = []
138139
if row.vulnerability_id != "":
139-
tags.append("VulnId: " + row.vulnerability_id)
140+
tags.append(clean_tags("VulnId:" + row.vulnerability_id))
140141
finding.tags = tags
141142
finding.dynamic_finding = False
142143
finding.static_finding = True

dojo/tools/sysdig_reports/parser.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from dojo.models import Finding
99
from dojo.tools.sysdig_common.sysdig_data import SysdigData
10+
from dojo.validators import clean_tags
1011

1112

1213
class SysdigReportsParser:
@@ -154,20 +155,20 @@ def parse_csv(self, arr_data, test):
154155
# Set some finding tags
155156
tags = []
156157
if row.k8s_cluster_name != "":
157-
tags.append("Cluster: " + row.k8s_cluster_name)
158+
tags.append(clean_tags("Cluster:" + row.k8s_cluster_name))
158159
if row.k8s_namespace_name != "":
159-
tags.append("Namespace: " + row.k8s_namespace_name)
160+
tags.append(clean_tags("Namespace:" + row.k8s_namespace_name))
160161
if row.k8s_workload_name != "":
161-
tags.append("WorkloadName: " + row.k8s_workload_name)
162+
tags.append(clean_tags("WorkloadName:" + row.k8s_workload_name))
162163
if row.package_name != "":
163-
tags.append("PackageName: " + row.package_name)
164+
tags.append(clean_tags("PackageName:" + row.package_name))
164165
if row.package_version != "":
165-
tags.append("PackageVersion: " + row.package_version)
166+
tags.append(clean_tags("PackageVersion:" + row.package_version))
166167
if row.k8s_cluster_name != "":
167-
tags.append("InUse: " + str(row.in_use))
168+
tags.append(clean_tags("InUse:" + str(row.in_use)))
168169
if row.vulnerability_id != "":
169-
tags.append("VulnId: " + row.vulnerability_id)
170-
finding.tags = tags
170+
tags.append(clean_tags("VulnId:" + row.vulnerability_id))
171+
finding.unsaved_tags = tags
171172
if row.k8s_cluster_name != "":
172173
finding.dynamic_finding = True
173174
finding.static_finding = False

dojo/validators.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,15 @@
88

99
logger = logging.getLogger(__name__)
1010

11+
TAG_PATTERN = re.compile(r'[ ,\'"]') # Matches spaces, commas, single quotes, double quotes
12+
1113

1214
def tag_validator(value: str | list[str], exception_class: Callable = ValidationError) -> None:
13-
TAG_PATTERN = re.compile(r'[ ,\'"]')
1415
error_messages = []
1516

17+
if not value:
18+
return
19+
1620
if isinstance(value, list):
1721
error_messages.extend(f"Invalid tag: '{tag}'. Tags should not contain spaces, commas, or quotes." for tag in value if TAG_PATTERN.search(tag))
1822
elif isinstance(value, str):
@@ -26,6 +30,23 @@ def tag_validator(value: str | list[str], exception_class: Callable = Validation
2630
raise exception_class(error_messages)
2731

2832

33+
def clean_tags(value: str | list[str], exception_class: Callable = ValidationError) -> str | list[str]:
    """
    Normalize tag values by substituting every disallowed character
    (spaces, commas, single/double quotes) with an underscore.

    Accepts a single tag string or a list of tag strings. Falsy input
    (None, "", []) is returned untouched. Any other type raises
    *exception_class*.
    """
    if not value:
        # Nothing to clean: pass None / "" / [] straight through
        return value

    if isinstance(value, str):
        # Replace ALL occurrences of problematic characters in the tag
        return TAG_PATTERN.sub("_", value)

    if isinstance(value, list):
        # Replace ALL occurrences of problematic characters in each tag
        return [TAG_PATTERN.sub("_", item) for item in value]

    msg = f"Value must be a string or list of strings: {value} - {type(value)}."
    raise exception_class(msg)
48+
49+
2950
def cvss3_validator(value: str | list[str], exception_class: Callable = ValidationError) -> None:
3051
logger.error("cvss3_validator called with value: %s", value)
3152
cvss_vectors = cvss.parser.parse_cvss_from_text(value)

unittests/test_import_reimport.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,6 +1902,10 @@ def import_scan_ui(self, engagement, payload):
19021902
def reimport_scan_ui(self, test, payload):
19031903
response = self.client_ui.post(reverse("re_import_scan_results", args=(test, )), payload)
19041904
self.assertEqual(302, response.status_code, response.content[:1000])
1905+
# If the response URL contains 're_import_scan_results', it means the import failed
1906+
if "re_import_scan_results" in response.url:
1907+
return {"test": test} # Return the original test ID
1908+
# Otherwise, extract the new test ID from the successful redirect URL
19051909
test = Test.objects.get(id=response.url.split("/")[-1])
19061910
return {"test": test.id}
19071911

unittests/tools/test_api_edgescan_parser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def test_parse_file_with_one_vuln_has_one_findings(self):
5353
self.assertEqual(finding.description, "Description Text")
5454
self.assertEqual(finding.mitigation, "Remediation Text")
5555
self.assertEqual(finding.active, True)
56-
self.assertEqual(finding.tags, ["APPROVED", "Demo-Asset", "ABC Corporate", "test"])
56+
self.assertEqual(finding.unsaved_tags, ["APPROVED", "Demo-Asset", "ABC Corporate", "test"])
5757
self.assertEqual(finding.unique_id_from_tool, 21581)
5858
self.assertEqual(1, len(finding.unsaved_endpoints))
5959
self.assertEqual(finding.unsaved_endpoints[0].host, "192.168.1.1")
@@ -77,7 +77,7 @@ def test_parse_file_with_multiple_vuln_has_multiple_finding(self):
7777
self.assertEqual(finding_1.description, "Description Text")
7878
self.assertEqual(finding_1.mitigation, "Remediation Text")
7979
self.assertEqual(finding_1.active, True)
80-
self.assertEqual(finding_1.tags, ["APPROVED", "Demo-Asset"])
80+
self.assertEqual(finding_1.unsaved_tags, ["APPROVED", "Demo-Asset"])
8181
self.assertEqual(finding_1.unique_id_from_tool, 21581)
8282
self.assertEqual(1, len(finding_1.unsaved_endpoints))
8383
self.assertEqual(finding_1.unsaved_endpoints[0].host, "test.example.com")
@@ -93,7 +93,7 @@ def test_parse_file_with_multiple_vuln_has_multiple_finding(self):
9393
self.assertEqual(finding_2.description, "Description Text 2")
9494
self.assertEqual(finding_2.mitigation, "Remediation Text 2")
9595
self.assertEqual(finding_2.active, False)
96-
self.assertEqual(finding_2.tags, [])
96+
self.assertEqual(finding_2.unsaved_tags, None)
9797
self.assertEqual(finding_2.unique_id_from_tool, 21583)
9898
self.assertEqual(1, len(finding_2.unsaved_endpoints))
9999
self.assertEqual(finding_2.unsaved_endpoints[0].host, "example.test.com")

0 commit comments

Comments
 (0)