Added static code analysis results to assessment dashboard #2696


Merged: 53 commits, Sep 25, 2024
0f83895
add tasks for assessing source code
ericvergnaud Sep 19, 2024
d2bbabe
drop experimental workflow
ericvergnaud Sep 19, 2024
4c13386
Merge branch 'main' into assess-source-code
ericvergnaud Sep 19, 2024
4c23747
Revert "drop experimental workflow"
ericvergnaud Sep 19, 2024
2977a36
restore comments
ericvergnaud Sep 19, 2024
25395d7
Merge branch 'main' into assess-source-code
ericvergnaud Sep 19, 2024
73b38df
Merge branch 'main' into assess-source-code
ericvergnaud Sep 19, 2024
fd07db9
fix tests failing due to long running tasks
ericvergnaud Sep 19, 2024
883c37a
add workflow linting widget to assessment dashboard
ericvergnaud Sep 20, 2024
15fc47c
add dashboards linting widget to assessment dashboard
ericvergnaud Sep 20, 2024
19fe54c
update readme
ericvergnaud Sep 20, 2024
69497b1
add test utility for developing dashboards
ericvergnaud Sep 20, 2024
496688c
add directfs access widget to assessment dashboard - no link
ericvergnaud Sep 20, 2024
2e228db
Merge branch 'main' into assess-source-code
ericvergnaud Sep 20, 2024
dc97a4a
reduce test timeout
ericvergnaud Sep 20, 2024
11e8866
increase timeout based on observed durations in CI
ericvergnaud Sep 20, 2024
1f99d62
Merge branch 'main' into assess-source-code
ericvergnaud Sep 20, 2024
4e9cd43
enrich and specialize dfsa source_id such that dashboard links can be…
ericvergnaud Sep 20, 2024
5565487
try out liinks
ericvergnaud Sep 20, 2024
2b9f964
reduce assessment scope to limit duration of integration tests
ericvergnaud Sep 23, 2024
7d3a98f
Merge branch 'assess-source-code' into update-assessment-dashboard
ericvergnaud Sep 23, 2024
012d264
inhibit and comment test
ericvergnaud Sep 23, 2024
3efd331
Update src/databricks/labs/ucx/queries/assessment/main/35_2_code_comp…
ericvergnaud Sep 23, 2024
d5f1f74
Update src/databricks/labs/ucx/queries/assessment/main/35_2_code_comp…
ericvergnaud Sep 23, 2024
81d1551
Update src/databricks/labs/ucx/queries/assessment/main/35_2_code_comp…
ericvergnaud Sep 23, 2024
d015e9c
Merge branch 'main' into update-assessment-dashboard
ericvergnaud Sep 23, 2024
7522eb5
reduce linting scope in integration tests
ericvergnaud Sep 23, 2024
144e7b1
update config on workspace side
ericvergnaud Sep 23, 2024
6f32836
fix workspace config update
ericvergnaud Sep 23, 2024
4a9e2e6
deduplicate code
ericvergnaud Sep 23, 2024
ef6bd02
fix typo
ericvergnaud Sep 23, 2024
41f1eee
fix config update
ericvergnaud Sep 24, 2024
fd9ea4f
ensure problems are generated
ericvergnaud Sep 24, 2024
938210a
Fix links in 'Dashboard compatibility problems' widget
ericvergnaud Sep 24, 2024
534d4a7
fix query link
ericvergnaud Sep 24, 2024
2241c1a
progress
ericvergnaud Sep 24, 2024
c2f70a2
Merge branch 'main' into update-assessment-dashboard
ericvergnaud Sep 24, 2024
b4511d5
fix links in workflow problems widget
ericvergnaud Sep 24, 2024
872a430
also lint a file
ericvergnaud Sep 24, 2024
ee0a7ba
fix lineage records
ericvergnaud Sep 24, 2024
cb28aa1
fix links and display of DFSA widget
ericvergnaud Sep 24, 2024
9b73c8c
formatting
ericvergnaud Sep 24, 2024
925f3c4
Merge branch 'main' into update-assessment-dashboard
ericvergnaud Sep 24, 2024
4a8106d
use relative path
ericvergnaud Sep 24, 2024
d4bde1c
Merge branch 'update-assessment-dashboard' of github.com:databricksla…
ericvergnaud Sep 24, 2024
81a7bfe
populate file task for liinting
ericvergnaud Sep 25, 2024
0cf260e
formatting
ericvergnaud Sep 25, 2024
bc5d6e7
Merge branch 'main' into update-assessment-dashboard
ericvergnaud Sep 25, 2024
73fd632
Merge branch 'main' into update-assessment-dashboard
ericvergnaud Sep 25, 2024
f81a2c3
fix failing test
ericvergnaud Sep 25, 2024
9de75a8
tentatively fix test flakyness
ericvergnaud Sep 25, 2024
6c40251
fix typo
ericvergnaud Sep 25, 2024
f792f6b
remove hidden column
ericvergnaud Sep 25, 2024
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -396,6 +396,9 @@ which can be used for further analysis and decision-making through the [assessme
9. `assess_pipelines`: This task scans through all the Pipelines and identifies those pipelines that have Azure Service Principals embedded in their configurations. A list of all the pipelines with matching configurations is stored in the `$inventory.pipelines` table.
10. `assess_azure_service_principals`: This task scans through all the clusters configurations, cluster policies, job cluster configurations, Pipeline configurations, and Warehouse configuration and identifies all the Azure Service Principals who have been given access to the Azure storage accounts via spark configurations referred in those entities. The list of all the Azure Service Principals referred in those configurations is saved in the `$inventory.azure_service_principals` table.
11. `assess_global_init_scripts`: This task scans through all the global init scripts and identifies if there is an Azure Service Principal who has been given access to the Azure storage accounts via spark configurations referred in those scripts.
12. `assess_dashboards`: This task scans through all the dashboards and analyzes embedded queries for migration problems. It also collects direct filesystem access patterns that require attention.
13. `assess_workflows`: This task scans through all the jobs and tasks and analyzes notebooks and files for migration problems. It also collects direct filesystem access patterns that require attention.
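Conceptually, both new tasks walk a collection of sources, lint each one, and record any migration problems. A minimal sketch of that flow, using hypothetical `Problem` and `lint_query` names rather than the actual UCX API:

```python
from dataclasses import dataclass


# Hypothetical shapes for illustration only -- not the actual UCX classes.
@dataclass
class Problem:
    source_id: str
    code: str
    message: str


def lint_query(source_id: str, sql: str) -> list[Problem]:
    """Flag direct filesystem access in a query, the way assess_dashboards
    records migration problems for embedded queries."""
    problems = []
    if "dbfs:/" in sql:
        problems.append(
            Problem(source_id, "direct-filesystem-access",
                    "dbfs:/ paths are not supported in Unity Catalog")
        )
    return problems


problems = lint_query("dash1/query1", "SELECT * FROM parquet.`dbfs:/mnt/raw/events`")
```

In the real workflow, collected problems end up in inventory tables that back the dashboard widgets shown below.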


![report](docs/assessment-report.png)

@@ -711,11 +714,16 @@ in the Migration dashboard.

> Please note that this is an experimental workflow.

The `experimental-workflow-linter` workflow lints accessible code belonging to all workflows/jobs present in the
workspace. The linting emits problems indicating what to resolve for making the code Unity Catalog compatible.
The `experimental-workflow-linter` workflow lints accessible code from two sources:
- all workflows/jobs present in the workspace
- all dashboards/queries present in the workspace

The linting emits problems indicating what to resolve to make the code Unity Catalog compatible. It also locates direct filesystem accesses that need to be migrated.

Once the workflow completes, the output will be stored in `$inventory_database.workflow_problems` table, and displayed
in the Migration dashboard.
Once the workflow completes:
- problems are stored in the `$inventory_database.workflow_problems` and `$inventory_database.query_problems` tables
- direct filesystem accesses are stored in the `$inventory_database.directfs_in_paths` and `$inventory_database.directfs_in_queries` tables
- all of the above are displayed in the Migration dashboard.

![code compatibility problems](docs/code_compatibility_problems.png)

@@ -0,0 +1,8 @@
## Code compatibility problems

The tables below assist with verifying whether workflows and dashboards are Unity Catalog compatible. They can be filtered on the path,
problem code and workflow name.
Each row:
- Points to a problem detected in the code using the code path, query or workflow & task reference and start/end line & column;
- Explains the problem with a human-readable message and a code.

@@ -0,0 +1,35 @@
/*
--title 'Workflow migration problems'
--width 6
--overrides '{"spec":{
"encodings":{
"columns": [
{"fieldName": "path", "booleanValues": ["false", "true"], "linkUrlTemplate": "/#workspace/{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "path"},
{"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"},
{"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"},
{"fieldName": "workflow_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/jobs/{{ workflow_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_name"},
{"fieldName": "task_key", "booleanValues": ["false", "true"], "imageUrlTemplate": "{{ @ }}", "linkUrlTemplate": "/jobs/{{ workflow_id }}/tasks/{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "task_key"},
{"fieldName": "start_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_line"},
{"fieldName": "start_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "start_col"},
{"fieldName": "end_line", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_line"},
{"fieldName": "end_col", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "end_col"}
]},
"invisibleColumns": [
{"name": "link", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "link"},
{"name": "workflow_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "workflow_id"}
]
}}'
*/
SELECT
substring_index(path, '@databricks.com/', -1) as path,
path as link,
code,
message,
job_id AS workflow_id,
job_name AS workflow_name,
task_key,
start_line,
start_col,
end_line,
end_col
FROM inventory.workflow_problems
@@ -0,0 +1,29 @@
/*
--title 'Dashboard compatibility problems'
--width 6
--overrides '{"spec":{
"encodings":{
"columns": [
{"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"},
{"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"},
{"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Dashboard"},
{"fieldName": "query_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/editor/{{ query_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "Query"}
]},
"invisibleColumns": [
{"name": "dashboard_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_parent"},
{"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"},
{"name": "query_parent", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_parent"},
{"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"}
]
}}'
*/
SELECT
dashboard_id,
dashboard_parent,
dashboard_name,
query_id,
query_parent,
query_name,
code,
message
FROM inventory.query_problems
@@ -0,0 +1,14 @@
---
height: 4
---

# Direct filesystem access problems

The table below assists with verifying whether workflows and dashboards perform direct filesystem access.
As a reminder, `dbfs:/` is not supported in Unity Catalog, and direct filesystem access is discouraged more generally.
Data should instead be accessed via Unity Catalog tables.

Each row:
- Points to a direct filesystem access detected in the code using the code path, query or workflow & task reference and start/end line & column;
- Provides the _lineage_ i.e. which `workflow -> task -> notebook...` execution sequence leads to that access.
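The detection behind this widget can be sketched with a simple pattern match. This is a simplification: the real UCX linters parse SQL and Python rather than scanning text, and the `find_direct_fs_accesses` name here is hypothetical.

```python
import re

# Catch the common dbfs:/ and /dbfs/ mount-point forms mentioned above.
DFSA_PATTERN = re.compile(r"(dbfs:/[^\s'\"`]+|/dbfs/[^\s'\"`]+)")


def find_direct_fs_accesses(code: str) -> list[str]:
    """Return every direct-filesystem path referenced in a code snippet."""
    return DFSA_PATTERN.findall(code)


hits = find_direct_fs_accesses("df = spark.read.parquet('dbfs:/mnt/raw/events')")
```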

@@ -0,0 +1,62 @@
/*
--title 'Direct filesystem access problems'
--width 6
--overrides '{"spec":{
"encodings":{
"columns": [
{"fieldName": "location", "title": "location", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
{"fieldName": "is_read", "title": "is_read", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]},
{"fieldName": "is_write", "title": "is_write", "type": "boolean", "displayAs": "boolean", "booleanValues": ["false", "true"]},
{"fieldName": "source", "title": "source", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ source_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]},
{"fieldName": "timestamp", "title": "last_modified", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]},
{"fieldName": "lineage", "title": "lineage", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ lineage_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]},
{"fieldName": "lineage_data", "title": "lineage_data", "type": "complex", "displayAs": "json", "booleanValues": ["false", "true"]},
{"fieldName": "assessment_start", "title": "assessment_start", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]},
{"fieldName": "assessment_end", "title": "assessment_end", "type": "datetime", "displayAs": "datetime", "dateTimeFormat": "ll LTS (z)", "booleanValues": ["false", "true"]}
]},
"invisibleColumns": [
{"fieldName": "source_link", "title": "source_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
{"fieldName": "lineage_type", "title": "lineage_type", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
{"fieldName": "lineage_id", "title": "lineage_id", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]},
{"fieldName": "lineage_link", "title": "lineage_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]}
]
}}'
*/
SELECT
path as location,
is_read,
is_write,
if( startswith(source_id, '/'), substring_index(source_id, '@databricks.com/', -1), split_part(source_id, '/', 2)) as source,
if( startswith(source_id, '/'), concat('/#workspace/', source_id), concat('/sql/editor/', split_part(source_id, '/', 2))) as source_link,
source_timestamp as `timestamp`,
case
when lineage.object_type = 'WORKFLOW' then concat('Workflow: ', lineage.other.name)
when lineage.object_type = 'TASK' then concat('Task: ', split_part(lineage.object_id, '/', 2))
when lineage.object_type = 'NOTEBOOK' then concat('Notebook: ', substring_index(lineage.object_id, '@databricks.com/', -1))
when lineage.object_type = 'FILE' then concat('File: ', substring_index(lineage.object_id, '@databricks.com/', -1))
when lineage.object_type = 'DASHBOARD' then concat('Dashboard: ', lineage.other.name)
when lineage.object_type = 'QUERY' then concat('Query: ', lineage.other.name)
end as lineage,
lineage.object_type as lineage_type,
lineage.object_id as lineage_id,
case
when lineage.object_type = 'WORKFLOW' then concat('/jobs/', lineage.object_id)
when lineage.object_type = 'TASK' then concat('/jobs/', split_part(lineage.object_id, '/', 1), '/tasks/', split_part(lineage.object_id, '/', 2))
when lineage.object_type = 'NOTEBOOK' then concat('/#workspace/', lineage.object_id)
when lineage.object_type = 'FILE' then concat('/#workspace/', lineage.object_id)
when lineage.object_type = 'DASHBOARD' then concat('/sql/dashboards/', lineage.object_id)
when lineage.object_type = 'QUERY' then concat('/sql/editor/', split_part(lineage.object_id, '/', 2))
end as lineage_link,
lineage.other as lineage_data,
assessment_start,
assessment_end
from (SELECT
path,
is_read,
is_write,
source_id,
source_timestamp,
explode(source_lineage) as lineage,
assessment_start_timestamp as assessment_start,
assessment_end_timestamp as assessment_end
FROM inventory.directfs)
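The `source`/`source_link` branching in the query above can be mirrored in Python to check the link logic. This sketch mirrors the SQL exactly (including the resulting double slash when a workspace path is appended to `/#workspace/`); the function name is illustrative, not part of UCX:

```python
def source_and_link(source_id: str) -> tuple[str, str]:
    """Workspace paths start with '/'; query sources use the
    '<dashboard_id>/<query_id>' form introduced in this PR."""
    if source_id.startswith("/"):
        # substring_index(source_id, '@databricks.com/', -1)
        source = source_id.split("@databricks.com/")[-1]
        return source, f"/#workspace/{source_id}"
    # split_part(source_id, '/', 2) is 1-indexed in SQL, hence index 1 here
    query_id = source_id.split("/")[1]
    return query_id, f"/sql/editor/{query_id}"


notebook = source_and_link("/Users/jane@databricks.com/etl_notebook")
query = source_and_link("abcd-123/efgh-456")
```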
3 changes: 2 additions & 1 deletion src/databricks/labs/ucx/source_code/graph.py
@@ -337,7 +337,8 @@ def __repr__(self):

@property
def lineage(self) -> list[LineageAtom]:
return [LineageAtom(object_type="PATH", object_id=str(self.path))]
object_type = "NOTEBOOK" if is_a_notebook(self.path) else "FILE"
return [LineageAtom(object_type=object_type, object_id=str(self.path))]


class SourceContainer(abc.ABC):
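The `graph.py` change replaces the generic `PATH` lineage type with `NOTEBOOK` or `FILE`, so the dashboard can render type-specific labels and links. A self-contained sketch of that behavior, with a simplified stand-in for `is_a_notebook` (the real helper inspects the file, not just the extension):

```python
from dataclasses import dataclass, field


@dataclass
class LineageAtom:
    object_type: str
    object_id: str
    other: dict = field(default_factory=dict)


def is_a_notebook(path: str) -> bool:
    # Simplified stand-in: approximate notebook detection by extension.
    return path.endswith(".ipynb")


def lineage(path: str) -> list[LineageAtom]:
    object_type = "NOTEBOOK" if is_a_notebook(path) else "FILE"
    return [LineageAtom(object_type=object_type, object_id=path)]
```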
8 changes: 4 additions & 4 deletions src/databricks/labs/ucx/source_code/jobs.py
@@ -86,8 +86,8 @@ def __repr__(self):
@property
def lineage(self) -> list[LineageAtom]:
job_name = (None if self._job.settings is None else self._job.settings.name) or "unknown job"
job_lineage = LineageAtom("JOB", str(self._job.job_id), {"name": job_name})
task_lineage = LineageAtom("TASK", self._task.task_key)
job_lineage = LineageAtom("WORKFLOW", str(self._job.job_id), {"name": job_name})
task_lineage = LineageAtom("TASK", f"{self._job.job_id}/{self._task.task_key}")
return [job_lineage, task_lineage]


@@ -469,8 +469,8 @@ def _collect_task_dfsas(
job_name = job.settings.name if job.settings and job.settings.name else "<anonymous>"
for dfsa in DfsaCollectorWalker(graph, set(), self._path_lookup, session_state):
atoms = [
LineageAtom(object_type="JOB", object_id=job_id, other={"name": job_name}),
LineageAtom(object_type="TASK", object_id=task.task_key),
LineageAtom(object_type="WORKFLOW", object_id=job_id, other={"name": job_name}),
LineageAtom(object_type="TASK", object_id=f"{job_id}/{task.task_key}"),
]
yield dataclasses.replace(dfsa, source_lineage=atoms + dfsa.source_lineage)

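The `jobs.py` change embeds the job id in the task's `object_id` (`"<job_id>/<task_key>"`), which is what lets the DFSA widget's SQL rebuild a task URL from lineage alone. A sketch of that derivation (the function name is illustrative):

```python
def task_lineage_link(task_object_id: str) -> str:
    """The TASK object_id now embeds the job id ('<job_id>/<task_key>'),
    so a task URL can be derived with two split_part calls in SQL --
    or one split here."""
    job_id, task_key = task_object_id.split("/", 1)
    return f"/jobs/{job_id}/tasks/{task_key}"
```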
8 changes: 4 additions & 4 deletions src/databricks/labs/ucx/source_code/queries.py
@@ -66,7 +66,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
linted_queries.add(query.id)
problems = self.lint_query(query)
all_problems.extend(problems)
dfsas = self.collect_dfsas_from_query(query)
dfsas = self.collect_dfsas_from_query("no-dashboard-id", query)
all_dfsas.extend(dfsas)
# dump problems
logger.info(f"Saving {len(all_problems)} linting problems...")
@@ -123,7 +123,7 @@ def _lint_and_collect_from_dashboard(
dashboard_name=dashboard_name,
)
)
dfsas = self.collect_dfsas_from_query(query)
dfsas = self.collect_dfsas_from_query(dashboard_id, query)
for dfsa in dfsas:
atom = LineageAtom(
object_type="DASHBOARD",
@@ -155,11 +155,11 @@ def lint_query(self, query: LegacyQuery) -> Iterable[QueryProblem]:
)

@classmethod
def collect_dfsas_from_query(cls, query: LegacyQuery) -> Iterable[DirectFsAccess]:
def collect_dfsas_from_query(cls, dashboard_id: str, query: LegacyQuery) -> Iterable[DirectFsAccess]:
if query.query is None:
return
linter = DirectFsAccessSqlLinter()
source_id = query.id or "no id"
source_id = f"{dashboard_id}/{query.id}"
source_name = query.name or "<anonymous>"
source_timestamp = cls._read_timestamp(query.updated_at)
source_lineage = [LineageAtom(object_type="QUERY", object_id=source_id, other={"name": source_name})]
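The `queries.py` change composes the DFSA `source_id` as `"<dashboard_id>/<query_id>"` (with the `no-dashboard-id` placeholder for queries linted outside any dashboard), and the dashboard SQL later parses it back with `split_part`. A round-trip sketch of that convention (helper names here are illustrative):

```python
def make_source_id(dashboard_id: str, query_id: str) -> str:
    # Queries linted outside a dashboard use the 'no-dashboard-id' placeholder.
    return f"{dashboard_id}/{query_id}"


def query_id_from_source(source_id: str) -> str:
    # Mirrors split_part(source_id, '/', 2) in the DFSA dashboard query.
    return source_id.split("/")[1]
```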