
Commit 625f16a

Merge branch 'main' into crawler-snapshot-history

2 parents: 15dd48d + 564e504

File tree: 23 files changed, +357 −256 lines

Makefile

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ integration:
 	hatch run integration

 coverage:
-	hatch run coverage && open htmlcov/index.html
+	hatch run coverage; status=$$?; [ -e "htmlcov/index.html" ] && open htmlcov/index.html; exit $$status

 known:
 	hatch run python src/databricks/labs/ucx/source_code/known.py

README.md

Lines changed: 18 additions & 3 deletions
@@ -92,6 +92,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project.
   * [Metastore related commands](#metastore-related-commands)
     * [`show-all-metastores` command](#show-all-metastores-command)
     * [`assign-metastore` command](#assign-metastore-command)
+    * [`create-ucx-catalog` command](#create-ucx-catalog-command)
   * [Table migration commands](#table-migration-commands)
     * [`principal-prefix-access` command](#principal-prefix-access-command)
       * [Access for AWS S3 Buckets](#access-for-aws-s3-buckets)
@@ -1168,9 +1169,23 @@ a region, and you want to see which ones are available for assignment.
 databricks labs ucx assign-metastore --workspace-id <workspace-id> [--metastore-id <metastore-id>]
 ```

-This command assigns a metastore to a workspace with `workspace-id`. If there is only a single metastore in the workspace
-region, it will be automatically assigned to the workspace. If there are multiple metastores available, you need to specify
-the metastore id of the metastore you want to assign to the workspace.
+This command assigns a metastore to a workspace with `--workspace-id`. If there is only a single metastore in the
+workspace region, the command automatically assigns that metastore to the workspace. If there are multiple metastores
+available, the command prompts for specification of the metastore (id) you want to assign to the workspace.
+
+[[back to top](#databricks-labs-ucx)]
+
+## `create-ucx-catalog` command
+
+```commandline
+databricks labs ucx create-ucx-catalog
+16:12:59 INFO [d.l.u.hive_metastore.catalog_schema] Validating UC catalog: ucx
+Please provide storage location url for catalog: ucx (default: metastore): ...
+16:13:01 INFO [d.l.u.hive_metastore.catalog_schema] Creating UC catalog: ucx
+```
+
+Create and setup UCX artifact catalog. Amongst other things, the artifacts are used for tracking the migration progress
+across workspaces.

 # Table migration commands

labs.yml

Lines changed: 5 additions & 2 deletions
@@ -261,15 +261,18 @@ commands:

   - name: assign-metastore
     is_account_level: true
-    description: Enable Unity Catalog features on a workspace by assign a metastore to it
+    description: Enable Unity Catalog features on a workspace by assigning a metastore to it.
     flags:
       - name: workspace-id
-        description: (Optional) Workspace ID to assign a metastore to
+        description: Workspace ID to assign a metastore to
       - name: metastore-id
         description: (Optional) If there are multiple metastores in the region, specify the metastore ID to assign
       - name: default-catalog
         description: (Optional) Default catalog to assign to the workspace. If not provided, it will be hive_metastore

+  - name: create-ucx-catalog
+    description: Create UCX artifact catalog
+
   - name: migrate-tables
     description: |
       Trigger the `migrate-tables` workflow and, optionally, `migrate-external-hiveserde-tables-in-place-experimental`

src/databricks/labs/ucx/account/metastores.py

Lines changed: 8 additions & 13 deletions
@@ -27,38 +27,33 @@ def show_all_metastores(self, workspace_id: str | None = None):
     def assign_metastore(
         self,
         prompts: Prompts,
-        str_workspace_id: str | None = None,
+        workspace_id: int,
+        *,
         metastore_id: str | None = None,
         default_catalog: str | None = None,
     ):
-        if not str_workspace_id:
-            workspace_choices = self._get_all_workspaces()
-            workspace_id = prompts.choice_from_dict("Please select a workspace:", workspace_choices)
-        else:
-            workspace_id = int(str_workspace_id)
-        if not metastore_id:
+        if metastore_id is None:
             # search for all matching metastores
             metastore_choices = self._get_all_metastores(self._get_region(workspace_id))
             if len(metastore_choices) == 0:
-                raise ValueError(f"No matching metastore found for workspace {workspace_id}")
+                raise ValueError(f"No matching metastore found for workspace: {workspace_id}")
             # if there are multiple matches, prompt users to select one
             if len(metastore_choices) > 1:
                 metastore_id = prompts.choice_from_dict(
                     "Multiple metastores found, please select one:", metastore_choices
                 )
             else:
                 metastore_id = list(metastore_choices.values())[0]
-        if metastore_id is not None:
-            self._ac.metastore_assignments.create(workspace_id, metastore_id)
+        self._ac.metastore_assignments.create(workspace_id, metastore_id)
         # set the default catalog using the default_namespace setting API
         if default_catalog is not None:
            self._set_default_catalog(workspace_id, default_catalog)

-    def _get_region(self, workspace_id: int) -> str:
+    def _get_region(self, workspace_id: int) -> str | None:
         workspace = self._ac.workspaces.get(workspace_id)
         if self._ac.config.is_aws:
-            return str(workspace.aws_region)
-        return str(workspace.location)
+            return workspace.aws_region
+        return workspace.location

     def _get_all_workspaces(self) -> dict[str, int]:
         output = dict[str, int]()
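
A note on the signature change above: `assign_metastore` now takes an integer `workspace_id` positionally and makes `metastore_id` and `default_catalog` keyword-only, so workspace-ID parsing and prompting move to the caller. A minimal, standalone sketch of the same keyword-only pattern (generic names, not UCX code):

```python
# Sketch of the keyword-only style adopted above; everything here is illustrative.
def assign(workspace_id: int, *, metastore_id: str | None = None, default_catalog: str | None = None) -> str:
    # Arguments after `*` must be passed by name, so call sites stay explicit
    # even as optional parameters are added or reordered.
    chosen = metastore_id or "only-metastore-in-region"
    return f"assign {chosen} to workspace {workspace_id}"


print(assign(123, metastore_id="abc-123"))   # OK
# assign(123, "abc-123")                     # TypeError: too many positional arguments
```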

src/databricks/labs/ucx/cli.py

Lines changed: 18 additions & 2 deletions
@@ -590,11 +590,27 @@ def assign_metastore(
     workspace_id: str | None = None,
     metastore_id: str | None = None,
     default_catalog: str | None = None,
+    ctx: AccountContext | None = None,
 ):
     """Assign metastore to a workspace"""
     logger.info(f"Account ID: {a.config.account_id}")
-    ctx = AccountContext(a)
-    ctx.account_metastores.assign_metastore(ctx.prompts, workspace_id, metastore_id, default_catalog)
+    ctx = ctx or AccountContext(a)
+    ctx.account_metastores.assign_metastore(
+        ctx.prompts,
+        workspace_id,
+        metastore_id=metastore_id,
+        default_catalog=default_catalog,
+    )
+
+
+@ucx.command
+def create_ucx_catalog(w: WorkspaceClient, prompts: Prompts, ctx: WorkspaceContext | None = None) -> None:
+    """Create and setup UCX artifact catalog
+
+    Amongst other things, the artifacts are used for tracking the migration progress across workspaces.
+    """
+    workspace_context = ctx or WorkspaceContext(w)
+    workspace_context.catalog_schema.create_ucx_catalog(prompts)


 @ucx.command
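
The new `ctx` parameters on both commands default to `None` and exist mainly for dependency injection: production calls build a real `AccountContext`/`WorkspaceContext`, while tests can pass a stub. A hedged sketch of exercising `create_ucx_catalog` with a mocked context (it assumes the decorated function remains directly callable, as UCX unit tests typically rely on):

```python
from unittest.mock import MagicMock

from databricks.labs.ucx.cli import create_ucx_catalog  # command added in this commit


def test_create_ucx_catalog_uses_injected_context() -> None:
    ws, prompts, ctx = MagicMock(), MagicMock(), MagicMock()
    create_ucx_catalog(ws, prompts, ctx=ctx)
    # With a truthy ctx, the command should delegate straight to the injected context.
    ctx.catalog_schema.create_ucx_catalog.assert_called_once_with(prompts)
```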

src/databricks/labs/ucx/config.py

Lines changed: 2 additions & 1 deletion
@@ -11,6 +11,7 @@ class WorkspaceConfig:  # pylint: disable=too-many-instance-attributes
     __version__ = 2

     inventory_database: str
+    ucx_catalog: str = "ucx"  # Catalog to store UCX artifact tables (shared across workspaces)
     # Group name conversion parameters.
     workspace_group_regex: str | None = None
     workspace_group_replace: str | None = None
@@ -25,7 +26,7 @@ class WorkspaceConfig:  # pylint: disable=too-many-instance-attributes
     connect: Config | None = None
     num_threads: int | None = 10
     database_to_catalog_mapping: dict[str, str] | None = None
-    default_catalog: str | None = "ucx_default"
+    default_catalog: str | None = "ucx_default"  # DEPRECATED: Keeping to avoid errors when loading old configurations
     log_level: str | None = "INFO"

     # Starting path for notebooks and directories crawler
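
The new `ucx_catalog` field defaults to `"ucx"` and names the catalog that holds the shared UCX artifact tables, while `default_catalog` is only retained so previously saved configurations still load. A small sketch of setting the field (assuming `WorkspaceConfig` stays a plain dataclass with `inventory_database` as its only required field):

```python
from databricks.labs.ucx.config import WorkspaceConfig

# Default artifact catalog name:
config = WorkspaceConfig(inventory_database="ucx")
assert config.ucx_catalog == "ucx"

# Overriding it for a deployment that needs a different catalog name:
custom = WorkspaceConfig(inventory_database="ucx", ucx_catalog="ucx_artifacts")
assert custom.ucx_catalog == "ucx_artifacts"
```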

src/databricks/labs/ucx/contexts/account_cli.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@

 from databricks.sdk import AccountClient

-
 from databricks.labs.ucx.account.aggregate import AccountAggregate
 from databricks.labs.ucx.account.metastores import AccountMetastores
 from databricks.labs.ucx.account.workspaces import AccountWorkspaces

src/databricks/labs/ucx/contexts/application.py

Lines changed: 9 additions & 12 deletions
@@ -18,7 +18,6 @@
 from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler
 from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
 from databricks.sdk import AccountClient, WorkspaceClient, core
-from databricks.sdk.errors import ResourceDoesNotExist
 from databricks.sdk.service import sql

 from databricks.labs.ucx.account.workspaces import WorkspaceInfo
@@ -305,18 +304,15 @@ def aws_acl(self):
         )

     @cached_property
-    def principal_locations(self):
-        eligible_locations = {}
-        try:
+    def principal_locations_retriever(self):
+        def inner():
             if self.is_azure:
-                eligible_locations = self.azure_acl.get_eligible_locations_principals()
+                return self.azure_acl.get_eligible_locations_principals()
             if self.is_aws:
-                eligible_locations = self.aws_acl.get_eligible_locations_principals()
-            if self.is_gcp:
-                raise NotImplementedError("Not implemented for GCP.")
-        except ResourceDoesNotExist:
-            pass
-        return eligible_locations
+                return self.aws_acl.get_eligible_locations_principals()
+            raise NotImplementedError("Not implemented for GCP.")
+
+        return inner

     @cached_property
     def principal_acl(self):
@@ -326,7 +322,7 @@ def principal_acl(self):
             self.installation,
             self.tables_crawler,
             self.mounts_crawler,
-            self.principal_locations,
+            self.principal_locations_retriever,
         )

     @cached_property
@@ -354,6 +350,7 @@ def catalog_schema(self):
             self.principal_acl,
             self.sql_backend,
             self.grants_crawler,
+            self.config.ucx_catalog,
         )

     @cached_property
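
The refactor above swaps an eagerly evaluated `principal_locations` property for `principal_locations_retriever`, a cached property that returns a zero-argument callable, so cloud lookups (and their failures) happen only when `PrincipalACL` actually asks for locations. A generic sketch of the pattern, independent of the UCX classes:

```python
from functools import cached_property
from typing import Callable


class Context:
    @cached_property
    def locations_retriever(self) -> Callable[[], dict[str, str]]:
        def inner() -> dict[str, str]:
            # Imagine an expensive cloud API call here; it is deferred until
            # inner() is invoked, so building the context stays cheap and any
            # error surfaces where the data is actually used.
            return {"s3://bucket/prefix": "READ_FILES"}

        return inner


ctx = Context()
retriever = ctx.locations_retriever  # cached, but no lookup has happened yet
locations = retriever()              # the work happens here
```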

src/databricks/labs/ucx/hive_metastore/catalog_schema.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,29 +18,49 @@
1818

1919

2020
class CatalogSchema:
21+
2122
def __init__(
2223
self,
2324
ws: WorkspaceClient,
2425
table_mapping: TableMapping,
2526
principal_grants: PrincipalACL,
2627
sql_backend: SqlBackend,
2728
grants_crawler: GrantsCrawler,
29+
ucx_catalog: str,
2830
):
2931
self._ws = ws
3032
self._table_mapping = table_mapping
3133
self._external_locations = self._ws.external_locations.list()
3234
self._principal_grants = principal_grants
3335
self._backend = sql_backend
3436
self._hive_grants_crawler = grants_crawler
37+
self._ucx_catalog = ucx_catalog
38+
39+
def create_ucx_catalog(self, prompts: Prompts, *, properties: dict[str, str] | None = None) -> None:
40+
"""Create the UCX catalog.
41+
42+
Args:
43+
prompts : Prompts
44+
The prompts object to use for interactive input.
45+
properties : (dict[str, str] | None), default None
46+
The properties to pass to the catalog. If None, no properties are passed.
47+
"""
48+
try:
49+
self._create_catalog_validate(self._ucx_catalog, prompts, properties=properties)
50+
except BadRequest as e:
51+
if "already exists" in str(e):
52+
logger.warning(f"Catalog '{self._ucx_catalog}' already exists. Skipping.")
53+
return
54+
raise
3555

3656
def create_all_catalogs_schemas(self, prompts: Prompts) -> None:
3757
candidate_catalogs, candidate_schemas = self._get_missing_catalogs_schemas()
3858
for candidate_catalog in candidate_catalogs:
3959
try:
40-
self._create_catalog_validate(candidate_catalog, prompts)
60+
self._create_catalog_validate(candidate_catalog, prompts, properties=None)
4161
except BadRequest as e:
4262
if "already exists" in str(e):
43-
logger.warning(f"Catalog {candidate_catalog} already exists. Skipping.")
63+
logger.warning(f"Catalog '{candidate_catalog}' already exists. Skipping.")
4464
continue
4565
for candidate_catalog, schemas in candidate_schemas.items():
4666
for candidate_schema in schemas:
@@ -49,7 +69,7 @@ def create_all_catalogs_schemas(self, prompts: Prompts) -> None:
4969
except BadRequest as e:
5070
if "already exists" in str(e):
5171
logger.warning(
52-
f"Schema {candidate_schema} in catalog {candidate_catalog} " f"already exists. Skipping."
72+
f"Schema '{candidate_schema}' in catalog '{candidate_catalog}' already exists. Skipping."
5373
)
5474
continue
5575
self._apply_from_legacy_table_acls()
@@ -121,20 +141,19 @@ def _get_database_source_target_mapping(self) -> dict[str, list[SchemaInfo]]:
121141
src_trg_schema_mapping[table_mapping.src_schema].append(schema)
122142
return src_trg_schema_mapping
123143

124-
def _create_catalog_validate(self, catalog, prompts: Prompts):
125-
logger.info(f"Creating UC catalog: {catalog}")
126-
# create catalogs
144+
def _create_catalog_validate(self, catalog: str, prompts: Prompts, *, properties: dict[str, str] | None) -> None:
145+
logger.info(f"Validating UC catalog: {catalog}")
127146
attempts = 3
128147
while True:
129148
catalog_storage = prompts.question(
130-
f"Please provide storage location url for catalog:{catalog}.", default="metastore"
149+
f"Please provide storage location url for catalog: {catalog}", default="metastore"
131150
)
132151
if self._validate_location(catalog_storage):
133152
break
134153
attempts -= 1
135154
if attempts == 0:
136155
raise NotFound(f"Failed to validate location for {catalog} catalog")
137-
self._create_catalog(catalog, catalog_storage)
156+
self._create_catalog(catalog, catalog_storage, properties=properties)
138157

139158
def _list_existing(self) -> tuple[set[str], dict[str, set[str]]]:
140159
"""generate a list of existing UC catalogs and schema."""
@@ -199,12 +218,17 @@ def _validate_location(self, location: str):
199218
return True
200219
return False
201220

202-
def _create_catalog(self, catalog, catalog_storage):
221+
def _create_catalog(self, catalog: str, catalog_storage: str, *, properties: dict[str, str] | None) -> None:
203222
logger.info(f"Creating UC catalog: {catalog}")
204223
if catalog_storage == "metastore":
205-
self._ws.catalogs.create(catalog, comment="Created by UCX")
224+
self._ws.catalogs.create(catalog, comment="Created by UCX", properties=properties)
206225
else:
207-
self._ws.catalogs.create(catalog, storage_root=catalog_storage, comment="Created by UCX")
226+
self._ws.catalogs.create(
227+
catalog,
228+
storage_root=catalog_storage,
229+
comment="Created by UCX",
230+
properties=properties,
231+
)
208232

209233
def _create_schema(self, catalog, schema):
210234
logger.info(f"Creating UC schema: {schema} in catalog: {catalog}")
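
The new `create_ucx_catalog` entry point reuses the existing validate-and-create flow for the configured `ucx_catalog` and treats "already exists" as a benign warning, so the command is safe to re-run. A hedged sketch of calling it in isolation, with every collaborator mocked and the assumption that the default "metastore" answer passes `_validate_location`:

```python
from unittest.mock import MagicMock

from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema

# Illustrative wiring only: all dependencies are mocks so the sketch stays self-contained.
catalog_schema = CatalogSchema(
    ws=MagicMock(),
    table_mapping=MagicMock(),
    principal_grants=MagicMock(),
    sql_backend=MagicMock(),
    grants_crawler=MagicMock(),
    ucx_catalog="ucx",
)

prompts = MagicMock()
prompts.question.return_value = "metastore"  # accept the prompt default

# The optional keyword-only `properties` are forwarded to catalogs.create():
catalog_schema.create_ucx_catalog(prompts, properties={"purpose": "ucx-artifacts"})
```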

src/databricks/labs/ucx/hive_metastore/grants.py

Lines changed: 3 additions & 3 deletions
@@ -579,7 +579,7 @@ def __init__(
         installation: Installation,
         tables_crawler: TablesCrawler,
         mounts_crawler: Mounts,
-        cluster_locations: list[ComputeLocations],
+        cluster_locations: Callable[[], list[ComputeLocations]],
     ):
         self._backend = backend
         self._ws = ws
@@ -593,7 +593,7 @@ def get_interactive_cluster_grants(self) -> list[Grant]:
         mounts = list(self._mounts_crawler.snapshot())
         grants: set[Grant] = set()

-        for compute_location in self._compute_locations:
+        for compute_location in self._compute_locations():
             principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type)
             if len(principals) == 0:
                 continue
@@ -697,7 +697,7 @@ def apply_location_acl(self):
             "CREATE EXTERNAL VOLUME and READ_FILES for existing eligible interactive cluster users"
         )
         # get the eligible location mapped for each interactive cluster
-        for compute_location in self._compute_locations:
+        for compute_location in self._compute_locations():
             # get interactive cluster users
             principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type)
             if len(principals) == 0:
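
Changing `cluster_locations` from a pre-computed `list[ComputeLocations]` to a zero-argument `Callable` defers the (potentially slow) eligible-location discovery until grants are actually generated, which is why both loops now call `self._compute_locations()`. A generic sketch of the same deferral, with illustrative names rather than the UCX classes:

```python
from dataclasses import dataclass
from typing import Callable


@dataclass
class Location:
    compute_id: str
    compute_type: str


class GrantsLoader:
    def __init__(self, compute_locations: Callable[[], list[Location]]) -> None:
        # Store the factory, not its result; nothing is computed at construction time.
        self._compute_locations = compute_locations

    def grants(self) -> list[str]:
        # The callable runs only now, when grants are actually requested.
        return [f"grant for {loc.compute_id} ({loc.compute_type})" for loc in self._compute_locations()]


loader = GrantsLoader(lambda: [Location("cluster-1", "clusters")])
print(loader.grants())
```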
