|
1 | 1 | import logging
|
2 | 2 | from collections.abc import Iterable
|
3 | 3 | from dataclasses import dataclass
|
| 4 | +from functools import partial |
4 | 5 |
|
| 6 | +from databricks.labs.blueprint.parallel import Threads |
5 | 7 | from databricks.labs.lsql.backends import SqlBackend
|
6 | 8 |
|
7 | 9 | from databricks.labs.ucx.framework.crawlers import CrawlerBase
|
8 | 10 | from databricks.labs.ucx.framework.utils import escape_sql_identifier
|
9 | 11 | from databricks.labs.ucx.hive_metastore import TablesCrawler
|
| 12 | +from databricks.labs.ucx.hive_metastore.tables import Table |
10 | 13 |
|
11 | 14 | logger = logging.getLogger(__name__)
|
12 | 15 |
|
@@ -40,43 +43,43 @@ def _crawl(self) -> Iterable[TableSize]:
|
40 | 43 | """Crawls and lists tables using table crawler
|
41 | 44 | Identifies DBFS root tables and calculates the size for these.
|
42 | 45 | """
|
| 46 | + tasks = [] |
43 | 47 | for table in self._tables_crawler.snapshot():
|
44 | 48 | if not table.kind == "TABLE":
|
45 | 49 | continue
|
46 | 50 | if not table.is_dbfs_root:
|
47 | 51 | continue
|
48 |
| - size_in_bytes = self._safe_get_table_size(table.key) |
49 |
| - if size_in_bytes is None: |
50 |
| - continue # table does not exist anymore or is corrupted |
51 |
| - |
52 |
| - yield TableSize( |
53 |
| - catalog=table.catalog, database=table.database, name=table.name, size_in_bytes=size_in_bytes |
54 |
| - ) |
| 52 | + tasks.append(partial(self._safe_get_table_size, table)) |
| 53 | + return Threads.strict('DBFS root table sizes', tasks) |
55 | 54 |
|
56 | 55 | def _try_fetch(self) -> Iterable[TableSize]:
|
57 | 56 | """Tries to load table information from the database or throws TABLE_OR_VIEW_NOT_FOUND error"""
|
58 | 57 | for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"):
|
59 | 58 | yield TableSize(*row)
|
60 | 59 |
|
    def _safe_get_table_size(self, table: Table) -> TableSize | None:
        """Compute the on-disk size of a single table, never raising.

        Refreshes the table's statistics in the Hive metastore, then reads
        the analyzed plan's size estimate from the JVM Spark session.

        Args:
            table: the table to measure; its ``safe_sql_key`` is used in SQL.

        Returns:
            A ``TableSize`` row, or ``None`` when the table is missing,
            unreadable, or corrupt — or on any other error (logged).
        """
        logger.debug(f"Evaluating {table.key} table size.")
        try:
            # refresh table statistics to avoid stale stats in HMS
            self._backend.execute(f"ANALYZE table {table.safe_sql_key} compute STATISTICS NOSCAN")
            # Private PySpark API: go through the JVM session to reach the
            # analyzed logical plan's size estimate (not exposed publicly).
            jvm_df = self._spark._jsparkSession.table(table.safe_sql_key)  # pylint: disable=protected-access
            size_in_bytes = jvm_df.queryExecution().analyzed().stats().sizeInBytes()
            return TableSize(
                catalog=table.catalog,
                database=table.database,
                name=table.name,
                size_in_bytes=size_in_bytes,
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            # Spark surfaces error classes as bracketed tags in the message;
            # match on those substrings to classify the expected failures.
            if "[TABLE_OR_VIEW_NOT_FOUND]" in str(e) or "[DELTA_TABLE_NOT_FOUND]" in str(e):
                logger.warning(f"Failed to evaluate {table.key} table size. Table not found.")
                return None
            if "[DELTA_INVALID_FORMAT]" in str(e):
                logger.warning(f"Unable to read Delta table {table.key}, please check table structure and try again.")
                return None
            if "[DELTA_MISSING_TRANSACTION_LOG]" in str(e):
                logger.warning(f"Delta table {table.key} is corrupt: missing transaction log.")
                return None
            # Unexpected failure: log with traceback but still swallow, so one
            # bad table does not abort the whole crawl.
            logger.error(f"Failed to evaluate {table.key} table size: ", exc_info=True)

        return None
|
0 commit comments