Skip to content

Commit cd57579

Browse files
authored
Delete temporary files when running solacc (#2750)
## Changes solacc.py currently lints the entire solacc repo, accumulating temporary files to the point of exceeding CI storage capacity. This PR fixes the issue by: - linting the repo per top-level solacc 'solution' (within the repo, the top-level folders are independent of each other) - deleting the temp files and dirs registered in PathLookup after linting each solution. This PR also prepares for improving false-positive detection. ### Linked issues None ### Functionality None ### Tests - [x] manually tested --------- Co-authored-by: Eric Vergnaud <eric.vergnaud@databricks.com>
1 parent 44e669b commit cd57579

File tree

3 files changed

+113
-62
lines changed

3 files changed

+113
-62
lines changed

src/databricks/labs/ucx/source_code/python/python_infer.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@
1010
Const,
1111
decorators,
1212
Dict,
13-
FormattedValue,
1413
Instance,
15-
JoinedStr,
1614
Name,
1715
NodeNG,
1816
Uninferable,

tests/integration/source_code/solacc.py

Lines changed: 112 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import logging
22
import os
3+
import shutil
34
import sys
5+
from dataclasses import dataclass, field
46
from pathlib import Path
57

68
import requests
@@ -12,6 +14,7 @@
1214
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
1315
from databricks.labs.ucx.source_code.base import LocatedAdvice
1416
from databricks.labs.ucx.source_code.linters.context import LinterContext
17+
from databricks.labs.ucx.source_code.path_lookup import PathLookup
1518

1619
logger = logging.getLogger("verify-accelerators")
1720

@@ -54,7 +57,7 @@ def collect_missing_imports(advices: list[LocatedAdvice]):
5457
return missing_imports
5558

5659

57-
def collect_not_computed(advices: list[LocatedAdvice]):
60+
def collect_uninferrable_count(advices: list[LocatedAdvice]):
5861
not_computed = 0
5962
for located_advice in advices:
6063
if "computed" in located_advice.advice.message:
@@ -68,93 +71,143 @@ def print_advices(advices: list[LocatedAdvice], file: Path):
6871
sys.stdout.write(f"{message}\n")
6972

7073

71-
def lint_one(file: Path, ctx: LocalCheckoutContext, unparsed: Path | None) -> tuple[set[str], int, int]:
74+
@dataclass
75+
class SolaccContext:
76+
unparsed_path: Path | None = None
77+
files_to_skip: set[str] | None = None
78+
total_count = 0
79+
parseable_count = 0
80+
uninferrable_count = 0
81+
missing_imports: dict[str, dict[str, int]] = field(default_factory=dict)
82+
83+
@classmethod
84+
def create(cls, for_all_dirs: bool):
85+
unparsed_path: Path | None = None
86+
# if lint_all, recreate "solacc-unparsed.txt"
87+
if for_all_dirs:
88+
unparsed_path = Path(Path(__file__).parent, "solacc-unparsed.txt")
89+
if unparsed_path.exists():
90+
os.remove(unparsed_path)
91+
files_to_skip: set[str] | None = None
92+
malformed = Path(__file__).parent / "solacc-malformed.txt"
93+
if for_all_dirs and malformed.exists():
94+
lines = malformed.read_text(encoding="utf-8").split("\n")
95+
files_to_skip = set(line for line in lines if len(line) > 0 and not line.startswith("#"))
96+
return SolaccContext(unparsed_path=unparsed_path, files_to_skip=files_to_skip)
97+
98+
def register_missing_import(self, missing_import: str):
99+
prefix = missing_import.split(".")[0]
100+
details = self.missing_imports.get(prefix, None)
101+
if details is None:
102+
details = {}
103+
self.missing_imports[prefix] = details
104+
count = details.get(missing_import, 0)
105+
details[missing_import] = count + 1
106+
107+
def log_missing_imports(self):
108+
missing_imports = dict(
109+
sorted(self.missing_imports.items(), key=lambda item: sum(item[1].values()), reverse=True)
110+
)
111+
for prefix, details in missing_imports.items():
112+
logger.info(f"Missing import '{prefix}'")
113+
for item, count in details.items():
114+
logger.info(f" {item}: {count} occurrences")
115+
116+
117+
def lint_one(solacc: SolaccContext, file: Path, ctx: LocalCheckoutContext) -> None:
72118
try:
73119
advices = list(ctx.local_code_linter.lint_path(file, set()))
74-
missing_imports = collect_missing_imports(advices)
75-
not_computed = collect_not_computed(advices)
120+
solacc.parseable_count += 1
121+
for missing_import in collect_missing_imports(advices):
122+
solacc.register_missing_import(missing_import)
123+
solacc.uninferrable_count += collect_uninferrable_count(advices)
76124
print_advices(advices, file)
77-
return missing_imports, 1, not_computed
78125
except Exception as e: # pylint: disable=broad-except
79126
# here we're most likely catching astroid & sqlglot errors
80-
if unparsed is None: # linting single file, log exception details
81-
logger.error(f"Error during parsing of {file}: {e}".replace("\n", " "), exc_info=e)
82-
else:
127+
# when linting single file, log exception details
128+
logger.error(
129+
f"Error during parsing of {file}: {e}".replace("\n", " "),
130+
exc_info=e if solacc.unparsed_path is None else None,
131+
)
132+
if solacc.unparsed_path:
83133
logger.error(f"Error during parsing of {file}: {e}".replace("\n", " "))
84134
# populate solacc-unparsed.txt
85-
with unparsed.open(mode="a", encoding="utf-8") as f:
135+
with solacc.unparsed_path.open(mode="a", encoding="utf-8") as f:
86136
f.write(file.relative_to(dist).as_posix())
87137
f.write("\n")
88-
return set(), 0, 0
89138

90139

91-
def lint_all(file_to_lint: str | None):
140+
class _CleanablePathLookup(PathLookup):
141+
142+
def __init__(self):
143+
super().__init__(Path.cwd(), [Path(path) for path in sys.path])
144+
self._original_sys_paths = set(self._sys_paths)
145+
146+
def clean_tmp_sys_paths(self):
147+
for path in self._sys_paths:
148+
if path in self._original_sys_paths:
149+
continue
150+
if path.is_file():
151+
path.unlink()
152+
if path.is_dir():
153+
shutil.rmtree(path)
154+
155+
156+
def lint_dir(solacc: SolaccContext, soldir: Path, file_to_lint: str | None = None):
157+
path_lookup = _CleanablePathLookup()
92158
ws = WorkspaceClient(host='...', token='...')
93159
ctx = LocalCheckoutContext(ws).replace(
94-
linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state)
160+
linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state),
161+
path_lookup=path_lookup,
95162
)
96-
parseable = 0
97-
not_computed = 0
98-
missing_imports: dict[str, dict[str, int]] = {}
99-
all_files = list(dist.glob('**/*.py')) if file_to_lint is None else [Path(dist, file_to_lint)]
100-
unparsed: Path | None = None
101-
if file_to_lint is None:
102-
unparsed = Path(Path(__file__).parent, "solacc-unparsed.txt")
103-
if unparsed.exists():
104-
os.remove(unparsed)
105-
skipped: set[str] | None = None
106-
malformed = Path(__file__).parent / "solacc-malformed.txt"
107-
if file_to_lint is None and malformed.exists():
108-
lines = malformed.read_text(encoding="utf-8").split("\n")
109-
skipped = set(line for line in lines if len(line) > 0 and not line.startswith("#"))
163+
all_files = list(soldir.glob('**/*.py')) if file_to_lint is None else [Path(soldir, file_to_lint)]
164+
solacc.total_count += len(all_files)
110165
for file in all_files:
111-
if skipped and file.relative_to(dist).as_posix() in skipped:
166+
if solacc.files_to_skip and file.relative_to(dist).as_posix() in solacc.files_to_skip:
112167
continue
113-
_missing_imports, _parseable, _not_computed = lint_one(file, ctx, unparsed)
114-
for _import in _missing_imports:
115-
register_missing_import(missing_imports, _import)
116-
parseable += _parseable
117-
not_computed += _not_computed
118-
all_files_len = len(all_files) - (len(skipped) if skipped else 0)
119-
parseable_pct = int(parseable / all_files_len * 100)
120-
missing_imports_count = sum(sum(details.values()) for details in missing_imports.values())
168+
lint_one(solacc, file, ctx)
169+
# cleanup tmp dirs
170+
path_lookup.clean_tmp_sys_paths()
171+
172+
173+
def lint_file(file_to_lint: str):
174+
solacc = SolaccContext.create(False)
175+
file_path = Path(file_to_lint)
176+
lint_dir(solacc, file_path.parent, file_path.name)
177+
178+
179+
def lint_all():
180+
solacc = SolaccContext.create(True)
181+
for soldir in os.listdir(dist):
182+
lint_dir(solacc, dist / soldir)
183+
all_files_len = solacc.total_count - (len(solacc.files_to_skip) if solacc.files_to_skip else 0)
184+
parseable_pct = int(solacc.parseable_count / all_files_len * 100)
185+
missing_imports_count = sum(sum(details.values()) for details in solacc.missing_imports.values())
121186
logger.info(
122-
f"Skipped: {len(skipped or [])}, parseable: {parseable_pct}% ({parseable}/{all_files_len}), missing imports: {missing_imports_count}, not computed: {not_computed}"
187+
f"Skipped: {len(solacc.files_to_skip or [])}, "
188+
f"parseable: {parseable_pct}% ({solacc.parseable_count}/{all_files_len}), "
189+
f"missing imports: {missing_imports_count}, "
190+
f"not computed: {solacc.uninferrable_count}"
123191
)
124-
log_missing_imports(missing_imports)
192+
solacc.log_missing_imports()
125193
# fail the job if files are unparseable
126194
if parseable_pct < 100:
127195
sys.exit(1)
128196

129197

130-
def register_missing_import(missing_imports: dict[str, dict[str, int]], missing_import: str):
131-
prefix = missing_import.split(".")[0]
132-
details = missing_imports.get(prefix, None)
133-
if details is None:
134-
details = {}
135-
missing_imports[prefix] = details
136-
count = details.get(missing_import, 0)
137-
details[missing_import] = count + 1
138-
139-
140-
def log_missing_imports(missing_imports: dict[str, dict[str, int]]):
141-
missing_imports = dict(sorted(missing_imports.items(), key=lambda item: sum(item[1].values()), reverse=True))
142-
for prefix, details in missing_imports.items():
143-
logger.info(f"Missing import '{prefix}'")
144-
for item, count in details.items():
145-
logger.info(f" {item}: {count} occurrences")
146-
147-
148198
def main(args: list[str]):
149199
install_logger()
150200
logging.root.setLevel(logging.INFO)
151201
file_to_lint = args[1] if len(args) > 1 else None
152-
if not file_to_lint:
202+
if file_to_lint:
153203
# don't clone if linting just one file, assumption is we're troubleshooting
154-
logger.info("Cloning...")
155-
clone_all()
204+
logger.info("Linting...")
205+
lint_file(file_to_lint)
206+
return
207+
logger.info("Cloning...")
208+
clone_all()
156209
logger.info("Linting...")
157-
lint_all(file_to_lint)
210+
lint_all()
158211

159212

160213
if __name__ == "__main__":

tests/unit/source_code/test_notebook.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def test_notebook_builds_python_dependency_graph_with_fstring_loop(mock_path_loo
249249
container = maybe.dependency.load(mock_path_lookup)
250250
assert container is not None
251251
container.build_dependency_graph(graph)
252-
expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"]
252+
expected_paths = [path, "leaf1.py", "leaf2.py", "leaf3.py"]
253253
all_paths = set(d.path for d in graph.all_dependencies)
254254
assert all_paths == {mock_path_lookup.cwd / path for path in expected_paths}
255255

0 commit comments

Comments
 (0)