|
145 | 145 | import org.apache.iceberg.FileFormat;
|
146 | 146 | import org.apache.iceberg.FileMetadata;
|
147 | 147 | import org.apache.iceberg.FileScanTask;
|
| 148 | +import org.apache.iceberg.IcebergManifestUtils; |
148 | 149 | import org.apache.iceberg.IsolationLevel;
|
149 | 150 | import org.apache.iceberg.ManifestFile;
|
150 | 151 | import org.apache.iceberg.ManifestFiles;
|
@@ -2260,11 +2261,18 @@ private void removeOrphanFiles(Table table, ConnectorSession session, SchemaTabl
|
2260 | 2261 | ImmutableSet.Builder<String> validDataFileNames = ImmutableSet.builder();
|
2261 | 2262 |
|
2262 | 2263 | for (Snapshot snapshot : table.snapshots()) {
|
2263 |
| - if (snapshot.manifestListLocation() != null) { |
2264 |
| - validMetadataFileNames.add(fileName(snapshot.manifestListLocation())); |
| 2264 | + String manifestListLocation = snapshot.manifestListLocation(); |
| 2265 | + List<ManifestFile> allManifests; |
| 2266 | + if (manifestListLocation != null) { |
| 2267 | + validMetadataFileNames.add(fileName(manifestListLocation)); |
| 2268 | + allManifests = loadAllManifestsFromManifestList(table, manifestListLocation); |
| 2269 | + } |
| 2270 | + else { |
| 2271 | + // Maintain support for V1 tables, which may embed the manifests in the snapshot instead of referencing a manifest list file |
| 2272 | + allManifests = loadAllManifestsFromSnapshot(table, snapshot); |
2265 | 2273 | }
|
2266 | 2274 |
|
2267 |
| - for (ManifestFile manifest : loadAllManifestsFromSnapshot(table, snapshot)) { |
| 2275 | + for (ManifestFile manifest : allManifests) { |
2268 | 2276 | if (!processedManifestFilePaths.add(manifest.path())) {
|
2269 | 2277 | // Already read this manifest
|
2270 | 2278 | continue;
|
@@ -3469,6 +3477,21 @@ private static List<ManifestFile> loadAllManifestsFromSnapshot(Table icebergTabl
|
3469 | 3477 | }
|
3470 | 3478 | }
|
3471 | 3479 |
|
| 3480 | + /** |
| 3481 | + * Reads the manifests from the manifest list at manifestListLocation via icebergTable.io(). Use instead of loadAllManifestsFromSnapshot when loading manifests from multiple distinct snapshots.
| 3482 | + * Each BaseSnapshot object caches manifest files separately, so loading manifests through multiple distinct snapshot objects
| 3483 | + * results in O(num_snapshots^2) copies of the same manifest file metadata in memory. NotFoundException and UncheckedIOException are rethrown as TrinoException with ICEBERG_INVALID_METADATA.
| 3484 | + */ |
| 3485 | + private static List<ManifestFile> loadAllManifestsFromManifestList(Table icebergTable, String manifestListLocation) |
| 3486 | + { |
| 3487 | + try { |
| 3488 | + return IcebergManifestUtils.read(icebergTable.io(), manifestListLocation); |
| 3489 | + } |
| 3490 | + catch (NotFoundException | UncheckedIOException e) { |
| 3491 | + throw new TrinoException(ICEBERG_INVALID_METADATA, "Error accessing manifest file for table %s".formatted(icebergTable.name()), e); |
| 3492 | + } |
| 3493 | + } |
| 3494 | + |
3472 | 3495 | private static Set<Integer> identityPartitionColumnsInAllSpecs(Table table)
|
3473 | 3496 | {
|
3474 | 3497 | // Extract identity partition column source ids common to ALL specs
|
|
0 commit comments