From 152d21f6c22e48d449bbcf2ac46ca527426fbadf Mon Sep 17 00:00:00 2001
From: Armijn Hemel
Date: Mon, 15 May 2023 21:02:19 +0200
Subject: [PATCH 1/6] add specification for bzip3

---
 archive/bzip3.ksy | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 archive/bzip3.ksy

diff --git a/archive/bzip3.ksy b/archive/bzip3.ksy
new file mode 100644
index 000000000..081af2406
--- /dev/null
+++ b/archive/bzip3.ksy
@@ -0,0 +1,35 @@
+meta:
+  id: bzip3
+  title: Bzip3 header
+  file-extension: bz3
+  license: LGPL-3.0
+  encoding: UTF-8
+  endian: le
+doc-ref: https://github.com/kspalaiologos/bzip3
+seq:
+  - id: header
+    type: header
+  - id: blocks
+    type: compressed_data_block
+    repeat: until
+    repeat-until: _io.eof or _.is_last
+types:
+  header:
+    seq:
+      - id: signature
+        contents: 'BZ3v1'
+      - id: block_size
+        type: u4
+  compressed_data_block:
+    seq:
+      - id: len_compressed
+        type: u4
+      - id: len_uncompressed
+        type: u4
+        valid:
+          max: _root.header.block_size
+      - id: data
+        size: len_compressed
+    instances:
+      is_last:
+        value: len_uncompressed < _root.header.block_size

From 41fef77ddea72653a64cd48ffae0e4dd59be4ca4 Mon Sep 17 00:00:00 2001
From: Armijn Hemel
Date: Mon, 12 May 2025 14:26:56 +0200
Subject: [PATCH 2/6] bzip3: add reference, add doc string, add validation check

---
 archive/bzip3.ksy | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/archive/bzip3.ksy b/archive/bzip3.ksy
index 081af2406..8ac3dd54e 100644
--- a/archive/bzip3.ksy
+++ b/archive/bzip3.ksy
@@ -5,7 +5,14 @@ meta:
   license: LGPL-3.0
   encoding: UTF-8
   endian: le
-doc-ref: https://github.com/kspalaiologos/bzip3
+doc:
+  bzip3 is a decompression tool and library. Depending on whether or not the
+  CLI or the library is used the file format is slightly different. This
+  specification describes the structure of the format as generated by the CLI
+  (header and chunks, not blocks).
+doc-ref:
+  - https://github.com/kspalaiologos/bzip3
+  - https://github.com/kspalaiologos/bzip3/blob/972e6694b815/doc/bzip3_format.md
 seq:
   - id: header
     type: header
@@ -20,6 +27,9 @@ types:
         contents: 'BZ3v1'
       - id: block_size
         type: u4
+        valid:
+          min: 66560
+          max: 535822336
   compressed_data_block:
     seq:
       - id: len_compressed

From 4d731475b2675fe7cd7295613c87c51def0a4310 Mon Sep 17 00:00:00 2001
From: Armijn Hemel
Date: Mon, 12 May 2025 14:58:45 +0200
Subject: [PATCH 3/6] bzip3: change license, remove encoding, various cosmetic fixes, block_size -> max_block_size

---
 archive/bzip3.ksy | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/archive/bzip3.ksy b/archive/bzip3.ksy
index 8ac3dd54e..de97f42f1 100644
--- a/archive/bzip3.ksy
+++ b/archive/bzip3.ksy
@@ -2,8 +2,7 @@ meta:
   id: bzip3
   title: Bzip3 header
   file-extension: bz3
-  license: LGPL-3.0
-  encoding: UTF-8
+  license: CC0-1.0
   endian: le
 doc:
   bzip3 is a decompression tool and library. Depending on whether or not the
@@ -25,11 +24,11 @@ types:
     seq:
       - id: signature
         contents: 'BZ3v1'
-      - id: block_size
+      - id: max_block_size
         type: u4
         valid:
-          min: 66560
-          max: 535822336
+          min: 66_560 # 65 KiB
+          max: 535_822_336 # 511 MiB
   compressed_data_block:
     seq:
       - id: len_compressed
@@ -37,9 +36,9 @@ types:
       - id: len_uncompressed
         type: u4
         valid:
-          max: _root.header.block_size
+          max: _root.header.max_block_size
       - id: data
         size: len_compressed
     instances:
       is_last:
-        value: len_uncompressed < _root.header.block_size
+        value: len_uncompressed < _root.header.max_block_size

From 3338f257556506075df33f72ea4adcaa1604ad0b Mon Sep 17 00:00:00 2001
From: Armijn Hemel
Date: Mon, 12 May 2025 16:05:19 +0200
Subject: [PATCH 4/6] bzip3: fix length check, explain is_last instance

---
 archive/bzip3.ksy | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/archive/bzip3.ksy b/archive/bzip3.ksy
index de97f42f1..966cbd6fc 100644
--- a/archive/bzip3.ksy
+++ b/archive/bzip3.ksy
@@ -9,6 +9,24 @@ doc:
   CLI or the library is used the file format is slightly different. This
   specification describes the structure of the format as generated by the CLI
   (header and chunks, not blocks).
+
+  bzip3 assumes that the entire file is a bzip3 archive and will fail as there
+  is no reliable way to detect if data following a chunk is a next chunk or if
+  it is other data (like what happens in concatenated files).
+
+  A simple example to illustrate:
+
+    $ cp /bin/ls .
+    $ bzip3 ls
+    $ cat ls.bz3 ls > test.bz3
+    $ bz3cat test.bz3 > /dev/null
+    Failed to decode a block: Inconsistent headers.
+
+    If a potential chunk has an invalid length (len_compressed), then the end of
+    the file has been reached. This check will not always work as it is possible
+    to have data where the first four bytes will be a valid length. With
+    additional block parsing and CRC checks it will be possible to detect invalid
+    blocks. This is future work.
 doc-ref:
   - https://github.com/kspalaiologos/bzip3
   - https://github.com/kspalaiologos/bzip3/blob/972e6694b815/doc/bzip3_format.md

From 03ad5f51971da6d239699bb7b4c7eb6a0df91605 Mon Sep 17 00:00:00 2001
From: Armijn Hemel
Date: Mon, 12 May 2025 16:10:57 +0200
Subject: [PATCH 5/6] bzip3: fix doc

---
 archive/bzip3.ksy | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/archive/bzip3.ksy b/archive/bzip3.ksy
index 966cbd6fc..e8a64d134 100644
--- a/archive/bzip3.ksy
+++ b/archive/bzip3.ksy
@@ -4,7 +4,7 @@ meta:
   file-extension: bz3
   license: CC0-1.0
   endian: le
-doc:
+doc: |
   bzip3 is a decompression tool and library. Depending on whether or not the
   CLI or the library is used the file format is slightly different. This
   specification describes the structure of the format as generated by the CLI
@@ -16,17 +16,17 @@ doc:
 
   A simple example to illustrate:
 
-    $ cp /bin/ls .
-    $ bzip3 ls
-    $ cat ls.bz3 ls > test.bz3
-    $ bz3cat test.bz3 > /dev/null
-    Failed to decode a block: Inconsistent headers.
+      $ cp /bin/ls .
+      $ bzip3 ls
+      $ cat ls.bz3 ls > test.bz3
+      $ bz3cat test.bz3 > /dev/null
+      Failed to decode a block: Inconsistent headers.
 
-    If a potential chunk has an invalid length (len_compressed), then the end of
-    the file has been reached. This check will not always work as it is possible
-    to have data where the first four bytes will be a valid length. With
-    additional block parsing and CRC checks it will be possible to detect invalid
-    blocks. This is future work.
+  If a potential chunk has an invalid length (len_compressed), then the end of
+  the file has been reached. This check will not always work as it is possible
+  to have data where the first four bytes will be a valid length. With
+  additional block parsing and CRC checks it will be possible to detect invalid
+  blocks. This is future work.
 doc-ref:
   - https://github.com/kspalaiologos/bzip3
   - https://github.com/kspalaiologos/bzip3/blob/972e6694b815/doc/bzip3_format.md

From f33c2af5855e7ea7d60dc243775668ec3c82f8d6 Mon Sep 17 00:00:00 2001
From: Armijn Hemel
Date: Mon, 12 May 2025 16:12:14 +0200
Subject: [PATCH 6/6] bzip3: keep strict length check, document is_last

`len_uncompressed` is validated to be at most `max_block_size`, so
comparing with `<=` would make `is_last` true for every chunk that parses
and `repeat-until` would stop after the first chunk. Keep the strict `<`
comparison and document the reasoning on the instance.

---
 archive/bzip3.ksy | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/archive/bzip3.ksy b/archive/bzip3.ksy
--- a/archive/bzip3.ksy
+++ b/archive/bzip3.ksy
@@ -59,4 +59,12 @@ types:
         size: len_compressed
     instances:
       is_last:
+        doc: |
+          True when this chunk holds less uncompressed data than
+          `max_block_size`; such a chunk is treated as the final one and
+          ends the `blocks` loop. A final chunk that is exactly
+          `max_block_size` long is handled by the `_io.eof` condition.
+          The comparison must stay strict: `len_uncompressed` is already
+          validated to be at most `max_block_size`, so `<=` would be true
+          for every chunk and parsing would stop after the first one.
         value: len_uncompressed < _root.header.max_block_size