Task 072: .BZ2 File Format
Task 072: .BZ2 File Format
File Format Specifications for .BZ2
The .BZ2 file format is associated with bzip2, a block-sorting compression algorithm developed by Julian Seward. It is designed for compressing single files using the Burrows-Wheeler transform, run-length encoding, and Huffman coding. The format supports block sizes ranging from 100 KB to 900 KB and is commonly used on Unix-like systems. There is no official formal specification, but reverse-engineered descriptions are available from sources such as the Library of Congress digital formats documentation and community wikis. A bzip2 stream begins with a 4-byte header, followed by zero or more bit-aligned compressed blocks, and concludes with an end-of-stream marker including a 32-bit CRC checksum. The format does not include metadata such as filenames or timestamps, focusing solely on data compression.
List of Properties Intrinsic to the .BZ2 File Format
The following properties are derived from the file format's structure and are intrinsic to its organization. These include elements from the header and stream-level checksum, which can be extracted without full decompression:
- Magic signature: The first two bytes, always 'BZ' (hex: 42 5A), identifying the format.
- Version identifier: The third byte, 'h' (hex: 68), indicating Huffman coding.
- Block size multiplier: The fourth byte, a digit from '1' to '9' (hex: 31 to 39), representing the block size as 100 KB × multiplier (e.g., '9' for 900 KB).
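- Stream CRC: A 32-bit cyclic redundancy check (CRC) of the uncompressed data, stored immediately after the 48-bit end-of-stream magic number (hex: 177245385090). This 80-bit trailer is bit-aligned rather than byte-aligned and is followed by 0 to 7 padding bits that complete the final byte, so it can begin at any bit offset within the last 11 bytes of the file.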
Two Direct Download Links for .BZ2 Files
- https://ftp.gnu.org/gnu/hello/hello-2.12.tar.bz2 (GNU Hello program source code, compressed with bzip2).
- https://ftp.gnu.org/gnu/sed/sed-4.9.tar.bz2 (GNU sed utility source code, compressed with bzip2).
Ghost Blog Embedded HTML/JavaScript for Drag-and-Drop .BZ2 Property Dump
The following is a self-contained HTML page with embedded JavaScript. It provides a drag-and-drop interface where users can drop a .BZ2 file. The script parses the file, extracts the listed properties (including handling bit alignment for the stream CRC), and displays them on the screen. Save this as an HTML file and open it in a modern browser.
Drag and Drop .BZ2 File
Python Class for Handling .BZ2 Files
The following Python class opens a .BZ2 file, decodes the properties, reads and prints them to the console. For writing, it includes a method to create a new .BZ2 file by compressing provided data (using the built-in bz2
module for compression, as full from-scratch implementation exceeds scope).
import bz2
class Bz2Handler:
def __init__(self):
self.magic = None
self.version = None
self.block_multiplier = None
self.stream_crc = None
def open_and_decode(self, filename):
with open(filename, 'rb') as f:
data = f.read()
if len(data) < 14:
raise ValueError("File too small for .BZ2 format.")
# Decode header
self.magic = chr(data[0]) + chr(data[1])
self.version = chr(data[2])
self.block_multiplier = int(chr(data[3]))
# Decode stream CRC from last 10 bytes
last10 = data[-10:]
bit_string = ''.join(bin(byte)[2:].zfill(8) for byte in last10)
end_magic = 0x177245385090
for shift in range(8):
magic_bits = int(bit_string[shift:shift+48], 2)
if magic_bits == end_magic:
crc_bits = bit_string[shift+48:shift+80]
self.stream_crc = hex(int(crc_bits, 2))[2:].upper().zfill(8)
break
else:
raise ValueError("Stream end marker not found.")
def print_properties(self):
print(f"Magic: {self.magic}")
print(f"Version: {self.version}")
print(f"Block Size Multiplier: {self.block_multiplier}")
print(f"Stream CRC: 0x{self.stream_crc}")
def write_new_file(self, output_filename, uncompressed_data, compress_level=9):
compressed = bz2.compress(uncompressed_data, compresslevel=compress_level)
with open(output_filename, 'wb') as f:
f.write(compressed)
# Example usage:
# handler = Bz2Handler()
# handler.open_and_decode('example.bz2')
# handler.print_properties()
# handler.write_new_file('new.bz2', b'Hello, world!')
Java Class for Handling .BZ2 Files
The following Java class opens a .BZ2 file, decodes the properties, reads and prints them to the console. For writing, it uses Apache Commons Compress (assuming availability) to create a new .BZ2 file.
import java.io.*;
import java.math.BigInteger;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; // Requires Apache Commons Compress library
/**
 * Reads the container-level properties of a .BZ2 file (signature, version,
 * block-size digit and stream CRC) and writes new .BZ2 files.
 */
public class Bz2Handler {
    /** 48-bit marker that introduces the end-of-stream trailer. */
    private static final long END_OF_STREAM_MAGIC = 0x177245385090L;
    /** The stream header is 'B', 'Z', 'h', level digit. */
    private static final int HEADER_BYTES = 4;
    /** Smallest possible stream: 4 header bytes + 10 trailer bytes. */
    private static final int MIN_FILE_BYTES = 14;
    /** Trailer = 48-bit marker + 32-bit CRC. */
    private static final int TRAILER_BITS = 80;
    /**
     * The trailer is bit-aligned and followed by 0-7 padding bits, so it can
     * start up to 7 bits before the last 10 bytes: 11 bytes always cover it.
     */
    private static final int MAX_TAIL_BYTES = 11;

    private String magic;
    private String version;
    private int blockMultiplier;
    private String streamCrc;

    /**
     * Parses the header and the end-of-stream CRC of the given file. Only the
     * 4-byte header and the last few bytes are read; nothing is decompressed.
     *
     * @param filename path of the .BZ2 file to inspect
     * @throws IOException if the file cannot be read, is too small, does not
     *                     start with a bzip2 header, or has no end-of-stream
     *                     marker at its end
     */
    public void openAndDecode(String filename) throws IOException {
        // Forget the previous file so a failed decode never leaves stale values
        // (a stale streamCrc would also defeat the "not found" check below).
        magic = null;
        version = null;
        blockMultiplier = 0;
        streamCrc = null;

        byte[] header = new byte[HEADER_BYTES];
        byte[] tail;
        try (RandomAccessFile file = new RandomAccessFile(filename, "r")) {
            long size = file.length();
            if (size < MIN_FILE_BYTES) {
                throw new IOException("File too small for .BZ2 format.");
            }
            // readFully either fills the buffer or throws; a plain read() may
            // legally return fewer bytes than requested.
            file.readFully(header);
            // Never let the search window overlap the header bytes.
            tail = new byte[(int) Math.min(MAX_TAIL_BYTES, size - HEADER_BYTES)];
            file.seek(size - tail.length);
            file.readFully(tail);
        }

        // Decode header
        if (header[0] != 'B' || header[1] != 'Z' || header[2] != 'h'
                || header[3] < '1' || header[3] > '9') {
            throw new IOException("Invalid .BZ2 header.");
        }
        magic = "" + (char) header[0] + (char) header[1];
        version = "" + (char) header[2];
        blockMultiplier = header[3] - '0';

        // Decode stream CRC: try every possible amount of trailing padding and
        // look for the marker exactly 80 bits before the padding starts.
        int totalBits = tail.length * 8;
        for (int padding = 0; padding < 8 && totalBits - padding >= TRAILER_BITS; padding++) {
            int trailerStart = totalBits - padding - TRAILER_BITS;
            if (readBits(tail, trailerStart, 48) == END_OF_STREAM_MAGIC) {
                streamCrc = String.format("%08X", readBits(tail, trailerStart + 48, 32));
                break;
            }
        }
        if (streamCrc == null) {
            throw new IOException("Stream end marker not found.");
        }
    }

    /**
     * Reads {@code bitCount} bits (at most 64) from {@code bytes}, starting at
     * bit {@code bitOffset}, most significant bit of each byte first.
     */
    private static long readBits(byte[] bytes, int bitOffset, int bitCount) {
        long value = 0;
        for (int i = 0; i < bitCount; i++) {
            int position = bitOffset + i;
            int bit = (bytes[position >>> 3] >>> (7 - (position & 7))) & 1;
            value = (value << 1) | bit;
        }
        return value;
    }

    /** Prints the decoded properties to standard output, one per line. */
    public void printProperties() {
        System.out.println("Magic: " + magic);
        System.out.println("Version: " + version);
        System.out.println("Block Size Multiplier: " + blockMultiplier);
        System.out.println("Stream CRC: 0x" + streamCrc);
    }

    /**
     * Compresses {@code uncompressedData} with bzip2 into {@code outputFilename}.
     *
     * @param outputFilename   path of the file to create or overwrite
     * @param uncompressedData bytes to compress
     * @param compressLevel    block-size argument handed to
     *                         {@code BZip2CompressorOutputStream}
     * @throws IOException if the file cannot be written
     */
    public void writeNewFile(String outputFilename, byte[] uncompressedData, int compressLevel) throws IOException {
        try (FileOutputStream fos = new FileOutputStream(outputFilename);
             BZip2CompressorOutputStream bzos = new BZip2CompressorOutputStream(fos, compressLevel)) {
            bzos.write(uncompressedData);
        }
    }

    // Example usage:
    // public static void main(String[] args) throws IOException {
    //     Bz2Handler handler = new Bz2Handler();
    //     handler.openAndDecode("example.bz2");
    //     handler.printProperties();
    //     handler.writeNewFile("new.bz2", "Hello, world!".getBytes(), 9);
    // }
}
JavaScript Class for Handling .BZ2 Files
The following JavaScript class (for Node.js) opens a .BZ2 file, decodes the properties, reads and prints them to the console. For writing, it uses the bz2
npm module (assuming installation) to create a new .BZ2 file.
const fs = require('fs');
// Requires 'bz2' module for writing: npm install bz2
/**
 * Reads the container-level properties of a .BZ2 file (signature, version,
 * block-size digit and stream CRC) and writes new .BZ2 files.
 */
class Bz2Handler {
  constructor() {
    this.magic = null;
    this.version = null;
    this.blockMultiplier = null;
    this.streamCrc = null;
  }

  /**
   * Parses the header and the end-of-stream CRC of the given file.
   * The compressed payload is not decompressed.
   *
   * @param {string} filename Path of the .BZ2 file to inspect.
   * @throws {Error} If the file cannot be read, is too small, does not start
   *   with a bzip2 header, or has no end-of-stream marker at its end.
   */
  openAndDecode(filename) {
    const HEADER_BYTES = 4;
    // Trailer = 48-bit marker + 32-bit CRC.
    const TRAILER_BITS = 80;
    // The trailer is bit-aligned and followed by 0-7 padding bits, so it can
    // start up to 7 bits before the last 10 bytes: 11 bytes always cover it.
    const MAX_TAIL_BYTES = 11;
    const END_OF_STREAM_MAGIC = 0x177245385090n;

    // Forget the previous file so a failed decode never leaves stale values.
    this.magic = null;
    this.version = null;
    this.blockMultiplier = null;
    this.streamCrc = null;

    const data = fs.readFileSync(filename);
    if (data.length < 14) {
      throw new Error('File too small for .BZ2 format.');
    }

    // Decode header
    const level = data[3] - 0x30;
    if (data[0] !== 0x42 || data[1] !== 0x5a || data[2] !== 0x68 || level < 1 || level > 9) {
      throw new Error('Invalid .BZ2 header.');
    }
    this.magic = String.fromCharCode(data[0], data[1]);
    this.version = String.fromCharCode(data[2]);
    this.blockMultiplier = level;

    // Decode stream CRC. Never let the search window overlap the header.
    const tailLength = Math.min(MAX_TAIL_BYTES, data.length - HEADER_BYTES);
    const tail = data.subarray(data.length - tailLength);
    let tailBits = 0n;
    for (const byte of tail) {
      tailBits = (tailBits << 8n) | BigInt(byte);
    }

    // Try every possible amount of trailing padding and look for the marker
    // exactly 80 bits before the padding starts.
    const totalBits = tailLength * 8;
    const trailerMask = (1n << BigInt(TRAILER_BITS)) - 1n;
    for (let padding = 0; padding < 8 && totalBits - padding >= TRAILER_BITS; padding++) {
      const trailer = (tailBits >> BigInt(padding)) & trailerMask;
      if ((trailer >> 32n) === END_OF_STREAM_MAGIC) {
        const crc = trailer & 0xffffffffn;
        this.streamCrc = crc.toString(16).toUpperCase().padStart(8, '0');
        break;
      }
    }
    if (this.streamCrc === null) {
      throw new Error('Stream end marker not found.');
    }
  }

  /** Prints the decoded properties to the console, one per line. */
  printProperties() {
    console.log(`Magic: ${this.magic}`);
    console.log(`Version: ${this.version}`);
    console.log(`Block Size Multiplier: ${this.blockMultiplier}`);
    console.log(`Stream CRC: 0x${this.streamCrc}`);
  }

  /**
   * Compresses `uncompressedData` with the 'bz2' module and writes the result
   * to `outputFilename`.
   *
   * @param {string} outputFilename Path of the file to create or overwrite.
   * @param {Buffer|Uint8Array} uncompressedData Bytes to compress.
   * @param {number} [compressLevel=9] Level handed to `bz2.compress`.
   * @throws {Error} If the installed 'bz2' module has no `compress` export.
   */
  writeNewFile(outputFilename, uncompressedData, compressLevel = 9) {
    const bz2 = require('bz2');
    // NOTE(review): confirm the installed 'bz2' package really exports
    // compress(); some packages published under this name only decompress.
    if (typeof bz2.compress !== 'function') {
      throw new Error("The installed 'bz2' module does not provide compress().");
    }
    const compressed = bz2.compress(uncompressedData, compressLevel);
    fs.writeFileSync(outputFilename, compressed);
  }
}
// Example usage:
// const handler = new Bz2Handler();
// handler.openAndDecode('example.bz2');
// handler.printProperties();
// handler.writeNewFile('new.bz2', Buffer.from('Hello, world!'), 9);
C++ Class for Handling .BZ2 Files
The following C++ class opens a .BZ2 file, decodes the properties, reads and prints them to the console. For writing, it uses the libbzip2 library (assuming linkage) to create a new .BZ2 file.
#include <bitset>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
#include <bzlib.h> // Link with -lbz2 for writing
/// Reads the container-level properties of a .BZ2 file (signature, version,
/// block-size digit and stream CRC) and writes new .BZ2 files via libbzip2.
class Bz2Handler {
private:
    /// 48-bit marker that introduces the end-of-stream trailer.
    static constexpr unsigned long long kEndOfStreamMagic = 0x177245385090ULL;
    /// The stream header is 'B', 'Z', 'h', level digit.
    static constexpr int kHeaderBytes = 4;
    /// Smallest possible stream: 4 header bytes + 10 trailer bytes.
    static constexpr int kMinFileBytes = 14;
    /// Trailer = 48-bit marker + 32-bit CRC.
    static constexpr size_t kTrailerBits = 80;
    /// The trailer is bit-aligned and followed by 0-7 padding bits, so it can
    /// start up to 7 bits before the last 10 bytes: 11 bytes always cover it.
    static constexpr int kMaxTailBytes = 11;

    std::string magic;
    std::string version;
    int blockMultiplier = 0;
    std::string streamCrc;

    /// Reads `bitCount` bits (at most 64) from `bytes`, starting at bit
    /// `bitOffset`, most significant bit of each byte first.
    static unsigned long long readBits(const std::vector<char>& bytes, size_t bitOffset, unsigned bitCount) {
        unsigned long long value = 0;
        for (unsigned i = 0; i < bitCount; ++i) {
            const size_t position = bitOffset + i;
            const unsigned char byte = static_cast<unsigned char>(bytes[position / 8]);
            value = (value << 1) | ((byte >> (7 - position % 8)) & 1u);
        }
        return value;
    }

public:
    /// Parses the header and the end-of-stream CRC of `filename`. Only the
    /// 4-byte header and the last few bytes are read; nothing is decompressed.
    /// @throws std::runtime_error if the file cannot be opened or read, is too
    ///         small, does not start with a bzip2 header, or has no
    ///         end-of-stream marker at its end.
    void openAndDecode(const std::string& filename) {
        // Forget the previous file so a failed decode never leaves stale values
        // (a stale streamCrc would also defeat the "not found" check below).
        magic.clear();
        version.clear();
        blockMultiplier = 0;
        streamCrc.clear();

        std::ifstream file(filename, std::ios::binary);
        if (!file) {
            throw std::runtime_error("Cannot open file.");
        }
        file.seekg(0, std::ios::end);
        // Keep the size signed: tellg() reports failure as -1.
        const std::streamoff size = file.tellg();
        if (size < 0) {
            throw std::runtime_error("Cannot determine file size.");
        }
        if (size < kMinFileBytes) {
            throw std::runtime_error("File too small for .BZ2 format.");
        }

        char header[kHeaderBytes];
        file.seekg(0, std::ios::beg);
        file.read(header, kHeaderBytes);

        // Never let the search window overlap the header bytes.
        const std::streamoff available = size - kHeaderBytes;
        const std::streamoff tailBytes = available < kMaxTailBytes ? available : kMaxTailBytes;
        std::vector<char> tail(static_cast<size_t>(tailBytes));
        file.seekg(size - tailBytes, std::ios::beg);
        file.read(tail.data(), tailBytes);
        if (!file) {
            throw std::runtime_error("Cannot read file.");
        }

        // Decode header
        if (header[0] != 'B' || header[1] != 'Z' || header[2] != 'h' ||
            header[3] < '1' || header[3] > '9') {
            throw std::runtime_error("Invalid .BZ2 header.");
        }
        magic.assign(header, 2);
        version.assign(1, header[2]);
        blockMultiplier = header[3] - '0';

        // Decode stream CRC: try every possible amount of trailing padding and
        // look for the marker exactly 80 bits before the padding starts.
        const size_t totalBits = tail.size() * 8;
        for (size_t padding = 0; padding < 8 && totalBits >= kTrailerBits + padding; ++padding) {
            const size_t trailerStart = totalBits - kTrailerBits - padding;
            if (readBits(tail, trailerStart, 48) != kEndOfStreamMagic) {
                continue;
            }
            const unsigned long long crcBits = readBits(tail, trailerStart + 48, 32);
            char buf[9];
            snprintf(buf, sizeof buf, "%08X", static_cast<unsigned int>(crcBits));
            streamCrc = buf;
            break;
        }
        if (streamCrc.empty()) {
            throw std::runtime_error("Stream end marker not found.");
        }
    }

    /// Prints the decoded properties to standard output, one per line.
    void printProperties() const {
        std::cout << "Magic: " << magic << '\n';
        std::cout << "Version: " << version << '\n';
        std::cout << "Block Size Multiplier: " << blockMultiplier << '\n';
        std::cout << "Stream CRC: 0x" << streamCrc << std::endl;
    }

    /// Compresses `dataSize` bytes at `uncompressedData` with libbzip2 into
    /// `outputFilename`.
    /// @param compressLevel passed to BZ2_bzWriteOpen as the block-size argument.
    /// @throws std::runtime_error if the output file cannot be opened, the
    ///         compressor cannot be created, or writing/closing fails.
    void writeNewFile(const std::string& outputFilename, const char* uncompressedData, size_t dataSize, int compressLevel = 9) {
        // Success code reported through libbzip2's bzerror out-parameter
        // (the value of BZ_OK in <bzlib.h>).
        constexpr int kBzOk = 0;

        FILE* outFile = fopen(outputFilename.c_str(), "wb");
        if (!outFile) {
            throw std::runtime_error("Cannot open output file.");
        }
        int bzError = kBzOk;
        BZFILE* bzFile = BZ2_bzWriteOpen(&bzError, outFile, compressLevel, 0, 0);
        if (!bzFile || bzError != kBzOk) {
            fclose(outFile);
            throw std::runtime_error("Failed to open BZ2 writer.");
        }

        // BZ2_bzWrite takes an int length, so feed large inputs in chunks.
        constexpr size_t kMaxChunk = static_cast<size_t>(std::numeric_limits<int>::max());
        const char* cursor = uncompressedData;
        size_t remaining = dataSize;
        while (remaining > 0 && bzError == kBzOk) {
            const size_t chunk = remaining < kMaxChunk ? remaining : kMaxChunk;
            // libbzip2 only reads from the buffer; the cast satisfies its
            // non-const void* parameter.
            BZ2_bzWrite(&bzError, bzFile, const_cast<char*>(cursor), static_cast<int>(chunk));
            cursor += chunk;
            remaining -= chunk;
        }

        const bool writeFailed = bzError != kBzOk;
        int closeError = kBzOk;
        // After a failed write, abandon the stream instead of flushing it.
        BZ2_bzWriteClose(&closeError, bzFile, writeFailed ? 1 : 0, nullptr, nullptr);
        const bool fileCloseFailed = fclose(outFile) != 0;
        if (writeFailed || closeError != kBzOk || fileCloseFailed) {
            throw std::runtime_error("Failed to write BZ2 data.");
        }
    }
};
// Example usage:
// int main() {
// Bz2Handler handler;
// try {
// handler.openAndDecode("example.bz2");
// handler.printProperties();
// const char* data = "Hello, world!";
// handler.writeNewFile("new.bz2", data, strlen(data), 9);
// } catch (const std::exception& e) {
// std::cerr << e.what() << std::endl;
// }
// return 0;
// }