Task 277: .GZ File Format
Task 277: .GZ File Format
1. Properties of the .GZ File Format
The .GZ file format (GZIP, version 4.3) is a lossless compression format defined in RFC 1952. It consists of one or more concatenated "members," each with a fixed 10-byte header, optional fields, compressed data (using DEFLATE), and an 8-byte trailer. All multi-byte integer fields are stored little-endian (least-significant byte first). Below is a complete list of all intrinsic properties (fields and their attributes) in the format, focusing on structural elements independent of the underlying file system:
- Magic Number (ID1 and ID2): 2 bytes (fixed values: 0x1F followed by 0x8B), identifies the file as GZIP.
- Compression Method (CM): 1 byte (typically 8 for DEFLATE; values 0-7 reserved).
- Flags (FLG): 1 byte bitfield:
- Bit 0 (FTEXT): 1 if the original data is likely ASCII text.
- Bit 1 (FHCRC): 1 if a 2-byte header CRC16 is present (stored after the optional extra, filename, and comment fields, immediately before the compressed data).
- Bit 2 (FEXTRA): 1 if an extra field is present.
- Bit 3 (FNAME): 1 if an original filename string follows.
- Bit 4 (FCOMMENT): 1 if a comment string follows.
- Bits 5-7: Reserved (must be 0).
- Modification Time (MTIME): 4 bytes (Unix timestamp in seconds since 1970-01-01 00:00:00 UTC; 0 if unknown).
- Extra Flags (XFL): 1 byte (for DEFLATE: 2 = maximum compression, 4 = fastest compression; other values implementation-specific).
- Operating System (OS): 1 byte (identifies the OS where compression occurred: 0=FAT/MS-DOS, 1=Amiga, 2=VMS, 3=Unix, 4=VM/CMS, 5=Atari TOS, 6=HPFS, 7=Macintosh, 8=Z-System, 9=CP/M, 10=TOPS-20, 11=NTFS, 12=QDOS, 13=Acorn RISCOS, 255=unknown).
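- Header CRC16: 2 bytes (the two least-significant bytes of the CRC-32 of all header bytes preceding it; stored after the extra field, filename, and comment, immediately before the compressed data; present only if FLG.FHCRC is set).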
- Extra Field Length (XLEN): 2 bytes (length of the extra field in bytes; present only if FLG.FEXTRA is set).
- Extra Fields: Variable length (subfields with 1-byte SI1, 1-byte SI2 identifiers, 2-byte LEN, and LEN bytes of data; present only if FLG.FEXTRA is set; e.g., SI1=0x41, SI2=0x70 for Apollo file type).
- Original Filename: Variable-length NUL-terminated string (in ISO-8859-1 encoding, lowercase if from case-insensitive FS; present only if FLG.FNAME is set; empty if data from non-file source).
- File Comment: Variable-length NUL-terminated string (in ISO-8859-1 encoding, for human-readable notes; present only if FLG.FCOMMENT is set).
- CRC32: 4 bytes (32-bit CRC of the uncompressed original data).
- ISIZE: 4 bytes (length of the uncompressed original data, modulo 2^32).
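The header properties are read sequentially, starting with the fixed header. The variable-length compressed data follows the header (it is not listed as a printable property here, as it is opaque binary), and CRC32 and ISIZE form the 8-byte trailer after the compressed data.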
2. Two Direct Download Links for .GZ Files
- Sample amendments.txt.gz: https://www.fileformat.info/format/gzip/sample/9b070e2c4e77494eb1b2f4cf34a73870/download
- Sample constitution.txt.gz: https://www.fileformat.info/format/gzip/sample/e119d9a8ef8f4ea8bd22d95c90470a21/download
3. Ghost Blog Embedded HTML JavaScript
Embed the following self-contained HTML snippet into a Ghost blog post (use the HTML card in the editor). It creates a drag-and-drop zone for .GZ files. Upon drop, it parses the header using the File API and dumps all properties to a `<pre>` block below.
4. Python Class
import struct
class GZParser:
    """Print the header and trailer fields of a GZIP (RFC 1952) file.

    The whole file is read into memory and the fields are printed in the
    order they occur in the file: fixed header, extra field, filename,
    comment, header CRC16, then the 8-byte trailer.

    All multi-byte integers in the format are little-endian.
    """

    def __init__(self, filename):
        """Read *filename* as bytes into ``self.data`` and print its fields."""
        with open(filename, 'rb') as f:
            self.data = f.read()
        self.parse()

    def _read_cstring(self, pos):
        """Decode the NUL-terminated Latin-1 string starting at *pos*.

        Returns ``(text, next_pos)`` where *next_pos* is the offset just past
        the terminator, or ``None`` when no terminator exists after *pos*.
        """
        end = self.data.find(b'\x00', pos)
        if end == -1:
            return None
        return self.data[pos:end].decode('latin-1'), end + 1

    def parse(self):
        """Print every header/trailer field of ``self.data`` to stdout.

        Prints "Invalid GZIP file" when the magic number or the fixed header
        is missing, and "Truncated GZIP file" when an announced field runs
        past the end of the data. Returns ``None`` in every case.
        """
        data = self.data
        if len(data) < 10 or data[0] != 0x1F or data[1] != 0x8B:
            print("Invalid GZIP file")
            return

        # Fixed header after the 2 magic bytes: CM, FLG, MTIME (u32 LE), XFL, OS.
        cm, flg, mtime, xfl, os_id = struct.unpack_from('<BBIBB', data, 2)
        pos = 10
        print("Magic: 0x1F 0x8B")
        print(f"CM: {cm} (deflate if 8)")
        print(f"FLG: {flg:02x} (FTEXT:{flg&1}, FHCRC:{(flg>>1)&1}, FEXTRA:{(flg>>2)&1}, FNAME:{(flg>>3)&1}, FCOMMENT:{(flg>>4)&1})")
        print(f"MTIME: {mtime} (Unix ts)")
        print(f"XFL: {xfl}")
        print(f"OS: {os_id}")

        if flg & 4:  # FEXTRA
            if pos + 2 > len(data):
                print("Truncated GZIP file")
                return
            xlen, = struct.unpack_from('<H', data, pos)
            pos += 2
            print(f"XLEN: {xlen}")
            if pos + xlen > len(data):
                print("Truncated GZIP file")
                return
            extra = data[pos:pos + xlen]
            print(f"Extra Fields: {' '.join(f'{b:02x}' for b in extra)}")
            pos += xlen

        if flg & 8:  # FNAME
            parsed = self._read_cstring(pos)
            if parsed is None:
                print("Truncated GZIP file")
                return
            fname, pos = parsed
            print(f"Filename: {fname}")

        if flg & 16:  # FCOMMENT
            parsed = self._read_cstring(pos)
            if parsed is None:
                print("Truncated GZIP file")
                return
            comment, pos = parsed
            print(f"Comment: {comment}")

        # The header CRC16 is the LAST optional field, right before the
        # compressed data, so it must be read after extra/name/comment.
        if flg & 2:  # FHCRC
            if pos + 2 > len(data):
                print("Truncated GZIP file")
                return
            hcrc, = struct.unpack_from('<H', data, pos)
            pos += 2
            print(f"Header CRC16: {hcrc:04x}")

        # Trailer: the last 8 bytes of the file. For a multi-member file this
        # is the trailer of the final member.
        if len(data) - pos < 8:
            print("Truncated GZIP file")
            return
        crc32, isize = struct.unpack_from('<II', data, len(data) - 8)
        print(f"CRC32: {crc32:08x}")
        print(f"ISIZE: {isize}")
# Usage: parser = GZParser('example.gz')
5. Java Class
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
 * Prints the header and trailer fields of a GZIP (RFC 1952) file.
 *
 * <p>The whole file is read into memory. Fields are printed in the order they
 * occur in the file: fixed header, extra field, filename, comment, header
 * CRC16, then the 8-byte trailer. All multi-byte integers are little-endian.
 */
public class GZParser {
    private byte[] data;

    /**
     * Reads the file and immediately prints its fields to standard output.
     *
     * @param filename path of the GZIP file to inspect
     * @throws IOException if the file cannot be opened or read
     */
    public GZParser(String filename) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // try-with-resources closes the stream even when read() throws.
        try (FileInputStream fis = new FileInputStream(filename)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = fis.read(buffer)) != -1) {
                baos.write(buffer, 0, len);
            }
        }
        data = baos.toByteArray();
        parse();
    }

    /** Returns the index of the first NUL byte at or after {@code from}, or -1 if none. */
    private int indexOfNul(int from) {
        for (int i = from; i < data.length; i++) {
            if (data[i] == 0) {
                return i;
            }
        }
        return -1;
    }

    /** Decodes {@code data[start, end)} as ISO-8859-1 (one byte per character). */
    private String latin1(int start, int end) {
        StringBuilder sb = new StringBuilder(end - start);
        for (int i = start; i < end; i++) {
            // Mask first: a plain (char) cast would sign-extend bytes >= 0x80.
            sb.append((char) (data[i] & 0xFF));
        }
        return sb.toString();
    }

    /**
     * Prints every header/trailer field. Prints "Invalid GZIP file" when the
     * magic number or fixed header is missing and "Truncated GZIP file" when
     * an announced field runs past the end of the data.
     */
    private void parse() {
        if (data.length < 10 || data[0] != 0x1F || data[1] != (byte)0x8B) {
            System.out.println("Invalid GZIP file");
            return;
        }
        ByteBuffer bb = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
        int cm = data[2] & 0xFF;
        int flg = data[3] & 0xFF;
        long mtime = bb.getInt(4) & 0xFFFFFFFFL; // unsigned 32-bit
        int xfl = data[8] & 0xFF;
        int os = data[9] & 0xFF;
        int pos = 10;
        System.out.println("Magic: 0x1F 0x8B");
        System.out.printf("CM: %d (deflate if 8)%n", cm);
        System.out.printf("FLG: %02x (FTEXT:%d, FHCRC:%d, FEXTRA:%d, FNAME:%d, FCOMMENT:%d)%n",
                flg, flg & 1, (flg >> 1) & 1, (flg >> 2) & 1, (flg >> 3) & 1, (flg >> 4) & 1);
        System.out.printf("MTIME: %d (Unix ts)%n", mtime);
        System.out.printf("XFL: %d%n", xfl);
        System.out.printf("OS: %d%n", os);

        if ((flg & 4) != 0) { // FEXTRA
            if (data.length - pos < 2) {
                System.out.println("Truncated GZIP file");
                return;
            }
            int xlen = bb.getShort(pos) & 0xFFFF;
            pos += 2;
            System.out.printf("XLEN: %d%n", xlen);
            if (data.length - pos < xlen) {
                System.out.println("Truncated GZIP file");
                return;
            }
            StringBuilder extra = new StringBuilder();
            for (int i = 0; i < xlen; i++) {
                extra.append(String.format("%02x ", data[pos + i] & 0xFF));
            }
            System.out.println("Extra Fields: " + extra.toString().trim());
            pos += xlen;
        }

        if ((flg & 8) != 0) { // FNAME
            int end = indexOfNul(pos);
            if (end < 0) {
                System.out.println("Truncated GZIP file");
                return;
            }
            System.out.println("Filename: " + latin1(pos, end));
            pos = end + 1;
        }

        if ((flg & 16) != 0) { // FCOMMENT
            int end = indexOfNul(pos);
            if (end < 0) {
                System.out.println("Truncated GZIP file");
                return;
            }
            System.out.println("Comment: " + latin1(pos, end));
            pos = end + 1;
        }

        // The header CRC16 is the last optional field, right before the
        // compressed data, so it is read after extra/name/comment.
        if ((flg & 2) != 0) { // FHCRC
            if (data.length - pos < 2) {
                System.out.println("Truncated GZIP file");
                return;
            }
            int hcrc = bb.getShort(pos) & 0xFFFF;
            pos += 2;
            System.out.printf("Header CRC16: %04x%n", hcrc);
        }

        // Trailer: the last 8 bytes of the file (of the final member when
        // several members are concatenated).
        if (data.length - pos < 8) {
            System.out.println("Truncated GZIP file");
            return;
        }
        int trailer = data.length - 8;
        int crc32 = bb.getInt(trailer);
        long isize = bb.getInt(trailer + 4) & 0xFFFFFFFFL; // unsigned 32-bit
        System.out.printf("CRC32: %08x%n", crc32);
        System.out.printf("ISIZE: %d%n", isize);
    }
    // Usage: new GZParser("example.gz");
}
6. JavaScript Class (Node.js)
const fs = require('fs');
/**
 * Prints the header and trailer fields of a GZIP (RFC 1952) file.
 *
 * The whole file is read into a Buffer. Fields are printed in the order they
 * occur in the file: fixed header, extra field, filename, comment, header
 * CRC16, then the 8-byte trailer. All multi-byte integers are little-endian.
 */
class GZParser {
  /**
   * Reads the file synchronously and immediately prints its fields.
   * @param {string} filename Path of the GZIP file to inspect.
   */
  constructor(filename) {
    this.data = fs.readFileSync(filename);
    this.parse();
  }

  /**
   * Decodes the NUL-terminated Latin-1 string starting at `start`.
   * @param {number} start Offset of the first character.
   * @returns {{text: string, next: number}|null} The text and the offset just
   *   past the terminator, or null when no terminator exists after `start`.
   */
  _readCString(start) {
    const end = this.data.indexOf(0, start);
    if (end === -1) return null;
    return { text: this.data.toString('latin1', start, end), next: end + 1 };
  }

  /**
   * Prints every header/trailer field with console.log. Prints
   * 'Invalid GZIP file' when the magic number or fixed header is missing and
   * 'Truncated GZIP file' when an announced field runs past the end of the data.
   */
  parse() {
    const data = this.data;
    if (data.length < 10 || data[0] !== 0x1F || data[1] !== 0x8B) {
      console.log('Invalid GZIP file');
      return;
    }
    const cm = data[2];
    const flg = data[3];
    // readUInt32LE returns an unsigned value; `<< 24` would go negative.
    const mtime = data.readUInt32LE(4);
    const xfl = data[8];
    const os = data[9];
    let pos = 10;
    console.log(`Magic: 0x1F 0x8B`);
    console.log(`CM: ${cm} (deflate if 8)`);
    console.log(`FLG: ${flg.toString(16).padStart(2, '0')} (FTEXT:${flg&1}, FHCRC:${(flg>>1)&1}, FEXTRA:${(flg>>2)&1}, FNAME:${(flg>>3)&1}, FCOMMENT:${(flg>>4)&1})`);
    console.log(`MTIME: ${mtime} (Unix ts)`);
    console.log(`XFL: ${xfl}`);
    console.log(`OS: ${os}`);

    if (flg & 4) { // FEXTRA
      if (data.length - pos < 2) {
        console.log('Truncated GZIP file');
        return;
      }
      const xlen = data.readUInt16LE(pos);
      pos += 2;
      console.log(`XLEN: ${xlen}`);
      if (data.length - pos < xlen) {
        console.log('Truncated GZIP file');
        return;
      }
      // Array.from, not Buffer#map: mapping a Buffer yields another Buffer,
      // which would coerce the hex strings back to numbers.
      const extra = Array.from(
        data.subarray(pos, pos + xlen),
        (b) => b.toString(16).padStart(2, '0')
      ).join(' ');
      console.log(`Extra Fields: ${extra}`);
      pos += xlen;
    }

    if (flg & 8) { // FNAME
      const name = this._readCString(pos);
      if (name === null) {
        console.log('Truncated GZIP file');
        return;
      }
      pos = name.next;
      console.log(`Filename: ${name.text}`);
    }

    if (flg & 16) { // FCOMMENT
      const comment = this._readCString(pos);
      if (comment === null) {
        console.log('Truncated GZIP file');
        return;
      }
      pos = comment.next;
      console.log(`Comment: ${comment.text}`);
    }

    // The header CRC16 is the last optional field, right before the
    // compressed data, so it is read after extra/name/comment.
    if (flg & 2) { // FHCRC
      if (data.length - pos < 2) {
        console.log('Truncated GZIP file');
        return;
      }
      const hcrc = data.readUInt16LE(pos);
      pos += 2;
      console.log(`Header CRC16: ${hcrc.toString(16).padStart(4, '0')}`);
    }

    // Trailer: the last 8 bytes of the file (of the final member when
    // several members are concatenated).
    if (data.length - pos < 8) {
      console.log('Truncated GZIP file');
      return;
    }
    const trailer = data.length - 8;
    const crc32 = data.readUInt32LE(trailer);
    const isize = data.readUInt32LE(trailer + 4);
    console.log(`CRC32: ${crc32.toString(16).padStart(8, '0')}`);
    console.log(`ISIZE: ${isize}`);
  }
}
// Usage: new GZParser('example.gz');
7. C Code
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
/* Wrapper around a raw byte array. Not referenced by any other code visible in
 * this file; parse_gz() works on a plain malloc'd uint8_t buffer instead. */
typedef struct {
uint8_t data[1]; // One-element array; NOTE: not a C99 flexible array member (that would be `data[]`)
} GZData;
/* Decode an unsigned little-endian 16-bit value from p[0..1]. */
static uint16_t gz_le16(const uint8_t* p) {
    return (uint16_t)((unsigned)p[0] | ((unsigned)p[1] << 8));
}

/* Decode an unsigned little-endian 32-bit value from p[0..3].
 * Each byte is widened to uint32_t first so the shift by 24 never reaches the
 * sign bit of a promoted int. */
static uint32_t gz_le32(const uint8_t* p) {
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* Print all header/trailer fields of a GZIP image whose magic number and
 * 10-byte fixed header have already been validated by the caller.
 * Returns 0 on success, -1 when a field runs past the end of the data. */
static int gz_dump_fields(const uint8_t* data, size_t len) {
    unsigned cm = data[2];
    unsigned flg = data[3];
    uint32_t mtime = gz_le32(data + 4);
    unsigned xfl = data[8];
    unsigned os = data[9];
    size_t pos = 10;

    printf("Magic: 0x1F 0x8B\n");
    printf("CM: %u (deflate if 8)\n", cm);
    printf("FLG: 0x%02x (FTEXT:%u, FHCRC:%u, FEXTRA:%u, FNAME:%u, FCOMMENT:%u)\n",
           flg, flg & 1u, (flg >> 1) & 1u, (flg >> 2) & 1u, (flg >> 3) & 1u, (flg >> 4) & 1u);
    printf("MTIME: %lu (Unix ts)\n", (unsigned long)mtime);
    printf("XFL: %u\n", xfl);
    printf("OS: %u\n", os);

    if (flg & 4u) { // FEXTRA
        if (len - pos < 2) return -1;
        size_t xlen = gz_le16(data + pos);
        pos += 2;
        printf("XLEN: %lu\n", (unsigned long)xlen);
        if (len - pos < xlen) return -1;
        printf("Extra Fields: ");
        for (size_t i = 0; i < xlen; i++) {
            printf("%02x ", data[pos + i]);
        }
        printf("\n");
        pos += xlen;
    }

    if (flg & 8u) { // FNAME
        printf("Filename: ");
        while (pos < len && data[pos] != 0) { putchar(data[pos++]); }
        putchar('\n');
        if (pos >= len) return -1; /* no NUL terminator found */
        pos++;
    }

    if (flg & 16u) { // FCOMMENT
        printf("Comment: ");
        while (pos < len && data[pos] != 0) { putchar(data[pos++]); }
        putchar('\n');
        if (pos >= len) return -1; /* no NUL terminator found */
        pos++;
    }

    /* The header CRC16 is the last optional field, right before the
     * compressed data, so it is read after extra/name/comment. */
    if (flg & 2u) { // FHCRC
        if (len - pos < 2) return -1;
        unsigned hcrc = gz_le16(data + pos);
        pos += 2;
        printf("Header CRC16: 0x%04x\n", hcrc);
    }

    /* Trailer: the last 8 bytes of the file (of the final member when
     * several members are concatenated). */
    if (len - pos < 8) return -1;
    const uint8_t* trailer = data + (len - 8);
    printf("CRC32: 0x%08lx\n", (unsigned long)gz_le32(trailer));
    printf("ISIZE: %lu\n", (unsigned long)gz_le32(trailer + 4));
    return 0;
}

/* Load `filename` fully into memory and print its GZIP (RFC 1952) header and
 * trailer fields to stdout. All multi-byte fields are decoded little-endian.
 * Problems are reported on stdout as one of: "Cannot open file",
 * "Cannot read file", "Out of memory", "Invalid GZIP file",
 * "Truncated GZIP file". */
void parse_gz(const char* filename) {
    FILE* f = fopen(filename, "rb");
    if (!f) {
        printf("Cannot open file\n");
        return;
    }

    long file_len = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        file_len = ftell(f);
    }
    if (file_len < 0 || fseek(f, 0, SEEK_SET) != 0) {
        printf("Cannot read file\n");
        fclose(f);
        return;
    }

    size_t len = (size_t)file_len;
    /* malloc(0) may return NULL, so always request at least one byte. */
    uint8_t* data = malloc(len ? len : 1);
    if (!data) {
        printf("Out of memory\n");
        fclose(f);
        return;
    }
    size_t got = fread(data, 1, len, f);
    fclose(f);
    if (got != len) {
        printf("Cannot read file\n");
        free(data);
        return;
    }

    if (len < 10 || data[0] != 0x1F || data[1] != 0x8B) {
        printf("Invalid GZIP file\n");
        free(data);
        return;
    }

    if (gz_dump_fields(data, len) != 0) {
        printf("Truncated GZIP file\n");
    }
    free(data);
}
// Usage: parse_gz("example.gz");
/* Entry point: print the GZIP fields of the file named by the first
 * command-line argument. With no argument, print a usage line to stderr and
 * return a non-zero status instead of silently doing nothing. */
int main(int argc, char** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <file.gz>\n", argc > 0 ? argv[0] : "parse_gz");
        return 1;
    }
    parse_gz(argv[1]);
    return 0;
}