Task 277: .GZ File Format

Task 277: .GZ File Format

1. Properties of the .GZ File Format

The .GZ file format (GZIP, version 4.3) is a lossless compression format defined in RFC 1952. It consists of one or more concatenated "members," each with a fixed 10-byte header, optional fields, compressed data (using DEFLATE), and an 8-byte trailer. All multi-byte integer fields are stored little-endian (least-significant byte first). Below is a complete list of all intrinsic properties (fields and their attributes) in the format, focusing on structural elements independent of the underlying file system:

  • Magic Number (ID1 and ID2): 2 bytes (fixed values: 0x1F followed by 0x8B), identifies the file as GZIP.
  • Compression Method (CM): 1 byte (typically 8 for DEFLATE; values 0-7 reserved).
  • Flags (FLG): 1 byte bitfield:
  • Bit 0 (FTEXT): 1 if the original data is likely ASCII text.
  • Bit 1 (FHCRC): 1 if a 2-byte header CRC16 is present at the end of the header (after any extra field, filename, and comment).
  • Bit 2 (FEXTRA): 1 if an extra field is present.
  • Bit 3 (FNAME): 1 if an original filename string follows.
  • Bit 4 (FCOMMENT): 1 if a comment string follows.
  • Bits 5-7: Reserved (must be 0).
  • Modification Time (MTIME): 4 bytes (Unix timestamp in seconds since 1970-01-01 00:00:00 UTC; 0 if unknown).
  • Extra Flags (XFL): 1 byte (for DEFLATE: 2 = maximum compression, 4 = fastest compression; other values implementation-specific).
  • Operating System (OS): 1 byte (identifies the OS where compression occurred: 0=FAT/MS-DOS, 1=Amiga, 2=VMS, 3=Unix, 4=VM/CMS, 5=Atari TOS, 6=HPFS, 7=Macintosh, 8=Z-System, 9=CP/M, 10=TOPS-20, 11=NTFS, 12=QDOS, 13=Acorn RISCOS, 255=unknown).
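  • Header CRC16: 2 bytes (the two least-significant bytes of the CRC-32 of all header bytes that precede it, including any extra field, filename, and comment; it is the last field of the header; present only if FLG.FHCRC is set).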
  • Extra Field Length (XLEN): 2 bytes (length of the extra field in bytes; present only if FLG.FEXTRA is set).
  • Extra Fields: Variable length (subfields with 1-byte SI1, 1-byte SI2 identifiers, 2-byte LEN, and LEN bytes of data; present only if FLG.FEXTRA is set; e.g., SI1=0x41, SI2=0x70 for Apollo file type).
  • Original Filename: Variable-length NUL-terminated string (in ISO-8859-1 encoding, lowercase if from case-insensitive FS; present only if FLG.FNAME is set; empty if data from non-file source).
  • File Comment: Variable-length NUL-terminated string (in ISO-8859-1 encoding, for human-readable notes; present only if FLG.FCOMMENT is set).
  • CRC32: 4 bytes (32-bit CRC of the uncompressed original data).
  • ISIZE: 4 bytes (length of the uncompressed original data, modulo 2^32).

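The optional fields follow the fixed header in this on-disk order: extra field (XLEN plus subfields), original filename, file comment, header CRC16. They are followed by the variable-length compressed data (not listed as a printable property here, as it is opaque binary) and finally by the CRC32 and ISIZE trailer.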

3. Ghost Blog Embedded HTML JavaScript

Embed the following self-contained HTML snippet into a Ghost blog post (use the HTML card in the editor). It creates a drag-and-drop zone for .GZ files. Upon drop, it parses the header using the File API and dumps all properties to a <pre> block below.

Drag and drop a .GZ file here

4. Python Class

import struct

class GZParser:
    """Print the header and trailer fields of a GZIP (RFC 1952) file.

    The whole file is read into ``self.data`` and the fields of the first
    member header are printed, followed by the CRC32 and ISIZE stored in the
    last eight bytes of the file.  All multi-byte integers are decoded as
    little-endian, as RFC 1952 requires.
    """

    def __init__(self, filename):
        """Read *filename* into memory and immediately print its fields."""
        with open(filename, 'rb') as f:
            self.data = f.read()
        self.parse()

    def _read_cstring(self, pos):
        """Decode the NUL-terminated Latin-1 string starting at *pos*.

        Returns ``(text, next_pos)`` where ``next_pos`` is the index just
        past the terminator, or ``(None, pos)`` when no NUL byte is found.
        """
        end = self.data.find(b'\x00', pos)
        if end == -1:
            return None, pos
        return self.data[pos:end].decode('latin-1'), end + 1

    def parse(self):
        """Print every header/trailer field found in ``self.data``.

        Prints ``Invalid GZIP file`` when the magic number or the 10-byte
        fixed header is missing, and ``Truncated GZIP file`` when an optional
        field or the trailer runs past the end of the data.  Returns ``None``.
        """
        data = self.data
        if len(data) < 10 or data[0] != 0x1F or data[1] != 0x8B:
            print("Invalid GZIP file")
            return
        # Fixed header after the magic: CM, FLG, MTIME (LE), XFL, OS.
        cm, flg, mtime, xfl, os_id = struct.unpack('<BBIBB', data[2:10])
        pos = 10
        print("Magic: 0x1F 0x8B")
        print(f"CM: {cm} (deflate if 8)")
        print(f"FLG: {flg:02x} (FTEXT:{flg&1}, FHCRC:{(flg>>1)&1}, FEXTRA:{(flg>>2)&1}, FNAME:{(flg>>3)&1}, FCOMMENT:{(flg>>4)&1})")
        print(f"MTIME: {mtime} (Unix ts)")
        print(f"XFL: {xfl}")
        print(f"OS: {os_id}")

        # Optional fields appear in this order on disk:
        # FEXTRA, FNAME, FCOMMENT, FHCRC.
        if flg & 4:  # FEXTRA
            if pos + 2 > len(data):
                print("Truncated GZIP file")
                return
            xlen, = struct.unpack('<H', data[pos:pos + 2])
            pos += 2
            if pos + xlen > len(data):
                print("Truncated GZIP file")
                return
            extra = data[pos:pos + xlen]
            print(f"XLEN: {xlen}")
            print(f"Extra Fields: {' '.join(f'{b:02x}' for b in extra)}")
            pos += xlen
        for mask, label in ((8, "Filename"), (16, "Comment")):  # FNAME, FCOMMENT
            if flg & mask:
                text, pos = self._read_cstring(pos)
                if text is None:
                    print("Truncated GZIP file")
                    return
                print(f"{label}: {text}")
        if flg & 2:  # FHCRC
            if pos + 2 > len(data):
                print("Truncated GZIP file")
                return
            hcrc, = struct.unpack('<H', data[pos:pos + 2])
            pos += 2
            print(f"Header CRC16: {hcrc:04x}")

        # Trailer: the last 8 bytes of the file.  For a multi-member file
        # these belong to the final member.  They must not overlap the header.
        if len(data) - pos < 8:
            print("Truncated GZIP file")
            return
        crc32, isize = struct.unpack('<II', data[-8:])
        print(f"CRC32: {crc32:08x}")
        print(f"ISIZE: {isize}")

# Usage: parser = GZParser('example.gz')

5. Java Class

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * Prints the header and trailer fields of a GZIP (RFC 1952) file.
 *
 * <p>The file is read fully into memory; the fields of the first member
 * header are printed, followed by the CRC32 and ISIZE stored in the last
 * eight bytes of the file. Multi-byte integers are decoded little-endian,
 * as RFC 1952 requires.
 */
public class GZParser {
    private byte[] data;

    /**
     * Reads {@code filename} into memory and prints its GZIP fields.
     *
     * @param filename path of the file to inspect
     * @throws IOException if the file cannot be opened or read
     */
    public GZParser(String filename) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // try-with-resources closes the stream even when a read fails.
        try (FileInputStream fis = new FileInputStream(filename)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = fis.read(buffer)) != -1) {
                baos.write(buffer, 0, len);
            }
        }
        data = baos.toByteArray();
        parse();
    }

    /** Returns the index of the first NUL byte at or after {@code from}, or -1. */
    private int indexOfNul(int from) {
        for (int i = from; i < data.length; i++) {
            if (data[i] == 0) {
                return i;
            }
        }
        return -1;
    }

    /** Decodes {@code data[from, to)} as ISO-8859-1 (one byte per char, unsigned). */
    private String latin1(int from, int to) {
        StringBuilder sb = new StringBuilder(to - from);
        for (int i = from; i < to; i++) {
            // Mask to avoid sign extension of bytes >= 0x80.
            sb.append((char) (data[i] & 0xFF));
        }
        return sb.toString();
    }

    /**
     * Prints every field; reports "Invalid GZIP file" for a bad magic/short
     * header and "Truncated GZIP file" when a field runs past the data.
     */
    private void parse() {
        if (data.length < 10 || data[0] != 0x1F || data[1] != (byte) 0x8B) {
            System.out.println("Invalid GZIP file");
            return;
        }
        // One little-endian view over the whole file; absolute gets are used below.
        ByteBuffer bb = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
        int cm = data[2] & 0xFF;
        int flg = data[3] & 0xFF;
        long mtime = bb.getInt(4) & 0xFFFFFFFFL; // unsigned 32-bit
        int xfl = data[8] & 0xFF;
        int os = data[9] & 0xFF;
        int pos = 10;
        System.out.println("Magic: 0x1F 0x8B");
        System.out.printf("CM: %d (deflate if 8)%n", cm);
        System.out.printf("FLG: %02x (FTEXT:%d, FHCRC:%d, FEXTRA:%d, FNAME:%d, FCOMMENT:%d)%n",
                flg, flg & 1, (flg >> 1) & 1, (flg >> 2) & 1, (flg >> 3) & 1, (flg >> 4) & 1);
        System.out.printf("MTIME: %d (Unix ts)%n", mtime);
        System.out.printf("XFL: %d%n", xfl);
        System.out.printf("OS: %d%n", os);

        // Optional fields appear on disk as: FEXTRA, FNAME, FCOMMENT, FHCRC.
        if ((flg & 4) != 0) { // FEXTRA
            if (pos + 2 > data.length) {
                System.out.println("Truncated GZIP file");
                return;
            }
            int xlen = bb.getShort(pos) & 0xFFFF;
            pos += 2;
            if (pos + xlen > data.length) {
                System.out.println("Truncated GZIP file");
                return;
            }
            System.out.printf("XLEN: %d%n", xlen);
            StringBuilder extra = new StringBuilder();
            for (int i = 0; i < xlen; i++) {
                extra.append(String.format("%02x ", data[pos + i] & 0xFF));
            }
            System.out.println("Extra Fields: " + extra.toString().trim());
            pos += xlen;
        }
        if ((flg & 8) != 0) { // FNAME
            int end = indexOfNul(pos);
            if (end < 0) {
                System.out.println("Truncated GZIP file");
                return;
            }
            System.out.println("Filename: " + latin1(pos, end));
            pos = end + 1;
        }
        if ((flg & 16) != 0) { // FCOMMENT
            int end = indexOfNul(pos);
            if (end < 0) {
                System.out.println("Truncated GZIP file");
                return;
            }
            System.out.println("Comment: " + latin1(pos, end));
            pos = end + 1;
        }
        if ((flg & 2) != 0) { // FHCRC
            if (pos + 2 > data.length) {
                System.out.println("Truncated GZIP file");
                return;
            }
            int hcrc = bb.getShort(pos) & 0xFFFF;
            pos += 2;
            System.out.printf("Header CRC16: %04x%n", hcrc);
        }

        // Trailer: last 8 bytes of the file (the final member's trailer for a
        // multi-member file); it must not overlap the header.
        if (data.length - pos < 8) {
            System.out.println("Truncated GZIP file");
            return;
        }
        int trailer = data.length - 8;
        long crc32 = bb.getInt(trailer) & 0xFFFFFFFFL;
        long isize = bb.getInt(trailer + 4) & 0xFFFFFFFFL;
        System.out.printf("CRC32: %08x%n", crc32);
        System.out.printf("ISIZE: %d%n", isize);
    }

    // Usage: new GZParser("example.gz");
}

6. JavaScript Class (Node.js)

const fs = require('fs');

/**
 * Prints the header and trailer fields of a GZIP (RFC 1952) file.
 *
 * The file is read synchronously into a Buffer; the first member's header
 * fields are logged, followed by the CRC32 and ISIZE stored in the last
 * eight bytes of the file. Multi-byte integers are little-endian.
 */
class GZParser {
  /**
   * @param {string} filename path of the file to inspect
   */
  constructor(filename) {
    this.data = fs.readFileSync(filename);
    this.parse();
  }

  /**
   * Reads the NUL-terminated Latin-1 string starting at `pos`.
   * @returns {{text: string, next: number} | null} null when no NUL is found
   */
  _readCString(pos) {
    const end = this.data.indexOf(0, pos);
    if (end === -1) return null;
    return { text: this.data.toString('latin1', pos, end), next: end + 1 };
  }

  /**
   * Logs every field. Logs 'Invalid GZIP file' for a bad magic/short header
   * and 'Truncated GZIP file' when a field runs past the end of the data.
   */
  parse() {
    const data = this.data;
    if (data.length < 10 || data[0] !== 0x1F || data[1] !== 0x8B) {
      console.log('Invalid GZIP file');
      return;
    }
    const cm = data[2];
    const flg = data[3];
    // readUInt32LE avoids the signed result that `<< 24` produces.
    const mtime = data.readUInt32LE(4);
    const xfl = data[8];
    const os = data[9];
    let pos = 10;
    console.log(`Magic: 0x1F 0x8B`);
    console.log(`CM: ${cm} (deflate if 8)`);
    console.log(`FLG: ${flg.toString(16).padStart(2, '0')} (FTEXT:${flg&1}, FHCRC:${(flg>>1)&1}, FEXTRA:${(flg>>2)&1}, FNAME:${(flg>>3)&1}, FCOMMENT:${(flg>>4)&1})`);
    console.log(`MTIME: ${mtime} (Unix ts)`);
    console.log(`XFL: ${xfl}`);
    console.log(`OS: ${os}`);

    // Optional fields appear on disk as: FEXTRA, FNAME, FCOMMENT, FHCRC.
    if (flg & 4) { // FEXTRA
      if (pos + 2 > data.length) { console.log('Truncated GZIP file'); return; }
      const xlen = data.readUInt16LE(pos); pos += 2;
      if (pos + xlen > data.length) { console.log('Truncated GZIP file'); return; }
      console.log(`XLEN: ${xlen}`);
      // Array.from, not Buffer#map: a typed-array map would coerce the hex
      // strings back into bytes.
      const extra = Array.from(data.subarray(pos, pos + xlen), b => b.toString(16).padStart(2, '0')).join(' ');
      console.log(`Extra Fields: ${extra}`);
      pos += xlen;
    }
    if (flg & 8) { // FNAME
      const fname = this._readCString(pos);
      if (fname === null) { console.log('Truncated GZIP file'); return; }
      pos = fname.next;
      console.log(`Filename: ${fname.text}`);
    }
    if (flg & 16) { // FCOMMENT
      const comment = this._readCString(pos);
      if (comment === null) { console.log('Truncated GZIP file'); return; }
      pos = comment.next;
      console.log(`Comment: ${comment.text}`);
    }
    if (flg & 2) { // FHCRC
      if (pos + 2 > data.length) { console.log('Truncated GZIP file'); return; }
      const hcrc = data.readUInt16LE(pos); pos += 2;
      console.log(`Header CRC16: ${hcrc.toString(16).padStart(4, '0')}`);
    }

    // Trailer: last 8 bytes of the file (the final member's trailer for a
    // multi-member file); it must not overlap the header.
    if (data.length - pos < 8) { console.log('Truncated GZIP file'); return; }
    const trailer = data.length - 8;
    const crc32 = data.readUInt32LE(trailer);
    const isize = data.readUInt32LE(trailer + 4);
    console.log(`CRC32: ${crc32.toString(16).padStart(8, '0')}`);
    console.log(`ISIZE: ${isize}`);
  }
}

// Usage: new GZParser('example.gz');

7. C Code

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

// Wrapper around a byte buffer intended to hold the full file.
// NOTE(review): GZData is not referenced by any code visible here — confirm it is still needed.
typedef struct {
    uint8_t data[1]; // One-element array, not a true C99 flexible array member (that would be `data[]`)
} GZData;

/* Decode a little-endian unsigned 16-bit value from p[0..1]. */
static uint16_t read_u16le(const uint8_t* p) {
    return (uint16_t)((unsigned)p[0] | ((unsigned)p[1] << 8));
}

/* Decode a little-endian unsigned 32-bit value from p[0..3]. */
static uint32_t read_u32le(const uint8_t* p) {
    /* Cast before shifting: a promoted int shifted into the sign bit is undefined. */
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/*
 * Print "<label>: <string>\n" for the NUL-terminated string at data[pos].
 * Returns the index just past the terminator, or -1 (printing nothing) when
 * no NUL byte exists before len.
 */
static long print_cstring(const char* label, const uint8_t* data, long pos, long len) {
    long end = pos;
    while (end < len && data[end] != 0) {
        end++;
    }
    if (end >= len) {
        return -1;
    }
    printf("%s: ", label);
    for (long i = pos; i < end; i++) {
        putchar(data[i]);
    }
    putchar('\n');
    return end + 1;
}

/*
 * Read `filename` fully into memory and print the GZIP (RFC 1952) header
 * fields of its first member plus the CRC32/ISIZE stored in the last eight
 * bytes of the file. Multi-byte integers are decoded little-endian.
 *
 * Prints "Cannot open file" / "Cannot read file" on I/O failure,
 * "Invalid GZIP file" for a bad magic or short header, and
 * "Truncated GZIP file" when a field runs past the end of the data.
 */
void parse_gz(const char* filename) {
    FILE* f = fopen(filename, "rb");
    if (!f) {
        printf("Cannot open file\n");
        return;
    }
    long len = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        len = ftell(f);
    }
    if (len < 0 || fseek(f, 0, SEEK_SET) != 0) {
        printf("Cannot read file\n");
        fclose(f);
        return;
    }
    if (len < 10) {
        /* Too short for the fixed header; no need to allocate. */
        printf("Invalid GZIP file\n");
        fclose(f);
        return;
    }
    uint8_t* data = malloc((size_t)len);
    if (!data) {
        printf("Cannot read file\n");
        fclose(f);
        return;
    }
    size_t got = fread(data, 1, (size_t)len, f);
    fclose(f);
    if (got != (size_t)len) {
        printf("Cannot read file\n");
        free(data);
        return;
    }

    if (data[0] != 0x1F || data[1] != 0x8B) {
        printf("Invalid GZIP file\n");
        free(data);
        return;
    }
    uint8_t cm = data[2];
    uint8_t flg = data[3];
    uint32_t mtime = read_u32le(data + 4);
    uint8_t xfl = data[8];
    uint8_t os = data[9];
    long pos = 10;
    printf("Magic: 0x1F 0x8B\n");
    printf("CM: %u (deflate if 8)\n", (unsigned)cm);
    printf("FLG: 0x%02x (FTEXT:%u, FHCRC:%u, FEXTRA:%u, FNAME:%u, FCOMMENT:%u)\n",
           (unsigned)flg, (unsigned)(flg & 1), (unsigned)((flg >> 1) & 1),
           (unsigned)((flg >> 2) & 1), (unsigned)((flg >> 3) & 1),
           (unsigned)((flg >> 4) & 1));
    printf("MTIME: %lu (Unix ts)\n", (unsigned long)mtime);
    printf("XFL: %u\n", (unsigned)xfl);
    printf("OS: %u\n", (unsigned)os);

    /* Optional fields appear on disk as: FEXTRA, FNAME, FCOMMENT, FHCRC. */
    if (flg & 4) { // FEXTRA
        if (len - pos < 2) {
            printf("Truncated GZIP file\n");
            free(data);
            return;
        }
        uint16_t xlen = read_u16le(data + pos);
        pos += 2;
        if (len - pos < (long)xlen) {
            printf("Truncated GZIP file\n");
            free(data);
            return;
        }
        printf("XLEN: %u\n", (unsigned)xlen);
        printf("Extra Fields: ");
        for (long i = 0; i < (long)xlen; i++) {
            printf("%02x ", (unsigned)data[pos + i]);
        }
        printf("\n");
        pos += xlen;
    }
    if (flg & 8) { // FNAME
        pos = print_cstring("Filename", data, pos, len);
        if (pos < 0) {
            printf("Truncated GZIP file\n");
            free(data);
            return;
        }
    }
    if (flg & 16) { // FCOMMENT
        pos = print_cstring("Comment", data, pos, len);
        if (pos < 0) {
            printf("Truncated GZIP file\n");
            free(data);
            return;
        }
    }
    if (flg & 2) { // FHCRC
        if (len - pos < 2) {
            printf("Truncated GZIP file\n");
            free(data);
            return;
        }
        uint16_t hcrc = read_u16le(data + pos);
        pos += 2;
        printf("Header CRC16: 0x%04x\n", (unsigned)hcrc);
    }

    /* Trailer: last 8 bytes of the file (the final member's trailer for a
     * multi-member file); it must not overlap the header. */
    if (len - pos < 8) {
        printf("Truncated GZIP file\n");
        free(data);
        return;
    }
    uint32_t crc32 = read_u32le(data + len - 8);
    uint32_t isize = read_u32le(data + len - 4);
    printf("CRC32: 0x%08lx\n", (unsigned long)crc32);
    printf("ISIZE: %lu\n", (unsigned long)isize);
    free(data);
}

// Usage: parse_gz("example.gz");
int main(int argc, char** argv) {
    /* Without a path argument there is nothing to parse; exit successfully. */
    if (argc <= 1) {
        return 0;
    }
    /* Only the first argument is inspected; any others are ignored. */
    parse_gz(argv[1]);
    return 0;
}