Task 571: .PROTO File Format

Task 571: .PROTO File Format

File Format Specifications for .PROTO

In this document, the .PROTO file format refers to the binary wire (serialization) format used by Protocol Buffers (Protobuf), a language-neutral and platform-neutral mechanism for serializing structured data developed by Google; the textual .proto schema files that describe message layouts are a separate artifact. The wire format is designed for efficiency, being smaller and faster to process than alternatives like XML or JSON. It is only partially self-describing: each field carries its field number and wire type, which is enough to dump the raw structure without a schema, but field names and exact value types can only be recovered with the corresponding .proto schema definition. The specification is detailed in the Protocol Buffers documentation, particularly the encoding guide.

List of all the properties intrinsic to this file format:

  • Binary serialization format consisting of a sequence of tag-value pairs.
  • No magic number, header, or footer; the file is purely the serialized data.
  • Tags are base-128 varints combining field number (1 to 2^29-1) and wire type (0-5).
  • Wire types define the value encoding: 0 (Varint for integers, bool, enum), 1 (64-bit fixed for double, fixed64, sfixed64), 2 (Length-delimited for string, bytes, nested messages, packed repeated fields), 5 (32-bit fixed for float, fixed32, sfixed32), 3/4 (deprecated start/end group).
  • Varint encoding: the value is split into 7-bit groups stored least-significant group first, one group per byte, with the most significant bit of each byte acting as a continuation bit (1 for more bytes, 0 to end).
  • ZigZag encoding for signed integers (sint32/sint64) to optimize negative values in varint.
  • Little-endian byte order for fixed-length numeric types.
  • Length-delimited fields prefixed with a varint length (max 2 GiB per field).
  • Packed repeated primitives concatenated within a single length-delimited field.
  • Nested messages encoded as length-delimited sub-sequences of tag-value pairs.
  • Repeated fields can appear multiple times, interleaved with other fields.
  • Maps encoded as repeated nested messages with key/value fields.
  • Unknown fields (unrecognized tags) are preserved for forward/backward compatibility.
  • Overall message size limit typically 2 GiB, though implementations may enforce smaller limits.
  • No built-in compression or encryption; data is platform-independent and extensible.

Two direct download links for files of format .PROTO:

Ghost blog embedded HTML JavaScript for drag-and-drop .PROTO file dumping:

Drag and drop a .PROTO file here
  1. Python class for .PROTO handling:
import struct

class ProtoParser:
    """Schema-less reader/writer for Protocol Buffers wire-format data.

    The file is decoded into a list of ``(field_number, wire_type, value)``
    tuples where ``value`` is:

    * an ``int`` for wire type 0 (varint),
    * ``bytes`` for wire types 1 and 5 (fixed 64/32-bit, raw little-endian),
    * for wire type 2 (length-delimited) either a nested list of tuples, when
      the payload itself decodes cleanly as a message, or the raw ``bytes``.

    Without a schema the message/bytes distinction is a heuristic: a payload
    is treated as a message only if it is non-empty and parses to exactly its
    declared length with valid field numbers and wire types.
    """

    # Largest field number the wire format allows (2**29 - 1).
    MAX_FIELD_NUMBER = (1 << 29) - 1
    # A 64-bit varint never needs more than ten bytes.
    MAX_VARINT_BYTES = 10

    def __init__(self, filename):
        """Read ``filename`` fully and decode it as a top-level message.

        Raises:
            OSError: if the file cannot be read.
            ValueError, IndexError: if the content is not valid wire format.
        """
        with open(filename, 'rb') as f:
            self.data = f.read()
        self.parsed = self.parse_message(0, len(self.data))

    def read_varint(self, pos):
        """Decode one base-128 varint starting at ``pos``.

        Returns:
            ``(value, next_pos)``.

        Raises:
            IndexError: if the buffer ends in the middle of the varint.
            ValueError: if the varint is longer than ten bytes.
        """
        result = 0
        for index in range(self.MAX_VARINT_BYTES):
            b = self.data[pos]
            pos += 1
            result |= (b & 0x7f) << (7 * index)
            if not (b & 0x80):
                return result, pos
        raise ValueError("Varint too long")

    def parse_message(self, start, end):
        """Decode the tag-value pairs stored in ``self.data[start:end]``.

        Returns:
            A list of ``(field_number, wire_type, value)`` tuples.

        Raises:
            ValueError: on an invalid field number, an unknown/unsupported
                wire type, or a field that extends past ``end``.
            IndexError: if a varint runs off the end of the buffer.
        """
        fields = []
        pos = start
        while pos < end:
            key, pos = self.read_varint(pos)
            field_number = key >> 3
            wire_type = key & 0x7
            if not 1 <= field_number <= self.MAX_FIELD_NUMBER:
                raise ValueError(f"Invalid field number: {field_number}")
            if wire_type == 0:
                value, pos = self.read_varint(pos)
            elif wire_type in (1, 5):
                width = 8 if wire_type == 1 else 4
                value = self.data[pos:pos + width]
                pos += width
            elif wire_type == 2:
                length, pos = self.read_varint(pos)
                sub_end = pos + length
                if sub_end > end:
                    raise ValueError("Length-delimited field overruns its enclosing message")
                value = self._decode_payload(pos, sub_end)
                pos = sub_end
            else:
                raise ValueError(f"Unknown wire type: {wire_type}")
            # Slices silently truncate, so an overrun has to be detected here.
            if pos > end:
                raise ValueError("Field overruns its enclosing message")
            fields.append((field_number, wire_type, value))
        return fields

    def _decode_payload(self, start, end):
        """Return a length-delimited payload as a nested message or raw bytes.

        An empty payload is always returned as ``b''`` because it carries no
        evidence of being a message.
        """
        raw = self.data[start:end]
        if not raw:
            return raw
        try:
            return self.parse_message(start, end)
        except (ValueError, IndexError, RecursionError):
            # Not a well-formed message: keep the bytes untouched.
            return raw

    def print_properties(self, fields=None, indent=''):
        """Print a human-readable dump of ``fields`` (default: the parsed file)."""
        if fields is None:
            fields = self.parsed
        for field_number, wire_type, value in fields:
            print(f"{indent}Field {field_number} (wire type {wire_type}): ", end='')
            if wire_type == 0:
                print(f"varint {value}")
            elif wire_type in (1, 5):
                print(f"fixed [{' '.join(f'{b:02x}' for b in value)}]")
            elif wire_type == 2:
                if isinstance(value, list):
                    print("message {")
                    self.print_properties(value, indent + '  ')
                    # A literal '}' must be doubled inside an f-string.
                    print(f"{indent}}}")
                else:
                    print(f"bytes [{' '.join(f'{b:02x}' for b in value)}]")

    def encode_message(self, fields):
        """Serialize a list of ``(field_number, wire_type, value)`` tuples.

        Returns:
            The encoded message as ``bytes``.
        """
        data = bytearray()
        for field_number, wire_type, value in fields:
            data += self.encode_varint((field_number << 3) | wire_type)
            if wire_type == 0:
                data += self.encode_varint(value)
            elif wire_type in (1, 5):
                data += value
            elif wire_type == 2:
                if isinstance(value, list):
                    sub_data = self.encode_message(value)
                else:
                    sub_data = value
                data += self.encode_varint(len(sub_data))
                data += sub_data
        return bytes(data)

    def encode_varint(self, value):
        """Encode a non-negative integer as a base-128 varint.

        Negative values are encoded as their 64-bit two's complement, which is
        how the wire format represents negative int32/int64 values.
        """
        if value < 0:
            value &= 0xFFFFFFFFFFFFFFFF
        data = bytearray()
        while True:
            byte = value & 0x7f
            value >>= 7
            if value:
                data.append(byte | 0x80)
            else:
                data.append(byte)
                return bytes(data)

    def write(self, filename):
        """Re-encode the parsed fields and write them to ``filename``."""
        data = self.encode_message(self.parsed)
        with open(filename, 'wb') as f:
            f.write(data)
  1. Java class for .PROTO handling:
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

public class ProtoParser {
    private byte[] data;
    private List<Field> parsed;

    public static class Field {
        int fieldNumber;
        int wireType;
        Object value; // Long for varint, byte[] for fixed/len, List<Field> for message

        public Field(int fn, int wt, Object v) {
            fieldNumber = fn;
            wireType = wt;
            value = v;
        }
    }

    public ProtoParser(String filename) throws IOException {
        FileInputStream fis = new FileInputStream(filename);
        data = new byte[fis.available()];
        fis.read(data);
        fis.close();
        parsed = parseMessage(0, data.length);
    }

    private int readVarint(int pos[], ByteBuffer bb) {
        long result = 0;
        int shift = 0;
        while (true) {
            byte b = data[pos[0]++];
            result |= (long)(b & 0x7F) << shift;
            if ((b & 0x80) == 0) break;
            shift += 7;
        }
        return (int) result;
    }

    private List<Field> parseMessage(int start, int end) {
        List<Field> fields = new ArrayList<>();
        int[] pos = {start};
        while (pos[0] < end) {
            int key = readVarint(pos, null);
            int fieldNumber = key >> 3;
            int wireType = key & 0x7;
            Object value;
            if (wireType == 0) {
                value = (long) readVarint(pos, null);
            } else if (wireType == 5) {
                byte[] val = new byte[4];
                System.arraycopy(data, pos[0], val, 0, 4);
                pos[0] += 4;
                value = val;
            } else if (wireType == 1) {
                byte[] val = new byte[8];
                System.arraycopy(data, pos[0], val, 0, 8);
                pos[0] += 8;
                value = val;
            } else if (wireType == 2) {
                int length = readVarint(pos, null);
                int subStart = pos[0];
                int subEnd = pos[0] + length;
                List<Field> subParsed = null;
                try {
                    subParsed = parseMessage(subStart, subEnd);
                    if (pos[0] == subEnd) {
                        value = subParsed;
                    } else {
                        byte[] val = new byte[length];
                        System.arraycopy(data, subStart, val, 0, length);
                        value = val;
                    }
                } catch (Exception e) {
                    byte[] val = new byte[length];
                    System.arraycopy(data, subStart, val, 0, length);
                    value = val;
                }
                pos[0] = subEnd;
            } else {
                throw new RuntimeException("Unknown wire type: " + wireType);
            }
            fields.add(new Field(fieldNumber, wireType, value));
        }
        return fields;
    }

    public void printProperties() {
        printProperties(parsed, "");
    }

    private void printProperties(List<Field> fields, String indent) {
        for (Field field : fields) {
            System.out.print(indent + "Field " + field.fieldNumber + " (wire type " + field.wireType + "): ");
            if (field.wireType == 0) {
                System.out.println("varint " + field.value);
            } else if (field.wireType == 1 || field.wireType == 5) {
                byte[] bytes = (byte[]) field.value;
                System.out.print("fixed [");
                for (byte b : bytes) System.out.print(String.format("%02x ", b));
                System.out.println("]");
            } else if (field.wireType == 2) {
                if (field.value instanceof List) {
                    System.out.println("message {");
                    printProperties((List<Field>) field.value, indent + "  ");
                    System.out.println(indent + "}");
                } else {
                    byte[] bytes = (byte[]) field.value;
                    System.out.print("bytes [");
                    for (byte b : bytes) System.out.print(String.format("%02x ", b));
                    System.out.println("]");
                }
            }
        }
    }

    public void write(String filename) throws IOException {
        byte[] encoded = encodeMessage(parsed);
        FileOutputStream fos = new FileOutputStream(filename);
        fos.write(encoded);
        fos.close();
    }

    private byte[] encodeMessage(List<Field> fields) {
        List<Byte> bytesList = new ArrayList<>();
        for (Field field : fields) {
            int key = (field.fieldNumber << 3) | field.wireType;
            appendVarint(bytesList, key);
            if (field.wireType == 0) {
                appendVarint(bytesList, ((Long) field.value).intValue());
            } else if (field.wireType == 1 || field.wireType == 5) {
                byte[] val = (byte[]) field.value;
                for (byte b : val) bytesList.add(b);
            } else if (field.wireType == 2) {
                byte[] subData;
                if (field.value instanceof List) {
                    subData = encodeMessage((List<Field>) field.value);
                } else {
                    subData = (byte[]) field.value;
                }
                appendVarint(bytesList, subData.length);
                for (byte b : subData) bytesList.add(b);
            }
        }
        byte[] result = new byte[bytesList.size()];
        for (int i = 0; i < result.length; i++) result[i] = bytesList.get(i);
        return result;
    }

    private void appendVarint(List<Byte> list, int value) {
        while (true) {
            int byteVal = value & 0x7f;
            value >>= 7;
            if (value != 0) byteVal |= 0x80;
            list.add((byte) byteVal);
            if (value == 0) break;
        }
    }
}
  1. JavaScript class for .PROTO handling:
/**
 * Schema-less reader/writer for Protocol Buffers wire-format data (Node.js).
 *
 * The file is decoded into an array of `{ fieldNumber, wireType, value }`
 * objects where `value` is:
 *   - a Number for wire type 0 (a BigInt when the varint exceeds
 *     Number.MAX_SAFE_INTEGER, so no precision is lost),
 *   - a Uint8Array view for wire types 1 and 5 (raw little-endian bytes),
 *   - for wire type 2 either a nested array of fields, when the payload
 *     decodes cleanly as a message, or a Uint8Array of the raw bytes.
 *
 * Without a schema the message/bytes distinction is a heuristic: a payload is
 * treated as a message only if it is non-empty and parses to exactly its
 * declared length with valid field numbers and wire types.
 */
class ProtoParser {
  /**
   * @param {string} filename Path of the wire-format file to read and decode.
   * @throws {Error} If the file cannot be read or is not valid wire format.
   */
  constructor(filename) {
    // Note: In Node.js, use fs to read file
    const fs = require('fs');
    this.data = new Uint8Array(fs.readFileSync(filename));
    this.parsed = this.parseMessage(0, this.data.length);
  }

  /**
   * Decodes one base-128 varint starting at `pos`.
   *
   * @param {number} pos Offset into `this.data`.
   * @returns {[number|bigint, number]} The value and the offset just past it.
   * @throws {Error} If the buffer ends mid-varint or the varint exceeds ten bytes.
   */
  readVarint(pos) {
    let result = 0n;
    let shift = 0n;
    // A 64-bit varint never needs more than ten bytes.
    for (let count = 0; count < 10; count++) {
      // Indexing past the end yields undefined, which would decode as 0.
      if (pos >= this.data.length) {
        throw new Error('Truncated varint');
      }
      const b = this.data[pos++];
      result |= BigInt(b & 0x7F) << shift;
      if (!(b & 0x80)) {
        const value = result <= BigInt(Number.MAX_SAFE_INTEGER) ? Number(result) : result;
        return [value, pos];
      }
      shift += 7n;
    }
    throw new Error('Varint too long');
  }

  /**
   * Decodes the tag-value pairs stored in `this.data[start, end)`.
   *
   * @param {number} start Inclusive start offset.
   * @param {number} end Exclusive end offset.
   * @returns {Array<{fieldNumber: number, wireType: number, value: *}>}
   * @throws {Error} On an invalid key, an unknown wire type, or a field that
   *   extends past `end`.
   */
  parseMessage(start, end) {
    const fields = [];
    let pos = start;
    while (pos < end) {
      let key;
      [key, pos] = this.readVarint(pos);
      // Valid keys fit in 32 bits (29-bit field number + 3-bit wire type) and
      // have a non-zero field number.
      if (typeof key !== 'number' || key > 0xFFFFFFFF || key < 8) {
        throw new Error(`Invalid field key: ${key}`);
      }
      const fieldNumber = key >>> 3;
      const wireType = key & 0x7;
      let value;
      if (wireType === 0) {
        [value, pos] = this.readVarint(pos);
      } else if (wireType === 1 || wireType === 5) {
        const width = wireType === 1 ? 8 : 4;
        value = this.data.subarray(pos, pos + width);
        pos += width;
      } else if (wireType === 2) {
        let length;
        [length, pos] = this.readVarint(pos);
        if (typeof length !== 'number' || length > end - pos) {
          throw new Error('Length-delimited field overruns its enclosing message');
        }
        const subEnd = pos + length;
        value = this.data.subarray(pos, subEnd);
        // An empty payload carries no evidence of being a message: keep bytes.
        if (length > 0) {
          try {
            value = this.parseMessage(pos, subEnd);
          } catch {
            // Not a well-formed message: keep the raw bytes assigned above.
          }
        }
        pos = subEnd;
      } else {
        throw new Error(`Unknown wire type: ${wireType}`);
      }
      // subarray() silently truncates, so an overrun has to be detected here.
      if (pos > end) {
        throw new Error('Field overruns its enclosing message');
      }
      fields.push({ fieldNumber, wireType, value });
    }
    return fields;
  }

  /**
   * Logs a human-readable dump of `fields`, one line per field.
   *
   * @param {Array} [fields=this.parsed] Fields to print.
   * @param {string} [indent=''] Prefix for every line.
   */
  printProperties(fields = this.parsed, indent = '') {
    const toHex = bytes =>
      Array.from(bytes).map(b => b.toString(16).padStart(2, '0')).join(' ');
    for (const field of fields) {
      // console.log always ends the line, so build each line in full first.
      const header = `${indent}Field ${field.fieldNumber} (wire type ${field.wireType}): `;
      if (field.wireType === 0) {
        console.log(`${header}varint ${field.value}`);
      } else if (field.wireType === 1 || field.wireType === 5) {
        console.log(`${header}fixed [${toHex(field.value)}]`);
      } else if (field.wireType === 2) {
        if (Array.isArray(field.value)) {
          console.log(`${header}message {`);
          this.printProperties(field.value, indent + '  ');
          console.log(`${indent}}`);
        } else {
          console.log(`${header}bytes [${toHex(field.value)}]`);
        }
      }
    }
  }

  /**
   * Serializes an array of fields back into wire format.
   *
   * @param {Array} fields Fields as produced by `parseMessage`.
   * @returns {Uint8Array} The encoded message.
   */
  encodeMessage(fields) {
    const bytes = [];
    // Spreading a large payload into push() can exceed the argument limit.
    const appendAll = chunk => {
      for (const b of chunk) bytes.push(b);
    };
    for (const field of fields) {
      // Multiply instead of shifting: `<< 3` is a 32-bit signed operation and
      // goes negative for field numbers >= 2^28.
      this.appendVarint(bytes, field.fieldNumber * 8 + field.wireType);
      if (field.wireType === 0) {
        this.appendVarint(bytes, field.value);
      } else if (field.wireType === 1 || field.wireType === 5) {
        appendAll(field.value);
      } else if (field.wireType === 2) {
        const subData = Array.isArray(field.value)
          ? this.encodeMessage(field.value)
          : field.value;
        this.appendVarint(bytes, subData.length);
        appendAll(subData);
      }
    }
    return new Uint8Array(bytes);
  }

  /**
   * Appends `value` to `bytes` as a base-128 varint.
   *
   * The value is handled as an unsigned 64-bit BigInt, so values above 2^31
   * encode correctly and negative values become their 64-bit two's complement.
   *
   * @param {number[]} bytes Output array, mutated in place.
   * @param {number|bigint} value Integer to encode.
   */
  appendVarint(bytes, value) {
    let rest = BigInt.asUintN(64, BigInt(value));
    do {
      let byteVal = Number(rest & 0x7Fn);
      rest >>= 7n;
      if (rest !== 0n) byteVal |= 0x80;
      bytes.push(byteVal);
    } while (rest !== 0n);
  }

  /**
   * Re-encodes the parsed fields and writes them to `filename`.
   *
   * @param {string} filename Destination path.
   */
  write(filename) {
    const fs = require('fs');
    const encoded = this.encodeMessage(this.parsed);
    fs.writeFileSync(filename, encoded);
  }
}
  1. C++ class for .PROTO handling:
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

/// One decoded tag-value pair of a wire-format message.
struct Field {
    uint32_t field_number = 0;
    uint32_t wire_type = 0;
    /// Varint fields: the value re-encoded as a canonical varint.
    /// Fixed-width fields and opaque length-delimited fields: the raw payload bytes.
    std::vector<uint8_t> value;
    /// Non-empty only for length-delimited fields that decoded as a nested message.
    std::vector<Field> sub_fields;
};

/// Schema-less reader/writer for Protocol Buffers wire-format data.
///
/// Without a schema the distinction between a nested message and an opaque byte
/// string is a heuristic: a length-delimited payload is treated as a message only
/// if it is non-empty and parses to exactly its declared length with valid field
/// numbers and wire types.
class ProtoParser {
private:
    /// Largest field number the wire format allows (2^29 - 1).
    static constexpr uint64_t kMaxFieldNumber = (1ULL << 29) - 1;

    std::vector<uint8_t> data;
    std::vector<Field> parsed;

    /// Decodes one base-128 varint from `bytes` at `pos`, advancing `pos` past it.
    /// @param end exclusive limit the varint must not cross.
    /// @throws std::runtime_error if the input ends mid-varint or the varint
    ///         is longer than ten bytes.
    static uint64_t decode_varint(const std::vector<uint8_t>& bytes, size_t& pos, size_t end) {
        uint64_t result = 0;
        // Shifts 0, 7, ..., 63 cover the ten bytes a 64-bit varint can occupy.
        for (int shift = 0; shift < 64; shift += 7) {
            if (pos >= end) throw std::runtime_error("Truncated varint");
            const uint8_t b = bytes[pos++];
            result |= static_cast<uint64_t>(b & 0x7F) << shift;
            if (!(b & 0x80)) return result;
        }
        throw std::runtime_error("Varint too long");
    }

    /// Reads a varint from the loaded file, bounded by `end`.
    uint64_t read_varint(size_t& pos, size_t end) const {
        return decode_varint(data, pos, end);
    }

    /// Reads a varint from a standalone byte vector (used for stored varint fields).
    static uint64_t read_varint_from_bytes(const std::vector<uint8_t>& bytes, size_t& pos) {
        return decode_varint(bytes, pos, bytes.size());
    }

    /// Decodes the tag-value pairs stored in `data[start, end)`.
    /// @throws std::runtime_error on an invalid field number, an unknown wire
    ///         type, or a field that extends past `end`.
    std::vector<Field> parse_message(size_t start, size_t end) const {
        std::vector<Field> fields;
        size_t pos = start;
        while (pos < end) {
            const uint64_t key = read_varint(pos, end);
            const uint64_t number = key >> 3;
            if (number == 0 || number > kMaxFieldNumber) {
                throw std::runtime_error("Invalid field number");
            }
            Field f;
            f.field_number = static_cast<uint32_t>(number);
            f.wire_type = static_cast<uint32_t>(key & 0x7);
            if (f.wire_type == 0) {
                // Store as bytes for consistency with the other wire types.
                f.value = encode_varint(read_varint(pos, end));
            } else if (f.wire_type == 1 || f.wire_type == 5) {
                const size_t width = (f.wire_type == 1) ? 8 : 4;
                if (end - pos < width) throw std::runtime_error("Truncated fixed-width field");
                f.value.assign(data.begin() + pos, data.begin() + pos + width);
                pos += width;
            } else if (f.wire_type == 2) {
                const uint64_t length = read_varint(pos, end);
                // Compare against the remaining space so pos + length cannot overflow.
                if (length > end - pos) {
                    throw std::runtime_error("Length-delimited field overruns its enclosing message");
                }
                const size_t sub_end = pos + static_cast<size_t>(length);
                if (length > 0) {
                    try {
                        // Succeeds only if the payload is consumed exactly.
                        f.sub_fields = parse_message(pos, sub_end);
                    } catch (const std::runtime_error&) {
                        // Not a well-formed message: fall back to raw bytes below.
                    }
                }
                if (f.sub_fields.empty()) {
                    f.value.assign(data.begin() + pos, data.begin() + sub_end);
                }
                pos = sub_end;
            } else {
                throw std::runtime_error("Unknown wire type");
            }
            fields.push_back(std::move(f));
        }
        return fields;
    }

    /// Writes `bytes` to std::cout as space-separated two-digit hex values,
    /// restoring the stream's formatting state afterwards.
    static void print_hex(const std::vector<uint8_t>& bytes) {
        const std::ios_base::fmtflags saved_flags = std::cout.flags();
        const char saved_fill = std::cout.fill('0');
        std::cout << std::hex;
        for (size_t i = 0; i < bytes.size(); ++i) {
            if (i != 0) std::cout << ' ';
            std::cout << std::setw(2) << static_cast<int>(bytes[i]);
        }
        std::cout.flags(saved_flags);
        std::cout.fill(saved_fill);
    }

    /// Prints a human-readable dump of `fields`, prefixing every line with `indent`.
    void print_properties(const std::vector<Field>& fields, const std::string& indent) const {
        for (const auto& field : fields) {
            std::cout << indent << "Field " << field.field_number
                      << " (wire type " << field.wire_type << "): ";
            if (field.wire_type == 0) {
                size_t p = 0;
                std::cout << "varint " << read_varint_from_bytes(field.value, p) << '\n';
            } else if (field.wire_type == 1 || field.wire_type == 5) {
                std::cout << "fixed [";
                print_hex(field.value);
                std::cout << "]\n";
            } else if (field.wire_type == 2) {
                if (!field.sub_fields.empty()) {
                    std::cout << "message {\n";
                    print_properties(field.sub_fields, indent + "  ");
                    std::cout << indent << "}\n";
                } else {
                    std::cout << "bytes [";
                    print_hex(field.value);
                    std::cout << "]\n";
                }
            }
        }
    }

    /// Encodes `value` as a base-128 varint.
    static std::vector<uint8_t> encode_varint(uint64_t value) {
        std::vector<uint8_t> bytes;
        while (value >= 0x80) {
            bytes.push_back(static_cast<uint8_t>((value & 0x7F) | 0x80));
            value >>= 7;
        }
        bytes.push_back(static_cast<uint8_t>(value));
        return bytes;
    }

    /// Serializes `fields` back into wire format.
    std::vector<uint8_t> encode_message(const std::vector<Field>& fields) const {
        std::vector<uint8_t> result;
        for (const auto& field : fields) {
            const uint64_t key = (static_cast<uint64_t>(field.field_number) << 3) | field.wire_type;
            const auto key_bytes = encode_varint(key);
            result.insert(result.end(), key_bytes.begin(), key_bytes.end());
            if (field.wire_type == 0 || field.wire_type == 1 || field.wire_type == 5) {
                // Varint and fixed-width payloads are already stored in wire form.
                result.insert(result.end(), field.value.begin(), field.value.end());
            } else if (field.wire_type == 2) {
                const std::vector<uint8_t> sub_data =
                    field.sub_fields.empty() ? field.value : encode_message(field.sub_fields);
                const auto len_bytes = encode_varint(sub_data.size());
                result.insert(result.end(), len_bytes.begin(), len_bytes.end());
                result.insert(result.end(), sub_data.begin(), sub_data.end());
            }
        }
        return result;
    }

public:
    /// Reads `filename` completely and decodes it as a top-level message.
    /// @throws std::runtime_error if the file cannot be opened or its content
    ///         is not valid wire format.
    ProtoParser(const std::string& filename) {
        std::ifstream file(filename, std::ios::binary);
        if (!file) throw std::runtime_error("Cannot open file");
        data = std::vector<uint8_t>((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
        parsed = parse_message(0, data.size());
    }

    /// Prints a human-readable dump of the parsed file to std::cout.
    void print_properties() const {
        print_properties(parsed, "");
    }

    /// Re-encodes the parsed fields and writes them to `filename`.
    /// @throws std::runtime_error if the file cannot be opened or written.
    void write(const std::string& filename) const {
        const auto encoded = encode_message(parsed);
        std::ofstream file(filename, std::ios::binary);
        if (!file) throw std::runtime_error("Cannot open file for writing");
        file.write(reinterpret_cast<const char*>(encoded.data()),
                   static_cast<std::streamsize>(encoded.size()));
        if (!file) throw std::runtime_error("Failed to write file");
    }
};