Task 283: .HDF5 File Format
Task 283: .HDF5 File Format
The Hierarchical Data Format version 5 (HDF5) file format specifications are maintained by The HDF Group. The latest version is the HDF5 File Format Specification Version 3.0, accessible at the official documentation site.
- The properties intrinsic to the HDF5 file format, as derived from its superblock structure (which defines core file system characteristics such as addressing, consistency, and layout), are as follows for the latest superblock version (version 3). These properties are essential for file identification, navigation, and integrity:
- Format Signature: An 8-byte constant sequence used to identify the file as HDF5 and detect potential corruption or transfer issues.
- Version Number of the Superblock: A 1-byte value indicating the superblock format version (e.g., 3).
- Size of Offsets: A 1-byte value specifying the number of bytes used for storing addresses within the file.
- Size of Lengths: A 1-byte value specifying the number of bytes used for storing object sizes.
- File Consistency Flags: A 1-byte field containing flags for file locking and access modes (e.g., single-writer/multiple-reader support).
- Base Address: A variable-length field (determined by Size of Offsets) representing the absolute address of the first byte of HDF5 data.
- Superblock Extension Address: A variable-length field (determined by Size of Offsets) pointing to any superblock extension, or an undefined value if absent.
- End of File Address: A variable-length field (determined by Size of Offsets) indicating the address immediately following the last byte of HDF5 data.
- Root Group Object Header Address: A variable-length field (determined by Size of Offsets) pointing to the root group's object header, serving as the entry point to the file's hierarchical structure.
- Superblock Checksum: A 4-byte checksum for verifying the integrity of the superblock.
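Note that the earlier superblock versions 0 and 1 use a different layout with additional properties (version numbers for free-space storage, the root group symbol table entry, and the shared header message format, plus group B-tree node K values), whereas version 2 has the same field layout as version 3. The list above therefore represents the core set for modern files.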
- Two direct download links for sample HDF5 files are:
- https://people.math.sc.edu/Burkardt/data/hdf5/test1.h5
- https://github.com/openPMD/openPMD-example-datasets/raw/main/structure.h5
- The following is an HTML snippet with embedded JavaScript suitable for embedding in a Ghost blog post. It creates a drag-and-drop area for an HDF5 file and parses the superblock to display the properties on the screen. The parser handles superblock versions 0-3 for compatibility with various files.
Drop HDF5 file here
- The following Python class provides functionality to open an HDF5 file, decode and read the superblock properties, print them to the console, and includes a basic write method to recreate a minimal superblock in a new file (for demonstration; full file writing requires additional implementation beyond properties).
import struct
import os
class HDF5Parser:
    """Decode the HDF5 superblock stored at the beginning of a file.

    The decoded fields are exposed through ``self.properties`` as a dict that
    maps a human-readable field name to its value (integers, except for
    ``'Format Signature'`` which is a string).
    """

    # The 8-byte constant every HDF5 superblock starts with.
    SIGNATURE = b'\x89HDF\r\n\x1a\n'
    # Number of leading bytes examined when decoding the superblock.
    READ_LIMIT = 1024

    def __init__(self, filepath):
        """Remember ``filepath``; nothing is read until ``read_and_decode``."""
        self.filepath = filepath
        self.properties = {}

    def read_and_decode(self):
        """Read the superblock at offset 0 and fill ``self.properties``.

        Supports superblock versions 0 through 3. ``self.properties`` is only
        replaced when decoding succeeds, so a failed call leaves the previous
        contents untouched.

        Raises:
            ValueError: if the signature does not match, the data ends before
                the superblock is complete, or the version is not 0-3.
            OSError: if the file cannot be opened or read.
        """
        with open(self.filepath, 'rb') as f:
            data = f.read(self.READ_LIMIT)
        if data[:8] != self.SIGNATURE:
            raise ValueError("Invalid HDF5 signature")

        pos = 8

        def take(count):
            """Consume ``count`` bytes and return them as a little-endian int."""
            nonlocal pos
            chunk = data[pos:pos + count]
            if len(chunk) != count:
                # Slicing never fails on short data, so check explicitly
                # instead of decoding a silently shortened field.
                raise ValueError("Truncated HDF5 superblock")
            pos += count
            return int.from_bytes(chunk, 'little')

        props = {'Format Signature': self.SIGNATURE.decode('latin-1')}
        version = take(1)
        props['Superblock Version'] = version

        if version in (0, 1):
            props['Free Space Version'] = take(1)
            props['Root Group Version'] = take(1)
            take(1)  # Reserved
            props['Shared Header Message Version'] = take(1)
            offset_size = take(1)
            props['Size of Offsets'] = offset_size
            props['Size of Lengths'] = take(1)
            take(1)  # Reserved
            props['Group Leaf Node K'] = take(2)
            props['Group Internal Node K'] = take(2)
            # Both version 0 and version 1 store 4 bytes of consistency flags.
            props['File Consistency Flags'] = take(4)
            if version == 1:
                # Version 1 inserts two extra 2-byte fields before the
                # addresses; skipping them would shift every address by 4.
                props['Indexed Storage Internal Node K'] = take(2)
                take(2)  # Reserved
            props['Base Address'] = take(offset_size)
            props['Free Space Address'] = take(offset_size)
            props['End of File Address'] = take(offset_size)
            props['Driver Info Address'] = take(offset_size)
        elif version in (2, 3):
            offset_size = take(1)
            props['Size of Offsets'] = offset_size
            props['Size of Lengths'] = take(1)
            props['File Consistency Flags'] = take(1)
            props['Base Address'] = take(offset_size)
            props['Superblock Extension Address'] = take(offset_size)
            props['End of File Address'] = take(offset_size)
            props['Root Group Object Header Address'] = take(offset_size)
            props['Superblock Checksum'] = take(4)
        else:
            raise ValueError("Unsupported superblock version")

        self.properties = props

    def print_properties(self):
        """Print every decoded property as ``name: value``, one per line."""
        for key, value in self.properties.items():
            print(f"{key}: {value}")

    def write_minimal(self, output_path):
        """Write a 48-byte version 3 superblock with example values.

        The output is NOT a complete HDF5 file: the checksum is a zero
        placeholder and no object data follows the superblock.
        """
        undefined_address = 0xFFFFFFFFFFFFFFFF
        superblock = struct.pack(
            '<8s4B4QI',
            self.SIGNATURE,
            3,                  # Superblock version
            8,                  # Size of offsets
            8,                  # Size of lengths
            0,                  # File consistency flags
            0,                  # Base address
            undefined_address,  # Superblock extension address (undefined)
            1024,               # Example end-of-file address
            96,                 # Example root group object header address
            0,                  # Checksum placeholder
        )
        with open(output_path, 'wb') as f:
            f.write(superblock)
- The following Java class provides similar functionality to open an HDF5 file, decode and read the superblock properties, print them to the console, and includes a basic write method to recreate a minimal superblock in a new file.
import java.io.*;
import java.nio.*;
import java.nio.channels.FileChannel;
import java.util.HashMap;
import java.util.Map;
public class HDF5Parser {
private String filepath;
private Map<String, Object> properties = new HashMap<>();
public HDF5Parser(String filepath) {
this.filepath = filepath;
}
public void readAndDecode() throws IOException {
try (RandomAccessFile file = new RandomAccessFile(filepath, "r");
FileChannel channel = file.getChannel()) {
ByteBuffer buffer = ByteBuffer.allocate(1024).order(ByteOrder.LITTLE_ENDIAN);
channel.read(buffer);
buffer.flip();
byte[] sig = new byte[8];
buffer.get(sig);
if (!new String(sig).equals("\u0089HDF\r\n\u001a\n")) {
throw new IOException("Invalid HDF5 signature");
}
properties.put("Format Signature", new String(sig));
int version = buffer.get() & 0xFF;
properties.put("Superblock Version", version);
if (version == 0 || version == 1) {
properties.put("Free Space Version", buffer.get() & 0xFF);
properties.put("Root Group Version", buffer.get() & 0xFF);
buffer.get(); // Reserved
properties.put("Shared Header Message Version", buffer.get() & 0xFF);
int offsetSize = buffer.get() & 0xFF;
properties.put("Size of Offsets", offsetSize);
int lengthSize = buffer.get() & 0xFF;
properties.put("Size of Lengths", lengthSize);
buffer.get(); // Reserved
properties.put("Group Leaf Node K", buffer.getShort() & 0xFFFF);
properties.put("Group Internal Node K", buffer.getShort() & 0xFFFF);
if (version == 0) {
buffer.position(buffer.position() + 4); // Reserved
} else {
properties.put("File Consistency Flags", buffer.getInt());
}
properties.put("Base Address", getLong(buffer, offsetSize));
properties.put("Free Space Address", getLong(buffer, offsetSize));
properties.put("End of File Address", getLong(buffer, offsetSize));
properties.put("Driver Info Address", getLong(buffer, offsetSize));
} else if (version == 2 || version == 3) {
int offsetSize = buffer.get() & 0xFF;
properties.put("Size of Offsets", offsetSize);
int lengthSize = buffer.get() & 0xFF;
properties.put("Size of Lengths", lengthSize);
properties.put("File Consistency Flags", buffer.get() & 0xFF);
properties.put("Base Address", getLong(buffer, offsetSize));
properties.put("Superblock Extension Address", getLong(buffer, offsetSize));
properties.put("End of File Address", getLong(buffer, offsetSize));
properties.put("Root Group Object Header Address", getLong(buffer, offsetSize));
properties.put("Superblock Checksum", buffer.getInt());
} else {
throw new IOException("Unsupported superblock version");
}
}
}
private long getLong(ByteBuffer buffer, int size) {
long val = 0;
for (int i = 0; i < size; i++) {
val |= (long) (buffer.get() & 0xFF) << (i * 8);
}
return val;
}
public void printProperties() {
for (Map.Entry<String, Object> entry : properties.entrySet()) {
System.out.println(entry.getKey() + ": " + entry.getValue());
}
}
public void writeMinimal(String outputPath) throws IOException {
try (RandomAccessFile file = new RandomAccessFile(outputPath, "rw");
FileChannel channel = file.getChannel()) {
ByteBuffer buffer = ByteBuffer.allocate(1024).order(ByteOrder.LITTLE_ENDIAN);
buffer.put("\u0089HDF\r\n\u001a\n".getBytes());
buffer.put((byte) 3); // Version
buffer.put((byte) 8); // Offset size
buffer.put((byte) 8); // Length size
buffer.put((byte) 0); // Flags
putLong(buffer, 0, 8); // Base addr
putLong(buffer, -1L, 8); // Ext addr (undefined)
putLong(buffer, 1024, 8); // EOF
putLong(buffer, 96, 8); // Root addr
buffer.putInt(0); // Checksum placeholder
buffer.flip();
channel.write(buffer);
}
}
private void putLong(ByteBuffer buffer, long val, int size) {
for (int i = 0; i < size; i++) {
buffer.put((byte) (val >>> (i * 8)));
}
}
}
- The following JavaScript class (for Node.js environment, requiring 'fs' module) provides functionality to open an HDF5 file, decode and read the superblock properties, print them to the console, and includes a basic write method to recreate a minimal superblock in a new file.
const fs = require('fs');
/**
 * Decodes the HDF5 superblock stored at the beginning of a file and can write
 * a minimal example superblock. Decoded fields are exposed on
 * `this.properties`; address fields are BigInt, all other numeric fields are
 * Number, and 'Format Signature' is a string.
 */
class HDF5Parser {
  /**
   * @param {string} filepath Path of the file to decode; nothing is read
   *   until `readAndDecode()` is called.
   */
  constructor(filepath) {
    this.filepath = filepath;
    this.properties = {};
  }

  /**
   * Reads the superblock at offset 0 and fills `this.properties`. Supports
   * superblock versions 0 through 3. `this.properties` is only replaced when
   * decoding succeeds.
   *
   * @throws {Error} If the signature does not match, the superblock is
   *   truncated, or the version is not 0-3. File-system errors propagate
   *   unchanged.
   */
  readAndDecode() {
    // Only the leading bytes are needed; avoid loading the whole file.
    const scratch = Buffer.alloc(1024);
    const fd = fs.openSync(this.filepath, 'r');
    let data;
    try {
      const bytesRead = fs.readSync(fd, scratch, 0, scratch.length, 0);
      data = scratch.subarray(0, bytesRead);
    } finally {
      fs.closeSync(fd);
    }
    // A Buffer may be a window into a larger ArrayBuffer, so the view must
    // honour its byteOffset/byteLength.
    const view = new DataView(data.buffer, data.byteOffset, data.byteLength);

    const signature = [137, 72, 68, 70, 13, 10, 26, 10];
    if (data.length < signature.length || signature.some((b, i) => data[i] !== b)) {
      throw new Error('Invalid HDF5 signature');
    }

    let pos = signature.length;
    const need = (count) => {
      if (pos + count > view.byteLength) {
        throw new Error('Truncated HDF5 superblock');
      }
    };
    // Small fixed-width fields (at most 4 bytes) fit exactly in a Number.
    const readUint = (count) => {
      need(count);
      const value = Number(this.getBigInt(view, pos, count, true));
      pos += count;
      return value;
    };
    // Addresses can be up to 64 bits wide, so they stay BigInt.
    const readAddress = (count) => {
      need(count);
      const value = this.getBigInt(view, pos, count, true);
      pos += count;
      return value;
    };

    const props = {};
    props['Format Signature'] = String.fromCharCode(...signature);
    const version = readUint(1);
    props['Superblock Version'] = version;

    if (version === 0 || version === 1) {
      props['Free Space Version'] = readUint(1);
      props['Root Group Version'] = readUint(1);
      readUint(1); // Reserved
      props['Shared Header Message Version'] = readUint(1);
      const offsetSize = readUint(1);
      props['Size of Offsets'] = offsetSize;
      props['Size of Lengths'] = readUint(1);
      readUint(1); // Reserved
      props['Group Leaf Node K'] = readUint(2);
      props['Group Internal Node K'] = readUint(2);
      // Both version 0 and version 1 store 4 bytes of consistency flags.
      props['File Consistency Flags'] = readUint(4);
      if (version === 1) {
        // Version 1 inserts two extra 2-byte fields before the addresses;
        // skipping them would shift every address by 4 bytes.
        props['Indexed Storage Internal Node K'] = readUint(2);
        readUint(2); // Reserved
      }
      props['Base Address'] = readAddress(offsetSize);
      props['Free Space Address'] = readAddress(offsetSize);
      props['End of File Address'] = readAddress(offsetSize);
      props['Driver Info Address'] = readAddress(offsetSize);
    } else if (version === 2 || version === 3) {
      const offsetSize = readUint(1);
      props['Size of Offsets'] = offsetSize;
      props['Size of Lengths'] = readUint(1);
      props['File Consistency Flags'] = readUint(1);
      props['Base Address'] = readAddress(offsetSize);
      props['Superblock Extension Address'] = readAddress(offsetSize);
      props['End of File Address'] = readAddress(offsetSize);
      props['Root Group Object Header Address'] = readAddress(offsetSize);
      props['Superblock Checksum'] = readUint(4);
    } else {
      throw new Error('Unsupported superblock version');
    }

    this.properties = props;
  }

  /**
   * Reads `size` bytes starting at `pos` as an unsigned integer.
   *
   * @param {DataView} view Source bytes.
   * @param {number} pos Offset of the first byte within `view`.
   * @param {number} size Number of bytes to read.
   * @param {boolean} littleEndian True when the byte at `pos` is the least
   *   significant one.
   * @returns {bigint} The decoded value.
   */
  getBigInt(view, pos, size, littleEndian) {
    let val = 0n;
    for (let i = 0; i < size; i++) {
      // Accumulate from the most significant byte down; for little-endian
      // data that byte is the LAST one in memory.
      const index = littleEndian ? size - 1 - i : i;
      val = (val << 8n) | BigInt(view.getUint8(pos + index));
    }
    return val;
  }

  /** Logs the decoded properties object to the console. */
  printProperties() {
    console.log(this.properties);
  }

  /**
   * Writes a 48-byte version 3 superblock with example values. The result is
   * NOT a complete HDF5 file: the checksum is a zero placeholder and no
   * object data follows the superblock.
   *
   * @param {string} outputPath Destination file path.
   */
  writeMinimal(outputPath) {
    const buffer = Buffer.alloc(48);
    const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
    const sig = [137, 72, 68, 70, 13, 10, 26, 10];
    sig.forEach((b, i) => view.setUint8(i, b));
    view.setUint8(8, 3); // Version
    view.setUint8(9, 8); // Offset size
    view.setUint8(10, 8); // Length size
    view.setUint8(11, 0); // Flags
    this.setBigInt(view, 12, 0n, 8, true); // Base addr
    this.setBigInt(view, 20, 18446744073709551615n, 8, true); // Ext addr (undefined)
    this.setBigInt(view, 28, 1024n, 8, true); // EOF
    this.setBigInt(view, 36, 96n, 8, true); // Root addr
    view.setUint32(44, 0, true); // Checksum placeholder
    fs.writeFileSync(outputPath, buffer);
  }

  /**
   * Writes the low `size` bytes of `val` starting at `pos`.
   *
   * @param {DataView} view Destination bytes.
   * @param {number} pos Offset of the first byte within `view`.
   * @param {bigint} val Value to store.
   * @param {number} size Number of bytes to write.
   * @param {boolean} littleEndian True to store the least significant byte
   *   at `pos`.
   */
  setBigInt(view, pos, val, size, littleEndian) {
    for (let i = 0; i < size; i++) {
      view.setUint8(pos + (littleEndian ? i : size - 1 - i), Number(val & 255n));
      val >>= 8n;
    }
  }
}
- The following C++ class provides functionality to open an HDF5 file, decode and read the superblock properties, print them to the console, and includes a basic write method to recreate a minimal superblock in a new file.
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>
/// Decodes the HDF5 superblock stored at the beginning of a file and can
/// write a minimal example superblock.
class HDF5Parser {
private:
    std::string filepath;
    // Field name -> decoded value. The format signature is not stored because
    // the map only holds integers.
    std::map<std::string, uint64_t> properties;

public:
    /// @param fp Path of the file to decode; nothing is read until
    ///           readAndDecode() is called.
    HDF5Parser(const std::string& fp) : filepath(fp) {}

    /// Reads the superblock at offset 0 and stores its fields. Supports
    /// superblock versions 0 through 3. Previously decoded properties are
    /// only replaced when decoding succeeds.
    ///
    /// @throws std::runtime_error if the file cannot be opened, the signature
    ///         does not match, the superblock is truncated, the size of
    ///         offsets is outside 1-8, or the version is not 0-3.
    void readAndDecode() {
        std::ifstream file(filepath, std::ios::binary);
        if (!file) {
            throw std::runtime_error("Cannot open file");
        }
        std::vector<uint8_t> data(1024);
        file.read(reinterpret_cast<char*>(data.data()),
                  static_cast<std::streamsize>(data.size()));
        // Keep only what was actually read so short files are detected
        // instead of being decoded from zero padding.
        data.resize(static_cast<size_t>(file.gcount()));

        static const uint8_t kSignature[8] = {137, 72, 68, 70, 13, 10, 26, 10};
        if (data.size() < sizeof kSignature ||
            std::memcmp(data.data(), kSignature, sizeof kSignature) != 0) {
            throw std::runtime_error("Invalid HDF5 signature");
        }

        size_t pos = sizeof kSignature;
        // Consumes `count` bytes as a little-endian integer, independent of
        // the host byte order, and refuses to read past the data.
        const auto take = [&](size_t count) -> uint64_t {
            if (count > data.size() - pos) {
                throw std::runtime_error("Truncated HDF5 superblock");
            }
            const uint64_t value = getUInt64(&data[pos], count);
            pos += count;
            return value;
        };
        // Offsets wider than 8 bytes cannot be represented in uint64_t.
        const auto takeOffsetSize = [&]() -> size_t {
            const auto size = static_cast<size_t>(take(1));
            if (size == 0 || size > 8) {
                throw std::runtime_error("Unsupported size of offsets");
            }
            return size;
        };

        std::map<std::string, uint64_t> decoded;
        const uint64_t version = take(1);
        decoded["Superblock Version"] = version;

        if (version == 0 || version == 1) {
            decoded["Free Space Version"] = take(1);
            decoded["Root Group Version"] = take(1);
            take(1);  // Reserved
            decoded["Shared Header Message Version"] = take(1);
            const size_t offsetSize = takeOffsetSize();
            decoded["Size of Offsets"] = offsetSize;
            decoded["Size of Lengths"] = take(1);
            take(1);  // Reserved
            decoded["Group Leaf Node K"] = take(2);
            decoded["Group Internal Node K"] = take(2);
            // Both version 0 and version 1 store 4 bytes of consistency flags.
            decoded["File Consistency Flags"] = take(4);
            if (version == 1) {
                // Version 1 inserts two extra 2-byte fields before the
                // addresses; skipping them would shift every address by 4.
                decoded["Indexed Storage Internal Node K"] = take(2);
                take(2);  // Reserved
            }
            decoded["Base Address"] = take(offsetSize);
            decoded["Free Space Address"] = take(offsetSize);
            decoded["End of File Address"] = take(offsetSize);
            decoded["Driver Info Address"] = take(offsetSize);
        } else if (version == 2 || version == 3) {
            const size_t offsetSize = takeOffsetSize();
            decoded["Size of Offsets"] = offsetSize;
            decoded["Size of Lengths"] = take(1);
            decoded["File Consistency Flags"] = take(1);
            decoded["Base Address"] = take(offsetSize);
            decoded["Superblock Extension Address"] = take(offsetSize);
            decoded["End of File Address"] = take(offsetSize);
            decoded["Root Group Object Header Address"] = take(offsetSize);
            decoded["Superblock Checksum"] = take(4);
        } else {
            throw std::runtime_error("Unsupported superblock version");
        }

        properties.swap(decoded);
    }

    /// Decodes `size` bytes at `ptr` as a little-endian unsigned integer.
    ///
    /// @throws std::runtime_error if `size` exceeds 8, since the value would
    ///         not fit and the shift would be undefined.
    uint64_t getUInt64(const uint8_t* ptr, size_t size) const {
        if (size > sizeof(uint64_t)) {
            throw std::runtime_error("Integer field wider than 8 bytes");
        }
        uint64_t val = 0;
        for (size_t i = 0; i < size; ++i) {
            val |= static_cast<uint64_t>(ptr[i]) << (i * 8);
        }
        return val;
    }

    /// Read-only access to the properties decoded by readAndDecode().
    const std::map<std::string, uint64_t>& getProperties() const {
        return properties;
    }

    /// Prints every decoded property as "name: value", one per line, in the
    /// map's key order.
    void printProperties() const {
        for (const auto& entry : properties) {
            std::cout << entry.first << ": " << entry.second << '\n';
        }
        std::cout.flush();
    }

    /// Writes a 48-byte version 3 superblock with example values. The result
    /// is NOT a complete HDF5 file: the checksum is a zero placeholder and no
    /// object data follows the superblock.
    ///
    /// @throws std::runtime_error if the file cannot be created or written.
    void writeMinimal(const std::string& outputPath) {
        std::ofstream file(outputPath, std::ios::binary);
        if (!file) {
            throw std::runtime_error("Cannot create file");
        }
        const uint8_t sig[8] = {137, 72, 68, 70, 13, 10, 26, 10};
        file.write(reinterpret_cast<const char*>(sig), sizeof sig);
        file.put(3);  // Version
        file.put(8);  // Offset size
        file.put(8);  // Length size
        file.put(0);  // Flags
        setUInt64(file, 0, 8);                      // Base addr
        setUInt64(file, 0xFFFFFFFFFFFFFFFFULL, 8);  // Ext addr (undefined)
        setUInt64(file, 1024, 8);                   // EOF
        setUInt64(file, 96, 8);                     // Root addr
        setUInt64(file, 0, 4);                      // Checksum placeholder
        file.flush();
        if (!file) {
            throw std::runtime_error("Cannot write file");
        }
    }

    /// Writes the low `size` bytes of `val` to `file` in little-endian order;
    /// bytes beyond the eighth are written as zero.
    void setUInt64(std::ofstream& file, uint64_t val, size_t size) {
        for (size_t i = 0; i < size; ++i) {
            // Shifting a 64-bit value by 64 or more is undefined.
            const uint64_t shifted = (i < sizeof(uint64_t)) ? (val >> (i * 8)) : 0;
            file.put(static_cast<char>(static_cast<uint8_t>(shifted)));
        }
    }
};