Task 112: .CRAI File Format

Task 112: .CRAI File Format

1. List of all properties of the .CRAI file format

The .CRAI file format is an external index for CRAM files, used for random access to slices in coordinate-sorted CRAM data. It is a gzipped tab-delimited text file where each line represents an index entry for a slice (or multiple entries for multi-reference slices). The intrinsic properties (fields) of each entry are:

  • Reference sequence ID: An integer identifying the reference sequence. (Inside the CRAM file itself this value is ITF8-encoded; in the index it is plain decimal text.) Unmapped slices typically use -1; for multi-reference slices, the entries carry the actual IDs taken from the RI data series.
  • Alignment start: An integer indicating the start position of the alignment (ignored for unmapped slices, recommended to be 0).
  • Alignment span: An integer indicating the length of the alignment (ignored for unmapped slices, recommended to be 0).
  • Absolute byte offset of container header: A 64-bit integer for the byte offset from the start of the CRAM file to the container header.
  • Relative byte offset of slice header block: An integer for the byte offset from the end of the container header to the slice header (matches the "landmark" in the container header).
  • Slice size in bytes: An integer for the total size of the slice, including header and blocks.

Additional format properties:

  • The file is gzipped (compressed with gzip).
  • Entries are tab-separated, with one line per indexed slice (or multiple for multi-reference slices).
  • The index covers all slices in the CRAM file for comprehensive random access.
  • Used for coordinate-sorted files; construction is lightweight, based on container headers, with optional reading of RI data series for multi-reference containers.

3. Ghost blog embedded HTML JavaScript for drag and drop .CRAI file dump

Below is a standalone HTML file with embedded JavaScript that allows drag-and-drop of a .CRAI file. It decompresses the gzip using the pako library (included via CDN for simplicity), parses the tab-delimited entries, and dumps all properties to the screen in a readable format. Save this as an HTML file and open in a browser.

CRAI File Dumper

Drag and Drop .CRAI File

Drop .CRAI file here

4. Python class for .CRAI handling

import gzip

class CRAIHandler:
    """In-memory view of a .crai index (gzip-compressed, tab-separated text).

    ``entries`` is a list of dicts, one per accepted index line, keyed by the
    names in ``_COLUMNS`` with integer values.
    """

    # Column order of one index line; these are also the keys of every entry.
    _COLUMNS = (
        'ref_id',
        'start',
        'span',
        'container_offset',
        'slice_offset',
        'slice_size',
    )

    # Display label for each column, in the same order as _COLUMNS.
    _LABELS = (
        'Reference sequence ID',
        'Alignment start',
        'Alignment span',
        'Absolute byte offset of container header',
        'Relative byte offset of slice header block',
        'Slice size in bytes',
    )

    def __init__(self, filepath=None):
        """Start empty, or load ``filepath`` right away when one is given."""
        self.entries = []
        if filepath:
            self.read(filepath)

    def read(self, filepath):
        """Replace ``entries`` with the contents of the gzipped index file.

        Lines that do not split into exactly six tab-separated fields are
        skipped; a six-field line with a non-integer field raises ValueError.
        Prints a one-line summary when done.
        """
        self.entries = []
        with gzip.open(filepath, 'rt') as stream:
            for raw in stream:
                columns = raw.strip().split('\t')
                if len(columns) != len(self._COLUMNS):
                    continue
                # zip + map converts lazily, field by field, in column order.
                self.entries.append(dict(zip(self._COLUMNS, map(int, columns))))
        print(f"Read {len(self.entries)} entries from {filepath}")

    def decode(self):
        """Print every entry; the text format needs no separate decode step."""
        self.print_properties()

    def write(self, filepath):
        """Write ``entries`` to ``filepath`` as gzipped, tab-separated lines.

        Prints a one-line summary when done.
        """
        with gzip.open(filepath, 'wt') as stream:
            for entry in self.entries:
                cells = [str(entry[key]) for key in self._COLUMNS]
                stream.write('\t'.join(cells) + '\n')
        print(f"Wrote {len(self.entries)} entries to {filepath}")

    def print_properties(self):
        """Print a labelled, 1-based listing of all entries to stdout."""
        for number, entry in enumerate(self.entries, 1):
            print(f"Entry {number}:")
            for label, key in zip(self._LABELS, self._COLUMNS):
                print(f"  {label}: {entry[key]}")
            print()

# Example usage:
# handler = CRAIHandler('example.crai')
# handler.decode()
# handler.write('output.crai')

5. Java class for .CRAI handling

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Reads, prints and writes .crai index files: gzip-compressed text with six
 * tab-separated integer columns per line.
 */
public class CRAIHandler {
    /** Number of tab-separated columns a line must have to be accepted. */
    private static final int FIELD_COUNT = 6;

    private List<Entry> entries = new ArrayList<>();

    /** One index line; fields are listed in column order. */
    static class Entry {
        long refId;
        long start;
        long span;
        long containerOffset;
        long sliceOffset;
        long sliceSize;

        Entry(long refId, long start, long span, long containerOffset, long sliceOffset, long sliceSize) {
            this.refId = refId;
            this.start = start;
            this.span = span;
            this.containerOffset = containerOffset;
            this.sliceOffset = sliceOffset;
            this.sliceSize = sliceSize;
        }
    }

    /** Creates an empty handler. */
    public CRAIHandler() {}

    /**
     * Creates a handler and immediately loads the given index file.
     *
     * @param filepath path of the gzipped index to read
     * @throws IOException if the file cannot be read or decompressed
     */
    public CRAIHandler(String filepath) throws IOException {
        read(filepath);
    }

    /** Converts six textual columns to an Entry, parsing left to right. */
    private static Entry toEntry(String[] columns) {
        long[] values = new long[FIELD_COUNT];
        for (int i = 0; i < FIELD_COUNT; i++) {
            values[i] = Long.parseLong(columns[i]);
        }
        return new Entry(values[0], values[1], values[2], values[3], values[4], values[5]);
    }

    /**
     * Replaces the current entries with those found in the gzipped index file.
     * Lines without exactly six tab-separated columns are skipped; a summary
     * line is printed to standard output afterwards.
     *
     * @param filepath path of the gzipped index to read
     * @throws IOException if the file cannot be read or decompressed
     * @throws NumberFormatException if a six-column line holds a non-integer
     */
    public void read(String filepath) throws IOException {
        entries.clear();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(
                        new GZIPInputStream(
                                new FileInputStream(filepath))))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String[] columns = line.trim().split("\t");
                if (columns.length != FIELD_COUNT) {
                    continue;
                }
                entries.add(toEntry(columns));
            }
        }
        System.out.println("Read " + entries.size() + " entries from " + filepath);
    }

    /** Prints all entries; the text format needs no separate decoding step. */
    public void decode() {
        printProperties();
    }

    /**
     * Writes the entries to a gzipped, tab-separated file and prints a summary
     * line to standard output.
     *
     * @param filepath destination path
     * @throws IOException if the file cannot be written
     */
    public void write(String filepath) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(
                        new GZIPOutputStream(
                                new FileOutputStream(filepath))))) {
            for (Entry e : entries) {
                StringBuilder row = new StringBuilder();
                row.append(e.refId).append('\t')
                   .append(e.start).append('\t')
                   .append(e.span).append('\t')
                   .append(e.containerOffset).append('\t')
                   .append(e.sliceOffset).append('\t')
                   .append(e.sliceSize).append('\n');
                writer.write(row.toString());
            }
        }
        System.out.println("Wrote " + entries.size() + " entries to " + filepath);
    }

    /** Prints a labelled, 1-based listing of every entry to standard output. */
    public void printProperties() {
        int number = 0;
        for (Entry e : entries) {
            number++;
            System.out.println("Entry " + number + ":");
            System.out.println("  Reference sequence ID: " + e.refId);
            System.out.println("  Alignment start: " + e.start);
            System.out.println("  Alignment span: " + e.span);
            System.out.println("  Absolute byte offset of container header: " + e.containerOffset);
            System.out.println("  Relative byte offset of slice header block: " + e.sliceOffset);
            System.out.println("  Slice size in bytes: " + e.sliceSize);
            System.out.println();
        }
    }

    // Example usage:
    // public static void main(String[] args) throws IOException {
    //     CRAIHandler handler = new CRAIHandler("example.crai");
    //     handler.decode();
    //     handler.write("output.crai");
    // }
}

6. JavaScript class for .CRAI handling

This class targets Node.js and relies only on its built-in 'fs' and 'zlib' modules, so no extra packages need to be installed.

const fs = require('fs');
const zlib = require('zlib');

/**
 * Reads, prints and writes .crai index files (gzip-compressed text with six
 * tab-separated integer columns per line).
 *
 * `entries` is an array of
 * `{ refId, start, span, containerOffset, sliceOffset, sliceSize }` objects
 * whose values are JavaScript numbers, so offsets above
 * Number.MAX_SAFE_INTEGER (2^53 - 1) cannot be represented exactly.
 */
class CRAIHandler {
    /**
     * @param {?string} filepath Optional index file to load immediately.
     */
    constructor(filepath = null) {
        this.entries = [];
        if (filepath) {
            this.read(filepath);
        }
    }

    /**
     * Replaces `entries` with the contents of the gzipped index file.
     *
     * Only lines made of exactly six integer columns are accepted. Blank
     * lines (including the single empty string an empty index splits into)
     * and malformed lines are skipped instead of becoming entries filled
     * with 0 / NaN / undefined.
     *
     * @param {string} filepath Path of the gzipped index to read.
     */
    read(filepath) {
        const text = zlib.gunzipSync(fs.readFileSync(filepath)).toString('utf-8');
        const integerPattern = /^-?\d+$/;
        const parsed = [];
        for (const rawLine of text.split('\n')) {
            // trim() also removes a trailing '\r' left by CRLF line endings.
            const fields = rawLine.trim().split('\t').map(field => field.trim());
            if (fields.length !== 6 || !fields.every(field => integerPattern.test(field))) {
                continue;
            }
            const [refId, start, span, containerOffset, sliceOffset, sliceSize] = fields.map(Number);
            parsed.push({ refId, start, span, containerOffset, sliceOffset, sliceSize });
        }
        this.entries = parsed;
        console.log(`Read ${this.entries.length} entries from ${filepath}`);
    }

    /** Prints all entries; the text format needs no separate decoding step. */
    decode() {
        this.printProperties();
    }

    /**
     * Writes `entries` to `filepath` as gzipped, tab-separated lines.
     *
     * @param {string} filepath Destination path.
     */
    write(filepath) {
        const data = this.entries
            .map(entry => [
                entry.refId,
                entry.start,
                entry.span,
                entry.containerOffset,
                entry.sliceOffset,
                entry.sliceSize,
            ].join('\t') + '\n')
            .join('');
        fs.writeFileSync(filepath, zlib.gzipSync(data));
        console.log(`Wrote ${this.entries.length} entries to ${filepath}`);
    }

    /** Prints a labelled, 1-based listing of every entry to the console. */
    printProperties() {
        this.entries.forEach((entry, index) => {
            console.log(`Entry ${index + 1}:`);
            console.log(`  Reference sequence ID: ${entry.refId}`);
            console.log(`  Alignment start: ${entry.start}`);
            console.log(`  Alignment span: ${entry.span}`);
            console.log(`  Absolute byte offset of container header: ${entry.containerOffset}`);
            console.log(`  Relative byte offset of slice header block: ${entry.sliceOffset}`);
            console.log(`  Slice size in bytes: ${entry.sliceSize}`);
            console.log('');
        });
    }
}

// Example usage:
// const handler = new CRAIHandler('example.crai');
// handler.decode();
// handler.write('output.crai');

7. C++ class for .CRAI handling

#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <zlib.h>  // Requires zlib library

/// One line of a .crai index; members are listed in column order.
///
/// Every member defaults to 0 so a default-constructed entry never holds
/// indeterminate values. The struct is still an aggregate, so
/// `CRAIEntry{a, b, c, d, e, f}` keeps working.
struct CRAIEntry {
    long long ref_id = 0;            ///< Reference sequence ID
    long long start = 0;             ///< Alignment start
    long long span = 0;              ///< Alignment span
    long long container_offset = 0;  ///< Absolute byte offset of container header
    long long slice_offset = 0;      ///< Relative byte offset of slice header block
    long long slice_size = 0;        ///< Slice size in bytes
};

class CRAIHandler {
private:
    std::vector<CRAIEntry> entries;

    std::string readGzippedFile(const std::string& filepath) {
        std::ifstream file(filepath, std::ios::binary);
        if (!file) {
            throw std::runtime_error("Cannot open file: " + filepath);
        }
        std::vector<char> compressed(std::istreambuf_iterator<char>(file), {});
        file.close();

        z_stream zs;
        memset(&zs, 0, sizeof(zs));
        if (inflateInit2(&zs, 16 + MAX_WBITS) != Z_OK) {
            throw std::runtime_error("inflateInit failed");
        }
        zs.next_in = reinterpret_cast<Bytef*>(compressed.data());
        zs.avail_in = compressed.size();

        std::string decompressed;
        char buffer[1024];
        int ret;
        do {
            zs.next_out = reinterpret_cast<Bytef*>(buffer);
            zs.avail_out = sizeof(buffer);
            ret = inflate(&zs, 0);
            if (decompressed.size() < zs.total_out) {
                decompressed.append(buffer, zs.total_out - decompressed.size());
            }
        } while (ret == Z_OK);
        inflateEnd(&zs);

        if (ret != Z_STREAM_END) {
            throw std::runtime_error("inflate failed");
        }
        return decompressed;
    }

    void writeGzippedFile(const std::string& filepath, const std::string& data) {
        std::ofstream file(filepath, std::ios::binary);
        if (!file) {
            throw std::runtime_error("Cannot open file for writing: " + filepath);
        }

        z_stream zs;
        memset(&zs, 0, sizeof(zs));
        if (deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 15 | 16, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
            throw std::runtime_error("deflateInit failed");
        }
        zs.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(data.data()));
        zs.avail_in = data.size();

        char buffer[1024];
        int ret;
        do {
            zs.next_out = reinterpret_cast<Bytef*>(buffer);
            zs.avail_out = sizeof(buffer);
            ret = deflate(&zs, Z_FINISH);
            file.write(buffer, sizeof(buffer) - zs.avail_out);
        } while (ret == Z_OK);
        deflateEnd(&zs);

        if (ret != Z_STREAM_END) {
            throw std::runtime_error("deflate failed");
        }
        file.close();
    }

public:
    CRAIHandler(const std::string& filepath = "") {
        if (!filepath.empty()) {
            read(filepath);
        }
    }

    void read(const std::string& filepath) {
        entries.clear();
        std::string decompressed = readGzippedFile(filepath);
        std::istringstream iss(decompressed);
        std::string line;
        while (std::getline(iss, line)) {
            std::istringstream lineStream(line);
            CRAIEntry entry;
            char delim;
            lineStream >> entry.ref_id >> delim >> entry.start >> delim >> entry.span >> delim >>
                          entry.container_offset >> delim >> entry.slice_offset >> delim >> entry.slice_size;
            if (lineStream) {
                entries.push_back(entry);
            }
        }
        std::cout << "Read " << entries.size() << " entries from " << filepath << std::endl;
    }

    void decode() {
        printProperties();
    }

    void write(const std::string& filepath) {
        std::ostringstream oss;
        for (const auto& entry : entries) {
            oss << entry.ref_id << "\t" << entry.start << "\t" << entry.span << "\t"
                << entry.container_offset << "\t" << entry.slice_offset << "\t" << entry.slice_size << "\n";
        }
        writeGzippedFile(filepath, oss.str());
        std::cout << "Wrote " << entries.size() << " entries to " << filepath << std::endl;
    }

    void printProperties() {
        for (size_t i = 0; i < entries.size(); ++i) {
            const auto& entry = entries[i];
            std::cout << "Entry " << (i + 1) << ":" << std::endl;
            std::cout << "  Reference sequence ID: " << entry.ref_id << std::endl;
            std::cout << "  Alignment start: " << entry.start << std::endl;
            std::cout << "  Alignment span: " << entry.span << std::endl;
            std::cout << "  Absolute byte offset of container header: " << entry.container_offset << std::endl;
            std::cout << "  Relative byte offset of slice header block: " << entry.slice_offset << std::endl;
            std::cout << "  Slice size in bytes: " << entry.slice_size << std::endl;
            std::cout << std::endl;
        }
    }
};

// Example usage:
// int main() {
//     CRAIHandler handler("example.crai");
//     handler.decode();
//     handler.write("output.crai");
//     return 0;
// }