Task 232: .FNA File Format

Task 232: .FNA File Format

The .FNA file format is a text-based FASTA format specifically used for representing nucleic acid (nucleotide) sequences, often in bioinformatics contexts such as genomic data from databases like NCBI. It is not a binary format and has no strict "file system" intrinsics beyond standard text file attributes, but its format-specific properties are derived from the FASTA specification.

Based on the FASTA specification for nucleotide sequences (as .FNA files adhere to this without unique extensions beyond denoting nucleic acids), the intrinsic properties of the format include:

  • Text-based (ASCII-compatible) structure, allowing manipulation with standard text tools.
  • Each sequence's residues may be wrapped across multiple lines or written on a single line; parsers concatenate all lines up to the next header.
  • Header (defline) lines start with a '>' character, followed by an identifier (unique ID, possibly with database prefixes like 'gi|' or 'ref|') and an optional space-separated description.
  • Sequence data lines immediately follow the header, consisting of single-letter nucleotide codes, typically wrapped at 80 characters per line (though up to 120 is historically allowed; no strict enforcement).
  • Allowed characters in sequences: A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N, - (gap); case-insensitive (lowercase mapped to uppercase); invalid characters like digits or spaces are ignored or may cause parsing errors.
  • Supports multiple sequences per file (multi-FASTA), with each new sequence starting on a new '>' line.
  • No formal version number, magic number, or binary header; identification relies on content structure.
  • Gaps represented by '-' for alignments; no built-in compression or encryption.
  • File extension typically .fna to indicate nucleotide content (distinguishing from .faa for amino acids or other variants).

Two direct download links for .FNA files (these are gzipped for efficiency, as is common for large genomic data; uncompress to get the raw .fna file):

Below is a standalone HTML page with embedded JavaScript that can be hosted on a Ghost blog (or any static site) as an embedded snippet. It allows drag-and-drop of a .FNA file and dumps the parsed properties to the screen (number of sequences, and for each: identifier, description, sequence length, and full sequence if small; truncated for large sequences to avoid overwhelming the display). It uses the FileReader API for browser-based parsing.

FNA File Parser
Drag and drop .FNA file here
  1. Below is a Python class that can open, parse (decode), and write (encode back to .FNA format) a file, and print to the console both the format-wide properties listed in part 1 and the file-specific details parsed from the file (number of sequences and, for each sequence, its identifier, description, and length).
import os

class FNAHandler:
    """Read, inspect, and write nucleotide FASTA (.fna) files.

    Parsed records are stored in ``self.sequences`` as dicts with the keys
    ``'identifier'``, ``'description'`` and ``'sequence'`` (upper-cased residues
    with all line breaks removed).
    """

    # Number of residues written per sequence line by write().
    LINE_WIDTH = 80

    def __init__(self, filepath):
        """Remember the input path; nothing is read until read_and_decode()."""
        self.filepath = filepath
        self.sequences = []
        self.static_properties = [
            "Text-based (ASCII-compatible) structure",
            "Header lines start with '>'",
            "Identifier after '>', space-separated from description",
            "Sequence lines follow header, typically <=80 chars per line",
            "Allowed characters: A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N, -",
            "Case-insensitive (lowercase mapped to uppercase)",
            "Supports multiple sequences",
            "No version number or binary header"
        ]

    def read_and_decode(self):
        """Parse ``self.filepath`` and replace ``self.sequences`` with its records.

        Blank lines and any text before the first '>' header are ignored.
        A header with nothing after '>' yields an empty identifier instead of
        raising. Calling this method again re-parses the file rather than
        appending duplicate records.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        records = []
        current = None
        chunks = []  # sequence lines of the current record, joined once at the end
        with open(self.filepath, 'r') as f:
            # Stream line by line so large genome files are not held in memory twice.
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                if line.startswith('>'):
                    if current is not None:
                        current['sequence'] = ''.join(chunks)
                        records.append(current)
                    # split() with no separator skips leading whitespace and
                    # returns [] for an empty header, so guard the indexing.
                    parts = line[1:].split(maxsplit=1)
                    current = {
                        'identifier': parts[0] if parts else '',
                        'description': parts[1] if len(parts) > 1 else '',
                        'sequence': '',
                    }
                    chunks = []
                elif current is not None:
                    chunks.append(line.upper())
        if current is not None:
            current['sequence'] = ''.join(chunks)
            records.append(current)
        self.sequences = records

    def print_properties(self):
        """Print the format-wide properties, then a summary of each parsed record."""
        print("Static Format Properties:")
        for prop in self.static_properties:
            print(f"- {prop}")
        print("\nFile-Specific Parsed Properties:")
        print(f"Number of sequences: {len(self.sequences)}")
        for i, seq in enumerate(self.sequences, 1):
            print(f"Sequence {i}:")
            print(f"  Identifier: {seq['identifier']}")
            print(f"  Description: {seq['description']}")
            print(f"  Length: {len(seq['sequence'])}")

    def write(self, output_path):
        """Write all parsed records to ``output_path`` in FASTA format.

        Sequence data is wrapped at LINE_WIDTH characters per line. The
        description (and its separating space) is omitted when empty so that
        headers do not end in trailing whitespace.

        Raises:
            OSError: if the output file cannot be opened or written.
        """
        width = self.LINE_WIDTH
        with open(output_path, 'w') as f:
            for seq in self.sequences:
                header = seq['identifier']
                if seq['description']:
                    header = f"{header} {seq['description']}"
                f.write(f">{header}\n")
                seq_str = seq['sequence']
                for i in range(0, len(seq_str), width):
                    f.write(seq_str[i:i + width] + '\n')

# Example usage:
# handler = FNAHandler('example.fna')
# handler.read_and_decode()
# handler.print_properties()
# handler.write('output.fna')
  1. Below is a Java class that provides the same functionality: opening, parsing (decoding), and writing (encoding) a .FNA file, and printing the properties to the console.
import java.io.*;
import java.util.ArrayList;
import java.util.List;

public class FNAHandler {
    private String filepath;
    private List<Sequence> sequences = new ArrayList<>();
    private String[] staticProperties = {
        "Text-based (ASCII-compatible) structure",
        "Header lines start with '>'",
        "Identifier after '>', space-separated from description",
        "Sequence lines follow header, typically <=80 chars per line",
        "Allowed characters: A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N, -",
        "Case-insensitive (lowercase mapped to uppercase)",
        "Supports multiple sequences",
        "No version number or binary header"
    };

    static class Sequence {
        String identifier;
        String description;
        String sequence;
    }

    public FNAHandler(String filepath) {
        this.filepath = filepath;
    }

    public void readAndDecode() throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(filepath))) {
            String line;
            Sequence currentSeq = null;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.startsWith(">")) {
                    if (currentSeq != null) {
                        sequences.add(currentSeq);
                    }
                    String header = line.substring(1).trim();
                    String[] idDesc = header.split("\\s+", 2);
                    currentSeq = new Sequence();
                    currentSeq.identifier = idDesc[0];
                    currentSeq.description = idDesc.length > 1 ? idDesc[1] : "";
                    currentSeq.sequence = "";
                } else if (currentSeq != null && !line.isEmpty()) {
                    currentSeq.sequence += line.toUpperCase();
                }
            }
            if (currentSeq != null) {
                sequences.add(currentSeq);
            }
        }
    }

    public void printProperties() {
        System.out.println("Static Format Properties:");
        for (String prop : staticProperties) {
            System.out.println("- " + prop);
        }
        System.out.println("\nFile-Specific Parsed Properties:");
        System.out.println("Number of sequences: " + sequences.size());
        for (int i = 0; i < sequences.size(); i++) {
            Sequence seq = sequences.get(i);
            System.out.println("Sequence " + (i + 1) + ":");
            System.out.println("  Identifier: " + seq.identifier);
            System.out.println("  Description: " + seq.description);
            System.out.println("  Length: " + seq.sequence.length());
        }
    }

    public void write(String outputPath) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputPath))) {
            for (Sequence seq : sequences) {
                writer.write(">" + seq.identifier + " " + seq.description + "\n");
                String seqStr = seq.sequence;
                for (int i = 0; i < seqStr.length(); i += 80) {
                    writer.write(seqStr.substring(i, Math.min(i + 80, seqStr.length())) + "\n");
                }
            }
        }
    }

    // Example usage:
    // public static void main(String[] args) throws IOException {
    //     FNAHandler handler = new FNAHandler("example.fna");
    //     handler.readAndDecode();
    //     handler.printProperties();
    //     handler.write("output.fna");
    // }
}
  1. Below is a JavaScript class (Node.js compatible, using 'fs' module) for the same functionality: open, decode/parse, read, write/encode, and print to console.
const fs = require('fs');

/**
 * Reads, inspects, and writes nucleotide FASTA (.fna) files.
 *
 * Parsed records are stored in `this.sequences` as objects of the form
 * `{ identifier, description, sequence }`, where `sequence` holds the
 * upper-cased residues with all line breaks removed.
 */
class FNAHandler {
    /**
     * @param {string} filepath Path of the .fna file; not read until readAndDecode().
     */
    constructor(filepath) {
        this.filepath = filepath;
        this.sequences = [];
        this.staticProperties = [
            'Text-based (ASCII-compatible) structure',
            'Header lines start with \'>\'',
            'Identifier after \'>\', space-separated from description',
            'Sequence lines follow header, typically <=80 chars per line',
            'Allowed characters: A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N, -',
            'Case-insensitive (lowercase mapped to uppercase)',
            'Supports multiple sequences',
            'No version number or binary header'
        ];
    }

    /**
     * Parses the file synchronously and replaces `this.sequences`.
     *
     * Blank lines and text before the first '>' header are ignored. Calling
     * this method again re-parses the file instead of appending duplicates.
     *
     * @throws {Error} whatever fs.readFileSync throws if the file is unreadable.
     */
    readAndDecode() {
        const content = fs.readFileSync(this.filepath, 'utf8');
        const parsed = [];
        let currentSeq = null;
        // Accept LF, CRLF and bare CR line endings.
        for (const rawLine of content.split(/\r\n?|\n/)) {
            const line = rawLine.trim();
            if (!line) continue;
            if (line.startsWith('>')) {
                const header = line.slice(1).trim();
                // Split at the first whitespace run only, so whitespace inside
                // the description is preserved verbatim. Every part of the
                // pattern is optional, so the match cannot fail.
                const [, identifier, description] = header.match(/^(\S*)\s*([\s\S]*)$/);
                currentSeq = { identifier, description, sequence: '' };
                parsed.push(currentSeq);
            } else if (currentSeq) {
                currentSeq.sequence += line.toUpperCase();
            }
        }
        this.sequences = parsed;
    }

    /** Prints the format-wide properties followed by a summary of each parsed record. */
    printProperties() {
        console.log('Static Format Properties:');
        this.staticProperties.forEach(prop => console.log(`- ${prop}`));
        console.log('\nFile-Specific Parsed Properties:');
        console.log(`Number of sequences: ${this.sequences.length}`);
        this.sequences.forEach((seq, index) => {
            console.log(`Sequence ${index + 1}:`);
            console.log(`  Identifier: ${seq.identifier}`);
            console.log(`  Description: ${seq.description}`);
            console.log(`  Length: ${seq.sequence.length}`);
        });
    }

    /**
     * Writes all parsed records to `outputPath` in FASTA format, wrapping
     * sequence data at 80 characters per line.
     *
     * The description and its separating space are omitted when the
     * description is empty, so headers never end in trailing whitespace.
     *
     * @param {string} outputPath Destination file; overwritten if it exists.
     */
    write(outputPath) {
        const LINE_WIDTH = 80;
        const outLines = [];
        for (const seq of this.sequences) {
            outLines.push(seq.description
                ? `>${seq.identifier} ${seq.description}`
                : `>${seq.identifier}`);
            const seqStr = seq.sequence;
            for (let i = 0; i < seqStr.length; i += LINE_WIDTH) {
                outLines.push(seqStr.slice(i, i + LINE_WIDTH));
            }
        }
        // Every line, including the last, is newline-terminated; no records -> empty file.
        fs.writeFileSync(outputPath, outLines.map(l => l + '\n').join(''));
    }
}

// Example usage:
// const handler = new FNAHandler('example.fna');
// handler.readAndDecode();
// handler.printProperties();
// handler.write('output.fna');
  1. Below is a C++ class (standard C has no classes, so C++ is used for the object-oriented structure) that provides the same functionality: opening, parsing (decoding), and writing (encoding) a .FNA file, and printing the properties to the console (stdout).
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <algorithm>

/**
 * Reads, inspects, and writes nucleotide FASTA (.fna) files.
 *
 * Each parsed record keeps the header split into identifier and description,
 * plus the residues upper-cased (ASCII) with all line breaks removed.
 */
class FNAHandler {
private:
    /// Number of residues written per sequence line by write().
    static constexpr size_t kLineWidth = 80;

    std::string filepath;
    struct Sequence {
        std::string identifier;
        std::string description;
        std::string sequence;
    };
    std::vector<Sequence> sequences;
    std::vector<std::string> staticProperties = {
        "Text-based (ASCII-compatible) structure",
        "Header lines start with '>'",
        "Identifier after '>', space-separated from description",
        "Sequence lines follow header, typically <=80 chars per line",
        "Allowed characters: A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N, -",
        "Case-insensitive (lowercase mapped to uppercase)",
        "Supports multiple sequences",
        "No version number or binary header"
    };

    /// Returns `text` without leading/trailing ASCII whitespace. The set
    /// includes '\r' so that CRLF files do not leak carriage returns into
    /// headers or sequence data.
    static std::string trim(const std::string& text) {
        const char* const whitespace = " \t\r\n\v\f";
        const size_t first = text.find_first_not_of(whitespace);
        if (first == std::string::npos) {
            return std::string();
        }
        const size_t last = text.find_last_not_of(whitespace);
        return text.substr(first, last - first + 1);
    }

    /// Upper-cases ASCII letters in place. Done by hand rather than with
    /// ::toupper, which is undefined for negative char values.
    static void toUpperAscii(std::string& text) {
        for (char& c : text) {
            if (c >= 'a' && c <= 'z') {
                c = static_cast<char>(c - 'a' + 'A');
            }
        }
    }

public:
    /// @param fp path of the .fna file; not opened until readAndDecode().
    FNAHandler(const std::string& fp) : filepath(fp) {}

    /**
     * Parses the file and replaces any previously parsed records.
     *
     * Blank lines and text before the first '>' header are ignored. If the
     * file cannot be opened, an error is printed to stderr and the previously
     * parsed records are left untouched. Calling this again re-parses the file
     * instead of appending duplicates.
     */
    void readAndDecode() {
        std::ifstream file(filepath);
        if (!file) {
            std::cerr << "Error opening file." << std::endl;
            return;
        }
        sequences.clear();
        std::string rawLine;
        while (std::getline(file, rawLine)) {
            std::string line = trim(rawLine);
            if (line.empty()) continue;
            if (line[0] == '>') {
                // Trim after removing '>' so "> id desc" still yields id "id".
                const std::string header = trim(line.substr(1));
                Sequence record;
                const size_t idEnd = header.find_first_of(" \t");
                if (idEnd == std::string::npos) {
                    record.identifier = header;
                } else {
                    record.identifier = header.substr(0, idEnd);
                    // Skip the whole whitespace run between id and description.
                    const size_t descStart = header.find_first_not_of(" \t", idEnd);
                    if (descStart != std::string::npos) {
                        record.description = header.substr(descStart);
                    }
                }
                // Store the record now and append residues in place, avoiding
                // a copy of the full sequence when the next header arrives.
                sequences.push_back(record);
            } else if (!sequences.empty()) {
                toUpperAscii(line);
                sequences.back().sequence += line;
            }
        }
    }

    /// Prints the format-wide properties, then a summary of each parsed record, to stdout.
    void printProperties() const {
        std::cout << "Static Format Properties:" << '\n';
        for (const auto& prop : staticProperties) {
            std::cout << "- " << prop << '\n';
        }
        std::cout << "\nFile-Specific Parsed Properties:" << '\n';
        std::cout << "Number of sequences: " << sequences.size() << '\n';
        for (size_t i = 0; i < sequences.size(); ++i) {
            const auto& seq = sequences[i];
            std::cout << "Sequence " << (i + 1) << ":" << '\n';
            std::cout << "  Identifier: " << seq.identifier << '\n';
            std::cout << "  Description: " << seq.description << '\n';
            std::cout << "  Length: " << seq.sequence.length() << '\n';
        }
        std::cout.flush();
    }

    /**
     * Writes all parsed records to `outputPath` in FASTA format, wrapping
     * sequence data at kLineWidth characters per line.
     *
     * The description and its separating space are omitted when the
     * description is empty, so headers never end in trailing whitespace.
     * If the file cannot be opened, an error is printed to stderr.
     */
    void write(const std::string& outputPath) const {
        std::ofstream outFile(outputPath);
        if (!outFile) {
            std::cerr << "Error opening output file." << std::endl;
            return;
        }
        for (const auto& seq : sequences) {
            outFile << '>' << seq.identifier;
            if (!seq.description.empty()) {
                outFile << ' ' << seq.description;
            }
            outFile << '\n';
            const std::string& seqStr = seq.sequence;
            for (size_t i = 0; i < seqStr.length(); i += kLineWidth) {
                // Write straight from the buffer instead of allocating a substring per line.
                const size_t count = seqStr.length() - i < kLineWidth ? seqStr.length() - i : kLineWidth;
                outFile.write(seqStr.data() + i, static_cast<std::streamsize>(count));
                outFile << '\n';
            }
        }
    }
};

// Example usage:
// int main() {
//     FNAHandler handler("example.fna");
//     handler.readAndDecode();
//     handler.printProperties();
//     handler.write("output.fna");
//     return 0;
// }