Task 224: .FFN File Format
Task 224: .FFN File Format
File Format Specifications for the .FFN File Format
The .FFN file format is an extension used for FASTA files specifically containing nucleotide sequences of annotated features (e.g., coding sequences or genes) from genomic data, commonly in bioinformatics contexts (e.g., from NCBI or genome annotation tools). It follows the standard FASTA format, which is a text-based format for representing nucleotide or protein sequences. The specifications are as per the FASTA format definition, as described in sources like Wikipedia and NCBI documentation:
The file is ASCII text.
Each sequence starts with a header line beginning with a '>' character, followed by a unique identifier (SeqID) and optional description, ended by a newline.
Following the header are one or more lines of sequence data, typically wrapped at 60-80 characters per line for readability, but this is not strict.
Sequence characters are from the IUB/IUPAC nucleotide codes (A, C, G, T, U, N, R, Y, K, M, S, W, B, D, H, V, -).
Multiple sequences can be included in one file, each starting with its own header.
No fixed header or footer; the file ends after the last sequence.
No byte order mark or magic number; the format is identified by its content (the first line starts with '>') and by the file extension.
There is no formal binary structure, as it is a plain text format. For NCBI-specific .FFN files, the header often includes locus tag, gene name, product, and location in the format: >locus_tag [gene=gene_name] [product=product_description] [location=position].
- List of all the properties of this file format intrinsic to its file system:
File type: Text (ASCII/UTF-8)
Extension: .ffn
MIME type: text/plain or application/x-fasta
Structure: Unstructured text with alternating headers and sequence lines
Header prefix: '>'
Header content: Sequence identifier (mandatory) and description (optional), separated by space
Sequence data: Lines of nucleotide characters, no specific length requirement but typically wrapped
Allowed characters in sequence: A, C, G, T, U, N, R, Y, K, M, S, W, B, D, H, V, - (case-insensitive, usually upper)
Multi-sequence support: Yes
Line endings: CR/LF or LF (platform-dependent, but format is tolerant)
No binary components, no endianness, no magic bytes
Intrinsic file system properties: Regular file (not directory, not symlink), readable as text, size variable based on content, creation/modified dates as per file system
- Two direct download links for files of format .FFN:
https://raw.githubusercontent.com/phac-nml/ecoli_vf/master/data/repaired_ecoli_vfs.ffn (E. coli virulence factors in FASTA format)
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/725/GCF_000007725.1_ASM772v1/GCF_000007725.1_ASM772v1_cds_from_genomic.fna (Nanoarchaeum equitans CDS nucleotide sequences; note: NCBI uses .fna for the same FASTA content as traditional .ffn)
- Ghost blog embedded HTML JavaScript for drag and drop to dump properties:
- Python class for .FFN file:
import os
class FFNHandler:
    """Read, summarise and write .ffn (nucleotide FASTA) files.

    ``properties`` holds a fixed description of the format; ``read`` adds
    ``numSequences`` and ``totalSequenceLength`` for the file it parsed.
    """

    # Number of sequence characters written per output line.
    LINE_WIDTH = 80

    def __init__(self, filepath):
        """Remember *filepath* and fill in the static format description."""
        self.filepath = filepath
        self.properties = {
            'fileType': 'Text (ASCII/UTF-8)',
            'extension': '.ffn',
            'mimeType': 'text/plain or application/x-fasta',
            'structure': 'Unstructured text with alternating headers and sequence lines',
            'headerPrefix': '>',
            'headerContent': 'Sequence identifier (mandatory) and description (optional)',
            'sequenceData': 'Lines of nucleotide characters',
            'allowedCharacters': 'A, C, G, T, U, N, R, Y, K, M, S, W, B, D, H, V, -',
            'multiSequenceSupport': 'Yes',
            'lineEndings': 'CR/LF or LF',
            'binaryComponents': 'No',
            'endianness': 'N/A',
            'magicBytes': 'No',
            'intrinsicFileSystemProperties': 'Regular file, readable as text, size variable, creation/modified dates per FS',
        }

    def read(self):
        """Parse the file and return a list of ``(header, sequence)`` tuples.

        The header is the text after ``>`` with surrounding whitespace
        removed; the sequence is the concatenation of the stripped lines
        that follow it. Text before the first header is ignored. Also
        stores ``numSequences`` and ``totalSequenceLength`` in
        ``self.properties``.

        Raises:
            OSError: if the file cannot be opened.
        """
        sequences = []
        # None (not '') marks "no header seen yet", so a record whose header
        # line is a bare '>' is still kept instead of being dropped.
        header = None
        parts = []
        with open(self.filepath, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('>'):
                    if header is not None:
                        sequences.append((header, ''.join(parts)))
                    header = line[1:].strip()
                    parts = []
                elif header is not None:
                    parts.append(line.strip())
        if header is not None:
            sequences.append((header, ''.join(parts)))

        self.properties['numSequences'] = len(sequences)
        self.properties['totalSequenceLength'] = sum(len(seq) for _, seq in sequences)
        return sequences

    def print_properties(self):
        """Print every entry of ``self.properties`` as ``key: value``."""
        for key, value in self.properties.items():
            print(f"{key}: {value}")

    def write(self, sequences, new_filepath=None):
        """Write *sequences* as FASTA, wrapping sequence data at 80 columns.

        Args:
            sequences: iterable of ``(header, sequence)`` pairs.
            new_filepath: destination path; when omitted (or falsy) the
                handler's own ``filepath`` is overwritten.
        """
        filepath = new_filepath or self.filepath
        width = self.LINE_WIDTH
        with open(filepath, 'w', encoding='utf-8') as f:
            for header, seq in sequences:
                f.write(f'>{header}\n')
                f.writelines(
                    seq[i:i + width] + '\n' for i in range(0, len(seq), width)
                )
# Example usage: parse example.ffn, show its properties, then re-emit it.
if __name__ == '__main__':
    ffn = FFNHandler('example.ffn')
    records = ffn.read()
    ffn.print_properties()
    # Write the parsed records back out to a second file.
    ffn.write(records, 'output.ffn')
- Java class for .FFN file:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Reads, summarises and writes .ffn (nucleotide FASTA) files.
 *
 * <p>The property map holds a fixed description of the format; {@link #read()}
 * adds {@code numSequences} and {@code totalSequenceLength} for the parsed file.
 */
public class FFNHandler {
    /** Number of sequence characters written per output line. */
    private static final int LINE_WIDTH = 80;

    private String filepath;
    private Map<String, Object> properties = new HashMap<>();

    /**
     * Creates a handler for the given path and fills in the static format description.
     *
     * @param filepath path of the file that {@link #read()} will parse
     */
    public FFNHandler(String filepath) {
        this.filepath = filepath;
        properties.put("fileType", "Text (ASCII/UTF-8)");
        properties.put("extension", ".ffn");
        properties.put("mimeType", "text/plain or application/x-fasta");
        properties.put("structure", "Unstructured text with alternating headers and sequence lines");
        properties.put("headerPrefix", ">");
        properties.put("headerContent", "Sequence identifier (mandatory) and description (optional)");
        properties.put("sequenceData", "Lines of nucleotide characters");
        properties.put("allowedCharacters", "A, C, G, T, U, N, R, Y, K, M, S, W, B, D, H, V, -");
        properties.put("multiSequenceSupport", "Yes");
        properties.put("lineEndings", "CR/LF or LF");
        properties.put("binaryComponents", "No");
        properties.put("endianness", "N/A");
        properties.put("magicBytes", "No");
        properties.put("intrinsicFileSystemProperties", "Regular file, readable as text, size variable, creation/modified dates per FS");
    }

    /**
     * Parses the file into records.
     *
     * <p>Each record is a two-element array: index 0 is the header text after
     * {@code '>'} (trimmed), index 1 is the concatenation of the trimmed lines
     * that follow it. Text before the first header is ignored.
     *
     * @return the records in file order
     * @throws IOException if the file cannot be opened or read
     */
    public List<String[]> read() throws IOException {
        List<String[]> sequences = new ArrayList<>();
        // null (not "") marks "no header seen yet", so a record whose header
        // line is a bare '>' is still kept instead of being dropped.
        String currentHeader = null;
        StringBuilder currentSeq = new StringBuilder();

        try (BufferedReader br = new BufferedReader(new FileReader(filepath))) {
            String line;
            // The assignment must be parenthesised: != binds tighter than =.
            while ((line = br.readLine()) != null) {
                if (line.startsWith(">")) {
                    if (currentHeader != null) {
                        sequences.add(new String[]{currentHeader, currentSeq.toString()});
                    }
                    currentHeader = line.substring(1).trim();
                    currentSeq = new StringBuilder();
                } else if (currentHeader != null) {
                    currentSeq.append(line.trim());
                }
            }
        }
        if (currentHeader != null) {
            sequences.add(new String[]{currentHeader, currentSeq.toString()});
        }

        properties.put("numSequences", sequences.size());
        properties.put("totalSequenceLength", sequences.stream().mapToInt(s -> s[1].length()).sum());
        return sequences;
    }

    /** Prints every property as {@code key: value}; order follows the map's iteration order. */
    public void printProperties() {
        properties.forEach((key, value) -> System.out.println(key + ": " + value));
    }

    /**
     * Writes the records as FASTA, wrapping sequence data at 80 columns.
     *
     * @param sequences   records as returned by {@link #read()}
     * @param newFilepath destination path, or {@code null} to overwrite the handler's own file
     * @throws IOException if the destination cannot be written
     */
    public void write(List<String[]> sequences, String newFilepath) throws IOException {
        String path = newFilepath != null ? newFilepath : filepath;
        try (FileWriter fw = new FileWriter(path)) {
            for (String[] seq : sequences) {
                fw.write(">" + seq[0] + "\n");
                String s = seq[1];
                for (int i = 0; i < s.length(); i += LINE_WIDTH) {
                    fw.write(s.substring(i, Math.min(i + LINE_WIDTH, s.length())) + "\n");
                }
            }
        }
    }

    /** Example: parse example.ffn, print its properties and copy it to output.ffn. */
    public static void main(String[] args) throws IOException {
        FFNHandler handler = new FFNHandler("example.ffn");
        List<String[]> sequences = handler.read();
        handler.printProperties();
        handler.write(sequences, "output.ffn");
    }
}
- JavaScript class for .FFN file:
/**
 * Reads, summarises and writes .ffn (nucleotide FASTA) files in the browser.
 *
 * `properties` holds a fixed description of the format; `read()` adds
 * `numSequences` and `totalSequenceLength` for the file it parsed.
 */
class FFNHandler {
  /**
   * @param {string} filepath URL (or relative path) passed to `fetch` by `read()`.
   */
  constructor(filepath) {
    this.filepath = filepath;
    this.properties = {
      fileType: 'Text (ASCII/UTF-8)',
      extension: '.ffn',
      mimeType: 'text/plain or application/x-fasta',
      structure: 'Unstructured text with alternating headers and sequence lines',
      headerPrefix: '>',
      headerContent: 'Sequence identifier (mandatory) and description (optional)',
      sequenceData: 'Lines of nucleotide characters',
      allowedCharacters: 'A, C, G, T, U, N, R, Y, K, M, S, W, B, D, H, V, -',
      multiSequenceSupport: 'Yes',
      lineEndings: 'CR/LF or LF',
      binaryComponents: 'No',
      endianness: 'N/A',
      magicBytes: 'No',
      intrinsicFileSystemProperties: 'Regular file, readable as text, size variable, creation/modified dates per FS'
    };
  }

  /**
   * Parse FASTA text into `{header, seq}` records.
   * The header is the trimmed text after '>'; the sequence is the
   * concatenation of the trimmed lines that follow. Text before the first
   * header is ignored.
   * @param {string} content
   * @returns {{header: string, seq: string}[]}
   */
  static parseFasta(content) {
    const sequences = [];
    // null (not '') marks "no header seen yet", so a record whose header
    // line is a bare '>' is still kept instead of being dropped.
    let currentHeader = null;
    let parts = [];
    for (const line of content.split(/\r?\n/)) {
      if (line.startsWith('>')) {
        if (currentHeader !== null) {
          sequences.push({header: currentHeader, seq: parts.join('')});
        }
        currentHeader = line.slice(1).trim();
        parts = [];
      } else if (currentHeader !== null) {
        parts.push(line.trim());
      }
    }
    if (currentHeader !== null) {
      sequences.push({header: currentHeader, seq: parts.join('')});
    }
    return sequences;
  }

  /**
   * Serialise records as FASTA text, wrapping sequence data at 80 columns.
   * @param {{header: string, seq: string}[]} sequences
   * @returns {string}
   */
  static formatFasta(sequences) {
    const lines = [];
    for (const s of sequences) {
      lines.push(`>${s.header}\n`);
      for (let i = 0; i < s.seq.length; i += 80) {
        lines.push(s.seq.slice(i, i + 80) + '\n');
      }
    }
    return lines.join('');
  }

  /**
   * Fetch `this.filepath`, parse it, and record the sequence count and
   * total length in `this.properties`.
   * @returns {Promise<{header: string, seq: string}[]>}
   * @throws {Error} when the HTTP response status is not in the 2xx range.
   */
  async read() {
    const response = await fetch(this.filepath);
    // Without this check the body of a 404/500 page would be parsed as FASTA.
    if (!response.ok) {
      throw new Error(`Failed to fetch ${this.filepath}: HTTP ${response.status}`);
    }
    const sequences = FFNHandler.parseFasta(await response.text());
    this.properties.numSequences = sequences.length;
    this.properties.totalSequenceLength = sequences.reduce((sum, s) => sum + s.seq.length, 0);
    return sequences;
  }

  /** Log the property table as pretty-printed JSON. */
  printProperties() {
    console.log(JSON.stringify(this.properties, null, 2));
  }

  /**
   * Offer the records to the user as a downloadable FASTA file (browser only).
   * @param {{header: string, seq: string}[]} sequences
   * @param {string} [newFilepath] download file name; defaults to 'output.ffn'.
   */
  write(sequences, newFilepath) {
    const content = FFNHandler.formatFasta(sequences);
    // For browser, use Blob to download
    const blob = new Blob([content], {type: 'text/plain'});
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = newFilepath || 'output.ffn';
    a.click();
    // Release the blob once the click has been dispatched; deferring one
    // tick avoids revoking the URL before the download has started.
    setTimeout(() => URL.revokeObjectURL(url), 0);
  }
}
// Example usage: fetch example.ffn, log its properties, then offer it for download.
(async function () {
  const ffn = new FFNHandler('example.ffn');
  const records = await ffn.read();
  ffn.printProperties();
  ffn.write(records, 'output.ffn');
})();
- C class for .FFN file:
Note: C has no classes, so this implementation uses a struct together with free functions that operate on it.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * State for one .ffn file: the path, a fixed description of the format,
 * and the counters filled in by readSequences().
 */
typedef struct {
    /* Heap copy of the path (strdup'd in createFFNHandler, freed in destroyFFNHandler). */
    char *filepath;

    /* Static format description; createFFNHandler points these at string literals. */
    char *fileType;
    char *extension;
    char *mimeType;
    char *structure;
    char *headerPrefix;
    char *headerContent;
    char *sequenceData;
    char *allowedCharacters;
    char *multiSequenceSupport;
    char *lineEndings;
    char *binaryComponents;
    char *endianness;
    char *magicBytes;
    char *intrinsicFileSystemProperties;

    /* Results of the most recent parse. */
    int numSequences;          /* number of records read */
    int totalSequenceLength;   /* combined length of all sequence strings */
} FFNHandler;
/*
 * Allocate a handler for `filepath` and fill in the static format description.
 *
 * The path is copied, so the caller keeps ownership of `filepath`. The
 * descriptive fields point at string literals and must not be freed or
 * modified. Returns NULL if `filepath` is NULL or memory cannot be
 * allocated; release a non-NULL result with destroyFFNHandler().
 */
FFNHandler* createFFNHandler(char* filepath) {
    if (filepath == NULL) {
        return NULL;
    }
    FFNHandler* handler = malloc(sizeof *handler);
    if (handler == NULL) {
        return NULL;
    }
    handler->filepath = strdup(filepath);
    if (handler->filepath == NULL) {
        /* Do not hand back a half-initialised handler. */
        free(handler);
        return NULL;
    }
    handler->fileType = "Text (ASCII/UTF-8)";
    handler->extension = ".ffn";
    handler->mimeType = "text/plain or application/x-fasta";
    handler->structure = "Unstructured text with alternating headers and sequence lines";
    handler->headerPrefix = ">";
    handler->headerContent = "Sequence identifier (mandatory) and description (optional)";
    handler->sequenceData = "Lines of nucleotide characters";
    handler->allowedCharacters = "A, C, G, T, U, N, R, Y, K, M, S, W, B, D, H, V, -";
    handler->multiSequenceSupport = "Yes";
    handler->lineEndings = "CR/LF or LF";
    handler->binaryComponents = "No";
    handler->endianness = "N/A";
    handler->magicBytes = "No";
    handler->intrinsicFileSystemProperties = "Regular file, readable as text, size variable, creation/modified dates per FS";
    handler->numSequences = 0;
    handler->totalSequenceLength = 0;
    return handler;
}
/*
 * Free a handler and its copied path. Passing NULL is a no-op, mirroring
 * free(). The descriptive fields are string literals and are not freed.
 */
void destroyFFNHandler(FFNHandler* handler) {
    if (handler == NULL) {
        return;
    }
    free(handler->filepath);
    free(handler);
}
/* One FASTA record; readSequences() heap-allocates both strings and main() frees them. */
typedef struct {
    char *header;   /* header line text following the leading '>' */
    char *seq;      /* sequence lines concatenated into one string */
} Sequence;
/* Strip leading and trailing spaces, tabs and CRs in place; returns the trimmed start. */
static char* trimLine(char* line) {
    while (*line == ' ' || *line == '\t' || *line == '\r') {
        line++;
    }
    size_t len = strlen(line);
    while (len > 0 && (line[len - 1] == ' ' || line[len - 1] == '\t' || line[len - 1] == '\r')) {
        len--;
    }
    line[len] = '\0';
    return line;
}

/* Append one record, taking ownership of both strings; returns 0, or -1 if realloc fails. */
static int appendSequence(Sequence** sequences, int* n, char* header, char* seq) {
    Sequence* grown = realloc(*sequences, ((size_t)*n + 1) * sizeof(Sequence));
    if (grown == NULL) {
        return -1;
    }
    grown[*n].header = header;
    grown[*n].seq = seq;
    *sequences = grown;
    (*n)++;
    return 0;
}

/*
 * Parse the handler's file into an array of records.
 *
 * Each record's header is the text after '>' and its seq is the
 * concatenation of the following lines, both with surrounding spaces, tabs
 * and CRs removed (so CRLF files parse cleanly). Text before the first
 * header is ignored.
 *
 * On success *count receives the number of records, and the handler's
 * numSequences / totalSequenceLength are overwritten with this file's
 * values. The caller owns the returned array and every header/seq string
 * in it.
 *
 * Returns NULL with *count == 0 when the file cannot be opened or read,
 * memory runs out, or the file contains no records.
 */
Sequence* readSequences(FFNHandler* handler, int* count) {
    if (count == NULL) {
        return NULL;
    }
    /* Always define *count so callers can loop over it even after a failure. */
    *count = 0;
    if (handler == NULL || handler->filepath == NULL) {
        return NULL;
    }

    FILE* f = fopen(handler->filepath, "r");
    if (!f) return NULL;

    /* Slurp the whole file; the size comes from seeking to the end. */
    char* content = NULL;
    long length = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        length = ftell(f);
    }
    if (length >= 0 && fseek(f, 0, SEEK_SET) == 0) {
        content = malloc((size_t)length + 1);
    }
    if (content == NULL) {
        fclose(f);
        return NULL;
    }
    /* In text mode fread may deliver fewer bytes than ftell reported, so
       terminate at what was actually read. */
    size_t got = fread(content, 1, (size_t)length, f);
    content[got] = '\0';
    fclose(f);

    Sequence* sequences = NULL;
    int n = 0;
    size_t total = 0;
    char* currentHeader = NULL;   /* NULL until the first '>' line is seen */
    char* currentSeq = NULL;
    size_t seqLen = 0;            /* tracked so appends need no strlen rescans */
    int ok = 1;

    char* line = strtok(content, "\n");
    while (ok && line) {
        if (line[0] == '>') {
            if (currentHeader) {
                if (appendSequence(&sequences, &n, currentHeader, currentSeq) != 0) {
                    ok = 0;
                    break;
                }
                total += seqLen;
            }
            currentHeader = strdup(trimLine(line + 1));
            currentSeq = malloc(1);
            seqLen = 0;
            if (currentHeader == NULL || currentSeq == NULL) {
                ok = 0;
                break;
            }
            currentSeq[0] = '\0';
        } else if (currentHeader) {
            char* text = trimLine(line);
            size_t add = strlen(text);
            char* grown = realloc(currentSeq, seqLen + add + 1);
            if (grown == NULL) {
                ok = 0;
                break;
            }
            memcpy(grown + seqLen, text, add + 1);
            currentSeq = grown;
            seqLen += add;
        }
        line = strtok(NULL, "\n");
    }

    /* Flush the final record. */
    if (ok && currentHeader) {
        if (appendSequence(&sequences, &n, currentHeader, currentSeq) == 0) {
            total += seqLen;
            currentHeader = NULL;
            currentSeq = NULL;
        } else {
            ok = 0;
        }
    }

    free(content);

    if (!ok) {
        /* Release the pending record and everything already stored. */
        free(currentHeader);
        free(currentSeq);
        for (int i = 0; i < n; i++) {
            free(sequences[i].header);
            free(sequences[i].seq);
        }
        free(sequences);
        return NULL;
    }

    /* Assign rather than accumulate, so a second read does not double the total. */
    handler->numSequences = n;
    handler->totalSequenceLength = (int)total;
    *count = n;
    return sequences;
}
/*
 * Print every handler property to stdout, one "name: value" pair per line:
 * the descriptive strings first, then the two parse counters.
 */
void printProperties(FFNHandler* handler) {
    const FFNHandler* h = handler;

    /* Static format description. */
    printf("fileType: %s\n", h->fileType);
    printf("extension: %s\n", h->extension);
    printf("mimeType: %s\n", h->mimeType);
    printf("structure: %s\n", h->structure);
    printf("headerPrefix: %s\n", h->headerPrefix);
    printf("headerContent: %s\n", h->headerContent);
    printf("sequenceData: %s\n", h->sequenceData);
    printf("allowedCharacters: %s\n", h->allowedCharacters);
    printf("multiSequenceSupport: %s\n", h->multiSequenceSupport);
    printf("lineEndings: %s\n", h->lineEndings);
    printf("binaryComponents: %s\n", h->binaryComponents);
    printf("endianness: %s\n", h->endianness);
    printf("magicBytes: %s\n", h->magicBytes);
    printf("intrinsicFileSystemProperties: %s\n", h->intrinsicFileSystemProperties);

    /* Counters set by readSequences(). */
    printf("numSequences: %d\n", h->numSequences);
    printf("totalSequenceLength: %d\n", h->totalSequenceLength);
}
/*
 * Write `count` records as FASTA, wrapping sequence data at 80 characters
 * per line.
 *
 * The output goes to `newFilepath`, or to the handler's own path when
 * `newFilepath` is NULL. Returns silently if no path is available or the
 * file cannot be opened. A NULL `sequences` array is treated as empty, so
 * the destination is still created/truncated but no records are written.
 */
void writeSequences(FFNHandler* handler, Sequence* sequences, int count, char* newFilepath) {
    if (newFilepath == NULL && handler == NULL) {
        return;
    }
    const char* path = newFilepath ? newFilepath : handler->filepath;
    FILE* f = fopen(path, "w");
    if (!f) return;

    if (sequences == NULL) {
        count = 0;
    }
    for (int i = 0; i < count; i++) {
        fprintf(f, ">%s\n", sequences[i].header);
        const char* seq = sequences[i].seq;
        /* Measure once; re-running strlen in the loop condition rescans the
           whole sequence for every output line. */
        size_t remaining = strlen(seq);
        while (remaining > 0) {
            size_t chunk = remaining < 80 ? remaining : 80;
            fwrite(seq, 1, chunk, f);
            fputc('\n', f);
            seq += chunk;
            remaining -= chunk;
        }
    }
    fclose(f);
}
/*
 * Example driver: parse example.ffn, print the handler's properties, copy
 * the records to output.ffn, then release everything.
 * Returns 1 if the handler cannot be created, 0 otherwise.
 */
int main() {
    FFNHandler* handler = createFFNHandler("example.ffn");
    if (handler == NULL) {
        fprintf(stderr, "Could not allocate FFN handler\n");
        return 1;
    }

    /* Start at 0: readSequences may return NULL without storing a count,
       and the cleanup loop below must not run on an indeterminate value. */
    int count = 0;
    Sequence* sequences = readSequences(handler, &count);
    if (sequences == NULL) {
        count = 0;
    }

    printProperties(handler);
    writeSequences(handler, sequences, count, "output.ffn");

    // Free memory
    for (int i = 0; i < count; i++) {
        free(sequences[i].header);
        free(sequences[i].seq);
    }
    free(sequences);
    destroyFFNHandler(handler);
    return 0;
}