Task 429: .MSF File Format
Task 429: .MSF File Format
File Format Specifications for .MSF
The .MSF file format referred to here is the Multiple Sequence Format used in bioinformatics for multiple sequence alignments, originally from the GCG (Genetics Computer Group) Wisconsin Package. It is a text-based format that includes a header with metadata and the aligned sequences. The specification is based on standard descriptions for this format.
- List of Properties Intrinsic to the File Format
The properties are the key metadata fields in the header of the MSF file, which describe the alignment and individual sequences. These are:
- Alignment Length (MSF): The length of the longest sequence in the alignment (integer).
- Type: The type of sequences, 'P' for protein or 'N' for nucleotide.
- Global Checksum (Check): A checksum value based on ASCII values of sequence characters (integer; often used for verification).
- Sequence Name: Unique identifier for each sequence (string; no spaces, max ~13 characters).
- Sequence Length (Len): The length of each individual sequence (integer).
- Sequence Checksum (Check): Checksum for each sequence (integer).
- Sequence Weight (Weight): Weight assigned to each sequence (float; used in some alignment algorithms).
These properties appear in the header, followed by the actual aligned sequences.
- Two Direct Download Links for .MSF Files
- https://pmc.ncbi.nlm.nih.gov/articles/instance/1770913/bin/1471-2229-7-1-S1.MSF (Supplementary alignment file from a bioinformatics study on plant acyl-ACP thioesterases).
- https://www.researchgate.net/profile/Georg-Hausner-2/publication/295004530_Additional_File_4/data/56cc52dd08ae5488f0dcf577/1471-2148-7-31-S4.msf (Supplementary alignment file from a study on gene sequences).
- Ghost Blog Embedded HTML JavaScript for Drag-and-Drop .MSF File Parser
Here's an HTML page with embedded JavaScript that allows dragging and dropping a .MSF file. It parses the file and dumps the properties to the screen.
Drag and Drop .MSF File
- Python Class for .MSF File Handling
Here's a Python class that can open, decode (parse), read, write, and print the properties to console.
import os
class MSFHandler:
    """Read, inspect and write GCG MSF (Multiple Sequence Format) alignments.

    Attributes:
        filepath: Path passed to the constructor or to the last ``read`` call.
        alignment_length: Integer value of the header ``MSF:`` field, or None.
        type: String value of the header ``Type:`` field, or None.
        global_checksum: Integer value of the header ``Check:`` field, or None.
        sequences: One dict per ``Name:`` header line, in file order, with the
            keys 'name', 'len', 'check', 'weight' and 'sequence'.
        alignment_blocks: Never populated by this class; kept so existing
            callers that reference the attribute keep working.
    """

    def __init__(self, filepath=None):
        """Create a handler and, when ``filepath`` is given, read it at once."""
        self.filepath = filepath
        self.alignment_length = None
        self.type = None
        self.global_checksum = None
        self.sequences = []  # list of dicts: {'name': str, 'len': int, 'check': int, 'weight': float, 'sequence': str}
        self.alignment_blocks = []  # for full sequences
        if filepath:
            self.read(filepath)

    @staticmethod
    def _field_value(tokens, key):
        """Return the value that belongs to ``key`` in a tokenised header line.

        Both ``Key: value`` (value is the next token) and ``Key:value``
        (value glued to the key) are accepted. Returns None when the key is
        absent or has no value.
        """
        for index, token in enumerate(tokens):
            if token == key:
                return tokens[index + 1] if index + 1 < len(tokens) else None
            if token.startswith(key):
                return token[len(key):]
        return None

    @classmethod
    def _required_value(cls, tokens, key, line):
        """Like ``_field_value`` but raise ValueError when the key has no value."""
        value = cls._field_value(tokens, key)
        if value is None:
            raise ValueError(f"Malformed MSF header line (missing {key}): {line!r}")
        return value

    def parse_header(self, lines):
        """Parse the header section, stopping at the ``//`` divider.

        Args:
            lines: List of file lines; trailing newlines are tolerated.

        Returns:
            The lines after the divider, or ``lines`` unchanged when no
            divider line is present.

        Raises:
            ValueError: If a numeric field is missing or is not a number.
        """
        for index, raw_line in enumerate(lines):
            line = raw_line.strip()
            tokens = line.split()
            if line.endswith('..'):
                # Global header line, e.g. "x.msf  MSF: 510  Type: P  Check: 7736 ..".
                # Trailing dots are stripped so a value glued to the ".."
                # terminator still parses.
                length = self._field_value(tokens, 'MSF:')
                if length is not None:
                    self.alignment_length = int(length.rstrip('.'))
                seq_type = self._field_value(tokens, 'Type:')
                if seq_type is not None:
                    self.type = seq_type.rstrip('.')
                checksum = self._field_value(tokens, 'Check:')
                if checksum is not None:
                    self.global_checksum = int(checksum.rstrip('.'))
            elif line.startswith('Name:'):
                self.sequences.append({
                    'name': self._required_value(tokens, 'Name:', line),
                    'len': int(self._required_value(tokens, 'Len:', line)),
                    'check': int(self._required_value(tokens, 'Check:', line)),
                    'weight': float(self._required_value(tokens, 'Weight:', line)),
                    'sequence': ''
                })
            elif line.startswith('//'):
                # Slice by position: the raw line may carry a newline or
                # padding, so searching the list for the literal '//' fails.
                return lines[index + 1:]
        return lines

    def parse_alignment(self, lines):
        """Append alignment rows to the sequences declared in the header.

        Each non-blank line is split on whitespace; the first token is the
        sequence name and the rest are residue chunks. Lines whose first
        token is not a known name (for example position rulers) are ignored.
        """
        seq_map = {seq['name']: seq for seq in self.sequences}
        for line in lines:
            parts = line.split()
            if not parts:
                continue
            record = seq_map.get(parts[0])
            if record is not None:
                record['sequence'] += ''.join(parts[1:]).replace('.', '-')  # Standardize gaps

    def read(self, filepath):
        """Load ``filepath``, replacing any previously parsed content."""
        self.filepath = filepath
        with open(filepath, 'r') as f:
            lines = f.readlines()
        # Start from a clean slate so reading twice does not duplicate records.
        self.alignment_length = None
        self.type = None
        self.global_checksum = None
        self.sequences = []
        remaining_lines = self.parse_header(lines)
        self.parse_alignment([l.strip() for l in remaining_lines if l.strip() and not l.startswith('//')])

    def print_properties(self):
        """Print the global header fields and one line per sequence."""
        print(f"Alignment Length: {self.alignment_length}")
        print(f"Type: {self.type}")
        print(f"Global Checksum: {self.global_checksum}")
        for seq in self.sequences:
            print(f"Name: {seq['name']}, Length: {seq['len']}, Checksum: {seq['check']}, Weight: {seq['weight']}")

    def write(self, new_filepath=None):
        """Write the alignment as MSF text, 50 residues per line.

        Args:
            new_filepath: Destination path. Defaults to ``self.filepath`` and
                then to 'output.msf'.

        When no alignment length was parsed, the length of the longest stored
        sequence is used so the residues are still written out.
        """
        if not new_filepath:
            new_filepath = self.filepath or 'output.msf'
        length = self.alignment_length
        if length is None:
            length = max((len(seq['sequence']) for seq in self.sequences), default=0)
        # default=0 keeps an empty alignment from raising in max().
        max_name_len = max((len(seq['name']) for seq in self.sequences), default=0)
        with open(new_filepath, 'w') as f:
            # Write header
            f.write(f"MSF: {length} Type: {self.type} Check: {self.global_checksum} ..\n")
            for seq in self.sequences:
                f.write(f"Name: {seq['name']} Len: {seq['len']} Check: {seq['check']} Weight: {seq['weight']}\n")
            f.write("//\n")
            # Write alignment (simple, 50 chars per line)
            for pos in range(0, length, 50):
                for seq in self.sequences:
                    segment = seq['sequence'][pos:pos + 50]
                    f.write(f"{seq['name'].ljust(max_name_len + 1)} {segment}\n")
                f.write("\n")
# Example usage:
# msf = MSFHandler('example.msf')
# msf.print_properties()
# msf.write('new.msf')
- Java Class for .MSF File Handling
Here's a Java class that can open, decode, read, write, and print the properties to console.
import java.io.*;
import java.util.ArrayList;
import java.util.List;
public class MSFHandler {
private String filepath;
private int alignmentLength;
private String type;
private int globalChecksum;
private List<Sequence> sequences = new ArrayList<>();
static class Sequence {
String name;
int len;
int check;
double weight;
String sequence = "";
}
public MSFHandler(String filepath) throws IOException {
this.filepath = filepath;
read(filepath);
}
private void parseHeader(BufferedReader reader) throws IOException {
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.endsWith("..")) {
String[] parts = line.split("\\s+");
for (String part : parts) {
if (part.startsWith("MSF:")) {
alignmentLength = Integer.parseInt(part.split(":")[1]);
} else if (part.startsWith("Type:")) {
type = part.split(":")[1];
} else if (part.startsWith("Check:")) {
globalChecksum = Integer.parseInt(part.split(":")[1]);
}
}
} else if (line.startsWith("Name:")) {
String[] parts = line.split("\\s+");
Sequence seq = new Sequence();
for (int i = 0; i < parts.length; i++) {
if (parts[i].equals("Name:")) seq.name = parts[++i];
if (parts[i].equals("Len:")) seq.len = Integer.parseInt(parts[++i]);
if (parts[i].equals("Check:")) seq.check = Integer.parseInt(parts[++i]);
if (parts[i].equals("Weight:")) seq.weight = Double.parseDouble(parts[++i]);
}
sequences.add(seq);
} else if (line.startsWith("//")) {
break;
}
}
}
private void parseAlignment(BufferedReader reader) throws IOException {
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (!line.isEmpty()) {
String[] parts = line.split("\\s+");
String name = parts[0];
String seqSegment = String.join("", parts).substring(name.length()).replace(".", "-");
for (Sequence seq : sequences) {
if (seq.name.equals(name)) {
seq.sequence += seqSegment;
break;
}
}
}
}
}
public void read(String filepath) throws IOException {
try (BufferedReader reader = new BufferedReader(new FileReader(filepath))) {
parseHeader(reader);
parseAlignment(reader);
}
}
public void printProperties() {
System.out.println("Alignment Length: " + alignmentLength);
System.out.println("Type: " + type);
System.out.println("Global Checksum: " + globalChecksum);
for (Sequence seq : sequences) {
System.out.println("Name: " + seq.name + ", Length: " + seq.len + ", Checksum: " + seq.check + ", Weight: " + seq.weight);
}
}
public void write(String newFilepath) throws IOException {
try (PrintWriter writer = new PrintWriter(new FileWriter(newFilepath))) {
writer.println("MSF: " + alignmentLength + " Type: " + type + " Check: " + globalChecksum + " ..");
for (Sequence seq : sequences) {
writer.println("Name: " + seq.name + " Len: " + seq.len + " Check: " + seq.check + " Weight: " + seq.weight);
}
writer.println("//");
int maxNameLen = sequences.stream().mapToInt(s -> s.name.length()).max().orElse(0);
int pos = 0;
while (pos < alignmentLength) {
for (Sequence seq : sequences) {
String segment = seq.sequence.substring(pos, Math.min(pos + 50, alignmentLength));
writer.println(seq.name + " ".repeat(maxNameLen - seq.name.length() + 1) + segment);
}
writer.println();
pos += 50;
}
}
}
public static void main(String[] args) throws IOException {
MSFHandler msf = new MSFHandler("example.msf");
msf.printProperties();
msf.write("new.msf");
}
}
- JavaScript Class for .MSF File Handling
Here's a JavaScript class (Node.js style) that can open, decode, read, write, and print the properties to the console. It requires Node's built-in `fs` module.
const fs = require('fs');
/**
 * Reads, prints and writes GCG MSF (Multiple Sequence Format) alignment files.
 *
 * Instance fields:
 *   filepath        - path given to the constructor or to the last read()
 *   alignmentLength - number from the header "MSF:" field, or null
 *   type            - string from the header "Type:" field, or null
 *   globalChecksum  - number from the header "Check:" field, or null
 *   sequences       - array of {name, len, check, weight, sequence}
 */
class MSFHandler {
  /**
   * @param {?string} filepath When given, the file is read immediately.
   */
  constructor(filepath = null) {
    this.filepath = filepath;
    this.alignmentLength = null;
    this.type = null;
    this.globalChecksum = null;
    this.sequences = []; // array of objects: {name, len, check, weight, sequence}
    if (filepath) this.read(filepath);
  }

  /**
   * Finds the value belonging to `key` in a tokenised header line. Accepts
   * both "Key: value" (value is the next token) and "Key:value" (value glued
   * to the key).
   * @param {string[]} tokens
   * @param {string} key
   * @returns {?string} The value, or null when the key is absent or bare.
   */
  static _fieldValue(tokens, key) {
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      if (token === key) return i + 1 < tokens.length ? tokens[i + 1] : null;
      if (token.startsWith(key)) return token.slice(key.length);
    }
    return null;
  }

  /**
   * Parses header lines up to and including the "//" divider.
   * @param {string[]} lines File lines without line terminators.
   * @returns {string[]} The lines that follow the divider (empty when the
   *   divider is missing).
   */
  parseHeader(lines) {
    let i = 0;
    let inHeader = true;
    while (inHeader && i < lines.length) {
      const line = lines[i].trim();
      const tokens = line.split(/\s+/);
      if (line.endsWith('..')) {
        // Global header line, e.g. "x.msf  MSF: 510  Type: P  Check: 7736 ..".
        // A field is only stored when it parses, so a malformed value cannot
        // poison the handler with NaN.
        const length = parseInt(MSFHandler._fieldValue(tokens, 'MSF:'), 10);
        if (!Number.isNaN(length)) this.alignmentLength = length;
        const seqType = MSFHandler._fieldValue(tokens, 'Type:');
        if (seqType !== null) this.type = seqType.replace(/\.+$/, '');
        const checksum = parseInt(MSFHandler._fieldValue(tokens, 'Check:'), 10);
        if (!Number.isNaN(checksum)) this.globalChecksum = checksum;
      } else if (line.startsWith('Name:')) {
        const name = MSFHandler._fieldValue(tokens, 'Name:');
        this.sequences.push({
          name: name === null ? '' : name,
          len: parseInt(MSFHandler._fieldValue(tokens, 'Len:'), 10),
          check: parseInt(MSFHandler._fieldValue(tokens, 'Check:'), 10),
          weight: parseFloat(MSFHandler._fieldValue(tokens, 'Weight:')),
          sequence: '',
        });
      } else if (line.startsWith('//')) {
        inHeader = false;
      }
      i++;
    }
    return lines.slice(i);
  }

  /**
   * Appends alignment rows to the sequences declared in the header. The first
   * token of each non-blank line is the sequence name; remaining tokens are
   * residue chunks. Lines naming no declared sequence are ignored.
   * @param {string[]} lines
   */
  parseAlignment(lines) {
    for (const rawLine of lines) {
      const line = rawLine.trim();
      if (!line) continue;
      const [name, ...chunks] = line.split(/\s+/);
      const target = this.sequences.find(seq => seq.name === name);
      if (target) target.sequence += chunks.join('').replace(/\./g, '-');
    }
  }

  /**
   * Loads `filepath`, replacing any previously parsed content.
   * @param {string} filepath
   */
  read(filepath) {
    this.filepath = filepath;
    const content = fs.readFileSync(filepath, 'utf8');
    const lines = content.split(/\r?\n/);
    // Reset first so a second read does not append duplicate records.
    this.alignmentLength = null;
    this.type = null;
    this.globalChecksum = null;
    this.sequences = [];
    const remainingLines = this.parseHeader(lines);
    this.parseAlignment(remainingLines.filter(l => l.trim() && !l.startsWith('//')));
  }

  /** Logs the global header fields and one line per sequence. */
  printProperties() {
    console.log(`Alignment Length: ${this.alignmentLength}`);
    console.log(`Type: ${this.type}`);
    console.log(`Global Checksum: ${this.globalChecksum}`);
    this.sequences.forEach(seq => {
      console.log(`Name: ${seq.name}, Length: ${seq.len}, Checksum: ${seq.check}, Weight: ${seq.weight}`);
    });
  }

  /**
   * Writes the alignment as MSF text, 50 residues per line.
   * @param {?string} newFilepath Destination; defaults to this.filepath and
   *   then to 'output.msf'.
   */
  write(newFilepath = null) {
    if (!newFilepath) newFilepath = this.filepath || 'output.msf';
    // Without a parsed alignment length, fall back to the longest stored
    // sequence so the residues are still written out.
    const length = Number.isFinite(this.alignmentLength)
      ? this.alignmentLength
      : Math.max(0, ...this.sequences.map(s => s.sequence.length));
    let output = `MSF: ${length} Type: ${this.type} Check: ${this.globalChecksum} ..\n`;
    this.sequences.forEach(seq => {
      output += `Name: ${seq.name} Len: ${seq.len} Check: ${seq.check} Weight: ${seq.weight}\n`;
    });
    output += '//\n';
    // The leading 0 keeps Math.max from returning -Infinity for no sequences.
    const maxNameLen = Math.max(0, ...this.sequences.map(s => s.name.length));
    for (let pos = 0; pos < length; pos += 50) {
      this.sequences.forEach(seq => {
        const segment = seq.sequence.slice(pos, pos + 50);
        output += `${seq.name.padEnd(maxNameLen + 1)}${segment}\n`;
      });
      output += '\n';
    }
    fs.writeFileSync(newFilepath, output);
  }
}
// Example usage:
// const msf = new MSFHandler('example.msf');
// msf.printProperties();
// msf.write('new.msf');
- C++ Class for .MSF File Handling
Here's a C++ class that can open, decode, read, write, and print the properties to console.
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
// One sequence record: the fields of its "Name:" header line plus the
// residues accumulated from the alignment section. Numeric fields default to
// zero so a record with a missing field never exposes indeterminate values.
struct Sequence {
    std::string name;
    int len = 0;
    int check = 0;
    double weight = 0.0;
    std::string sequence;
};

// Reads, prints and writes GCG MSF (Multiple Sequence Format) alignment files.
class MSFHandler {
private:
    std::string filepath;
    int alignment_length = 0;
    std::string type;
    int global_checksum = 0;
    std::vector<Sequence> sequences;

    // Splits a line on whitespace; '\r' counts as whitespace, so CRLF files
    // do not leak carriage returns into the tokens.
    static std::vector<std::string> split_ws(const std::string& line) {
        std::istringstream stream(line);
        std::vector<std::string> tokens;
        std::string token;
        while (stream >> token) tokens.push_back(token);
        return tokens;
    }

    // Finds the value belonging to `key` in a tokenised header line. Accepts
    // both "Key: value" (value is the next token) and "Key:value" (value glued
    // to the key). Returns false when the key is absent or has no value.
    static bool find_field(const std::vector<std::string>& tokens,
                           const std::string& key, std::string& value) {
        for (std::size_t i = 0; i < tokens.size(); ++i) {
            const std::string& token = tokens[i];
            if (token == key) {
                if (i + 1 >= tokens.size()) return false;
                value = tokens[i + 1];
                return true;
            }
            if (token.size() > key.size() && token.compare(0, key.size(), key) == 0) {
                value = token.substr(key.size());
                return true;
            }
        }
        return false;
    }

public:
    // Creates a handler and, when `fp` is non-empty, reads that file at once.
    MSFHandler(const std::string& fp = "") : filepath(fp) {
        if (!fp.empty()) read(fp);
    }

    // Consumes header lines up to and including the "//" divider.
    // Throws std::invalid_argument / std::out_of_range (from std::stoi) when a
    // global header number is malformed; malformed numbers on a "Name:" line
    // are not reported, matching stream-extraction semantics.
    void parse_header(std::ifstream& file) {
        std::string line;
        while (std::getline(file, line)) {
            const std::vector<std::string> tokens = split_ws(line);
            std::string value;
            if (line.find("..") != std::string::npos) {
                // Global header line, e.g. "x.msf  MSF: 510  Type: P  Check: 7736 ..".
                if (find_field(tokens, "MSF:", value)) alignment_length = std::stoi(value);
                if (find_field(tokens, "Type:", value)) type = value;
                if (find_field(tokens, "Check:", value)) global_checksum = std::stoi(value);
            } else if (line.find("Name:") != std::string::npos) {
                Sequence seq;
                if (find_field(tokens, "Name:", value)) seq.name = value;
                if (find_field(tokens, "Len:", value)) std::istringstream(value) >> seq.len;
                if (find_field(tokens, "Check:", value)) std::istringstream(value) >> seq.check;
                if (find_field(tokens, "Weight:", value)) std::istringstream(value) >> seq.weight;
                sequences.push_back(seq);
            } else if (line.find("//") != std::string::npos) {
                break;
            }
        }
    }

    // Reads the alignment section. The first token of each line is the
    // sequence name; remaining tokens are residue chunks whose '.' gap
    // characters are stored as '-'. Lines naming no declared sequence are
    // ignored.
    void parse_alignment(std::ifstream& file) {
        std::string line;
        while (std::getline(file, line)) {
            std::istringstream stream(line);
            std::string name;
            if (!(stream >> name)) continue;  // blank or whitespace-only line
            for (auto& seq : sequences) {
                if (seq.name != name) continue;
                std::string chunk;
                while (stream >> chunk) {
                    std::replace(chunk.begin(), chunk.end(), '.', '-');
                    seq.sequence += chunk;
                }
                break;
            }
        }
    }

    // Loads `fp`, replacing any previously parsed content. When the file
    // cannot be opened an error is printed to stderr and the current state is
    // left untouched.
    void read(const std::string& fp) {
        filepath = fp;
        std::ifstream file(fp);
        if (!file) {
            std::cerr << "Error opening file: " << fp << std::endl;
            return;
        }
        // Reset first so a second read does not append duplicate records.
        alignment_length = 0;
        type.clear();
        global_checksum = 0;
        sequences.clear();
        parse_header(file);
        parse_alignment(file);
    }

    // Prints the global header fields and one line per sequence to stdout.
    void print_properties() const {
        std::cout << "Alignment Length: " << alignment_length << '\n';
        std::cout << "Type: " << type << '\n';
        std::cout << "Global Checksum: " << global_checksum << '\n';
        for (const auto& seq : sequences) {
            std::cout << "Name: " << seq.name << ", Length: " << seq.len << ", Checksum: " << seq.check << ", Weight: " << seq.weight << '\n';
        }
    }

    // Writes the alignment as MSF text, 50 residues per line. `new_fp`
    // defaults to the path last read and then to "output.msf".
    void write(const std::string& new_fp = "") const {
        const std::string out_fp = new_fp.empty() ? (filepath.empty() ? "output.msf" : filepath) : new_fp;
        std::ofstream file(out_fp);
        if (!file) {
            std::cerr << "Error writing file: " << out_fp << std::endl;
            return;
        }
        file << "MSF: " << alignment_length << " Type: " << type << " Check: " << global_checksum << " ..\n";
        for (const auto& seq : sequences) {
            file << "Name: " << seq.name << " Len: " << seq.len << " Check: " << seq.check << " Weight: " << std::fixed << std::setprecision(2) << seq.weight << "\n";
        }
        file << "//\n";
        std::size_t max_name_len = 0;
        for (const auto& seq : sequences) max_name_len = std::max(max_name_len, seq.name.length());
        const std::size_t residues_per_line = 50;
        // A negative length must not wrap to a huge unsigned loop bound.
        const std::size_t total = alignment_length > 0 ? static_cast<std::size_t>(alignment_length) : 0;
        for (std::size_t pos = 0; pos < total; pos += residues_per_line) {
            for (const auto& seq : sequences) {
                // substr() throws std::out_of_range when pos exceeds the
                // stored length, which happens for sequences shorter than the
                // declared alignment length.
                const std::string segment =
                    pos < seq.sequence.size() ? seq.sequence.substr(pos, residues_per_line) : std::string();
                file << std::left << std::setw(static_cast<int>(max_name_len + 1)) << seq.name << segment << "\n";
            }
            file << "\n";
        }
    }
};
// Example usage:
// int main() {
// MSFHandler msf("example.msf");
// msf.print_properties();
// msf.write("new.msf");
// return 0;
// }