Task 250: .GBK File Format
- The .GBK file format refers to the GenBank flat file format, which is a standard text-based format for storing annotated nucleotide or protein sequences, maintained by the National Center for Biotechnology Information (NCBI). It consists of an annotation section followed by a sequence section, terminated by "//". Based on the specifications from NCBI documentation, the intrinsic properties (main sections and their typical subfields) of this format are as follows:
LOCUS: Provides summary information about the sequence.
- Locus name (unique identifier).
- Sequence length (in base pairs or amino acids).
- Molecule type (e.g., DNA, RNA).
- Topology (linear or circular).
- GenBank division (e.g., PLN for plant sequences).
- Modification date.
DEFINITION: A concise description of the sequence, including organism, gene name, and function.
ACCESSION: The primary unique identifier for the sequence record (stable across updates).
VERSION: The sequence version in accession.version format, incremented upon sequence changes; may include GI (GenInfo Identifier).
KEYWORDS: Descriptive words or phrases (historical; often a period if none are present).
SOURCE: Free-text information about the source, typically the organism name.
ORGANISM: Formal scientific name and taxonomic lineage of the source organism.
REFERENCE: Citations for publications related to the sequence (multiple possible).
- AUTHORS: List of authors.
- TITLE: Publication title.
- JOURNAL: Journal name or submission details.
- PUBMED: PubMed identifier (if applicable).
COMMENT: Optional free-text remarks or additional information.
FEATURES: Table of biological features and annotations.
- Feature keys (e.g., source, gene, CDS, mRNA).
- Locations (base spans, e.g., 1..100 or complement(200..300)).
- Qualifiers (e.g., /organism, /product, /translation, /note).
BASE COUNT (or CONTIG for assemblies): Summary of base composition (e.g., number of A, C, G, T) or contig description.
ORIGIN: Marks the start of the sequence data (may include a pointer).
Sequence Data: Numbered lines of sequence (60 characters per line, grouped in blocks of 10).
//: Terminator indicating the end of the record.
Two direct download links for .GBK files (these URLs provide the GenBank flat file content directly, which can be saved as .gbk):
- https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=U49845&report=genbank (Sample Saccharomyces cerevisiae TCP1-beta gene).
- https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=NC_000913&report=genbank (Escherichia coli K-12 substr. MG1655 complete genome).
Below is an HTML page with embedded JavaScript suitable for embedding in a Ghost blog (or any static site). It allows users to drag and drop a .GBK file, parses the file into sections based on the properties listed in the format description above, and displays them on the screen.
Drag and Drop .GBK File Parser
- Below is a Python class for handling .GBK files. It can open a file, parse (decode/read) the sections, print them to the console, and write modified sections back to a file.
class GBKHandler:
    """Read, inspect and rewrite the first record of a GenBank (.gbk) flat file.

    ``sections`` maps each top-level keyword (``LOCUS``, ``DEFINITION``,
    ``FEATURES``, ``ORIGIN``, ...) to its text. The first line of the value is
    whatever followed the keyword on its own line; every following line is an
    indented continuation line (sub-keywords, feature table rows, sequence
    rows) stored verbatim apart from trailing whitespace. Each line in the
    value ends with ``'\\n'``.

    When a keyword occurs more than once (typically ``REFERENCE``), the later
    occurrences are stored inside the same value as complete keyword lines so
    that ``write`` reproduces them instead of emitting a bare line.
    """

    # GenBank keyword lines keep the keyword in a fixed-width field; the text
    # starts in the column after it. The FEATURES header follows the wider
    # feature-table key column.
    _HEADER_COLUMN = 12
    _FEATURES_COLUMN = 21

    def __init__(self):
        # Keyword -> section text, in the order the keywords appear in the file.
        self.sections = {}

    @classmethod
    def _header(cls, key, text):
        """Return the keyword line for ``key`` with ``text`` in its column.

        There is always at least one space between keyword and text, and a
        keyword without text (e.g. a bare ``ORIGIN``) gets no trailing blanks.
        """
        width = cls._FEATURES_COLUMN if key == 'FEATURES' else cls._HEADER_COLUMN
        return f"{key:<{width - 1}} {text}".rstrip()

    def read(self, filepath):
        """Parse the first record of ``filepath`` into ``self.sections``.

        Any previously parsed sections are discarded. Parsing stops at the
        ``//`` terminator. Lines before the first keyword line are ignored.

        Raises:
            OSError: if the file cannot be opened.
        """
        import re  # local import: the surrounding file does not import re

        # A keyword line starts in column 0 with capital letters. "BASE COUNT"
        # is the one two-word keyword and must be tried first.
        keyword_line = re.compile(r'^(BASE COUNT|[A-Z]+)\s*(.*)$')

        with open(filepath, 'r') as f:
            lines = f.read().split('\n')

        self.sections = {}
        current_section = None
        for raw_line in lines:
            line = raw_line.rstrip()  # also drops the '\r' of CRLF files
            if line.startswith('//'):
                break
            match = keyword_line.match(line)
            if match:
                current_section, text = match.groups()
                if current_section in self.sections:
                    # Repeated keyword: keep the keyword so the record can be
                    # written back without losing the line's identity.
                    self.sections[current_section] += self._header(current_section, text) + '\n'
                else:
                    self.sections[current_section] = text + '\n'
            elif current_section:
                self.sections[current_section] += line + '\n'

    def print_properties(self):
        """Print every section as ``KEY:`` followed by its stripped text."""
        for key, value in self.sections.items():
            print(f"{key}:\n{value.strip()}\n")

    def write(self, filepath):
        """Write the sections to ``filepath`` as one GenBank record.

        The keyword of every section is padded to its column, each section is
        guaranteed to end with a newline (so a value assigned without one
        cannot run into the next keyword), and the record is closed with
        ``//``.

        Raises:
            OSError: if the file cannot be opened for writing.
        """
        with open(filepath, 'w') as f:
            for key, value in self.sections.items():
                first, _, rest = value.partition('\n')
                f.write(self._header(key, first) + '\n')
                if rest:
                    f.write(rest if rest.endswith('\n') else rest + '\n')
            f.write('//\n')
# Example usage:
# handler = GBKHandler()
# handler.read('example.gbk')
# handler.print_properties()
# handler.sections['DEFINITION'] = 'Modified definition\n' # Modify example
# handler.write('modified.gbk')
Note: The class relies on Python's standard `re` module for pattern matching, so make sure `import re` is in scope. For more advanced parsing, one could use Biopython instead (`from Bio import SeqIO`), but the code above implements the basic functionality from scratch.
- Below is a Java class for handling .GBK files. It can open a file, parse (decode/read) the sections, print them to the console, and write modified sections back to a file.
import java.io.*;
import java.util.*;
import java.util.regex.*;
public class GBKHandler {
private Map<String, StringBuilder> sections = new LinkedHashMap<>();
public void read(String filepath) throws IOException {
try (BufferedReader reader = new BufferedReader(new FileReader(filepath))) {
String line;
String currentSection = null;
Pattern pattern = Pattern.compile("^([A-Z]+)\\s*(.*)$");
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.startsWith("//")) {
break;
}
Matcher matcher = pattern.matcher(line);
if (matcher.matches() && !line.startsWith(" ")) {
currentSection = matcher.group(1);
sections.putIfAbsent(currentSection, new StringBuilder());
sections.get(currentSection).append(matcher.group(2)).append("\n");
} else if (currentSection != null) {
sections.get(currentSection).append(line).append("\n");
}
}
}
}
public void printProperties() {
for (Map.Entry<String, StringBuilder> entry : sections.entrySet()) {
System.out.println(entry.getKey() + ":\n" + entry.getValue().toString().trim() + "\n");
}
}
public void write(String filepath) throws IOException {
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filepath))) {
for (Map.Entry<String, StringBuilder> entry : sections.entrySet()) {
writer.write(entry.getKey() + " " + entry.getValue().toString());
}
writer.write("//\n");
}
}
// Example usage:
// public static void main(String[] args) throws IOException {
// GBKHandler handler = new GBKHandler();
// handler.read("example.gbk");
// handler.printProperties();
// handler.sections.get("DEFINITION").setLength(0); // Clear and modify
// handler.sections.get("DEFINITION").append("Modified definition\n");
// handler.write("modified.gbk");
// }
}
- Below is a JavaScript class for handling .GBK files (suitable for Node.js). It can open a file, parse (decode/read) the sections, print them to the console, and write modified sections back to a file. Requires Node.js and its built-in `fs` module.
const fs = require('fs');
/**
 * Reads, prints and rewrites the first record of a GenBank (.gbk) flat file.
 *
 * `sections` maps each top-level keyword (LOCUS, DEFINITION, FEATURES,
 * ORIGIN, ...) to its text. The first line of a value is whatever followed
 * the keyword on its own line; the remaining lines are the indented
 * continuation lines, stored verbatim apart from trailing whitespace. Every
 * stored line ends with '\n'.
 *
 * When a keyword occurs more than once (typically REFERENCE), the later
 * occurrences are stored inside the same value as complete keyword lines so
 * that `write` reproduces them.
 */
class GBKHandler {
  constructor() {
    // Keyword -> section text, in the order the keywords appear in the file.
    this.sections = {};
  }

  /**
   * Builds the keyword line for `key`: the keyword padded to its column
   * (12 for ordinary keywords, 21 for the FEATURES header, which follows the
   * feature-table key column), at least one space, then `text`. A keyword
   * without text gets no trailing blanks.
   * @param {string} key
   * @param {string} text
   * @returns {string}
   */
  static headerLine(key, text) {
    const width = key === 'FEATURES' ? 21 : 12;
    return `${key.padEnd(width - 1)} ${text}`.trimEnd();
  }

  /**
   * Parses the first record of `filepath`, replacing any previously parsed
   * sections. Parsing stops at the "//" terminator; lines before the first
   * keyword line are ignored. Throws whatever `fs.readFileSync` throws when
   * the file cannot be read.
   * @param {string} filepath
   */
  read(filepath) {
    const content = fs.readFileSync(filepath, 'utf8');
    // A keyword line starts in column 0 with capital letters. "BASE COUNT" is
    // the one two-word keyword and must be tried before the generic branch.
    const keywordLine = /^(BASE COUNT|[A-Z]+)\s*(.*)$/;

    this.sections = {};
    let currentSection = null;
    for (const rawLine of content.split('\n')) {
      const line = rawLine.trimEnd(); // also drops the '\r' of CRLF files
      if (line.startsWith('//')) {
        break;
      }
      const match = keywordLine.exec(line);
      if (match) {
        const [, key, text] = match;
        currentSection = key;
        if (Object.prototype.hasOwnProperty.call(this.sections, key)) {
          // Repeated keyword: keep the keyword so the line can be written
          // back unchanged.
          this.sections[key] += `${GBKHandler.headerLine(key, text)}\n`;
        } else {
          this.sections[key] = `${text}\n`;
        }
      } else if (currentSection !== null) {
        this.sections[currentSection] += `${line}\n`;
      }
    }
  }

  /** Logs every section as "KEY:" followed by its trimmed text. */
  printProperties() {
    for (const [key, value] of Object.entries(this.sections)) {
      console.log(`${key}:\n${value.trim()}\n`);
    }
  }

  /**
   * Writes the sections to `filepath` as one GenBank record. Keywords are
   * padded to their column, every section is guaranteed to end with a
   * newline, and the record is closed with "//".
   * @param {string} filepath
   */
  write(filepath) {
    let output = '';
    for (const [key, value] of Object.entries(this.sections)) {
      const newline = value.indexOf('\n');
      const first = newline === -1 ? value : value.slice(0, newline);
      const rest = newline === -1 ? '' : value.slice(newline + 1);
      output += `${GBKHandler.headerLine(key, first)}\n`;
      if (rest) {
        output += rest.endsWith('\n') ? rest : `${rest}\n`;
      }
    }
    output += '//\n';
    fs.writeFileSync(filepath, output);
  }
}
// Example usage:
// const handler = new GBKHandler();
// handler.read('example.gbk');
// handler.printProperties();
// handler.sections['DEFINITION'] = 'Modified definition\n';
// handler.write('modified.gbk');
- Below is a C++ class for handling .GBK files. It can open a file, parse (decode/read) the sections, print them to the console, and write modified sections back to a file.
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <string>
#include <vector>
/// Reads, prints and rewrites the first record of a GenBank (.gbk) flat file.
///
/// `sections` maps each top-level keyword (LOCUS, DEFINITION, FEATURES,
/// ORIGIN, ...) to its text. The first line of a value is whatever followed
/// the keyword on its own line; the remaining lines are the indented
/// continuation lines, stored verbatim apart from trailing whitespace. Every
/// stored line ends with '\n'.
///
/// When a keyword occurs more than once (typically REFERENCE), the later
/// occurrences are stored inside the same value as complete keyword lines so
/// that write() reproduces them.
class GBKHandler {
private:
    /// Keywords in the order they first appeared in the file. `sections` is a
    /// std::map and iterates alphabetically, which is not the record order.
    std::vector<std::string> order;

    /// Returns `text` without trailing spaces, tabs, CR or LF.
    static std::string rtrim(std::string text) {
        text.erase(text.find_last_not_of(" \n\r\t") + 1);
        return text;
    }

    /// Builds the keyword line for `key`: the keyword padded to its column
    /// (12 for ordinary keywords, 21 for the FEATURES header, which follows
    /// the feature-table key column), at least one space, then `text`.
    /// A keyword without text gets no trailing blanks.
    static std::string headerLine(const std::string& key, const std::string& text) {
        const std::size_t width = (key == "FEATURES") ? 21 : 12;
        std::string header = key;
        if (header.size() < width - 1) {
            header.append(width - 1 - header.size(), ' ');
        }
        header += ' ';
        header += text;
        return rtrim(header);
    }

    /// Returns the keys of `sections` in output order: first the keys seen by
    /// read() in file order, then any keys callers inserted directly.
    std::vector<std::string> orderedKeys() const {
        std::vector<std::string> keys;
        keys.reserve(sections.size());
        for (const auto& key : order) {
            if (sections.count(key) != 0) {
                keys.push_back(key);
            }
        }
        for (const auto& entry : sections) {
            if (std::find(order.begin(), order.end(), entry.first) == order.end()) {
                keys.push_back(entry.first);
            }
        }
        return keys;
    }

public:
    /// Keyword -> section text. Public so callers can modify a section
    /// between read() and write().
    std::map<std::string, std::string> sections;

    /// Parses the first record of `filepath`, replacing any previously parsed
    /// sections. Parsing stops at the "//" terminator; lines before the first
    /// keyword line are ignored. If the file cannot be opened the call
    /// returns without changing the current sections.
    void read(const std::string& filepath) {
        std::ifstream file(filepath);
        if (!file.is_open()) return;

        // A keyword line starts in column 0 with capital letters. "BASE COUNT"
        // is the one two-word keyword and must be tried first.
        static const std::regex pattern(R"(^(BASE COUNT|[A-Z]+)\s*(.*)$)");

        sections.clear();
        order.clear();
        std::string line;
        std::string currentSection;
        while (std::getline(file, line)) {
            // Match and store the trimmed line: a trailing '\r' from CRLF
            // input is not matched by '.' and would make every keyword line
            // fail to match.
            const std::string trimmed = rtrim(line);
            if (trimmed.rfind("//", 0) == 0) {
                break;
            }
            std::smatch match;
            if (std::regex_match(trimmed, match, pattern)) {
                currentSection = match[1].str();
                const std::string text = match[2].str();
                const auto existing = sections.find(currentSection);
                if (existing == sections.end()) {
                    order.push_back(currentSection);
                    sections.emplace(currentSection, text + "\n");
                } else {
                    // Repeated keyword: keep the keyword so the line can be
                    // written back unchanged.
                    existing->second += headerLine(currentSection, text) + "\n";
                }
            } else if (!currentSection.empty()) {
                sections[currentSection] += trimmed + "\n";
            }
        }
    }

    /// Prints every section, in record order, as "KEY:" followed by its text
    /// with trailing whitespace removed.
    void printProperties() const {
        for (const auto& key : orderedKeys()) {
            std::cout << key << ":\n" << rtrim(sections.at(key)) << "\n\n";
        }
    }

    /// Writes the sections to `filepath` as one GenBank record, in record
    /// order. Keywords are padded to their column, every section is
    /// guaranteed to end with a newline, and the record is closed with "//".
    /// If the file cannot be opened the call returns without writing.
    void write(const std::string& filepath) const {
        std::ofstream file(filepath);
        if (!file.is_open()) return;
        for (const auto& key : orderedKeys()) {
            const std::string& value = sections.at(key);
            const std::size_t newline = value.find('\n');
            file << headerLine(key, value.substr(0, newline)) << '\n';
            if (newline != std::string::npos && newline + 1 < value.size()) {
                const std::string rest = value.substr(newline + 1);
                file << rest;
                if (rest.back() != '\n') {
                    file << '\n';
                }
            }
        }
        file << "//\n";
    }
};
// Example usage:
// int main() {
// GBKHandler handler;
// handler.read("example.gbk");
// handler.printProperties();
// handler.sections["DEFINITION"] = "Modified definition\n"; // Note: Direct map access for modification
// handler.write("modified.gbk");
// return 0;
// }