Task 091: .CLS File Format
Task 091: .CLS File Format
1. Properties of the .CLS File Format Intrinsic to Its Structure
The .CLS file format, as used in bioinformatics tools such as GenePattern and GSEA (Gene Set Enrichment Analysis), is a plain text format designed to define phenotype labels for samples in gene expression datasets. It supports both categorical and continuous phenotypes, but the core structure is line-based and delimited by spaces or tabs. The intrinsic properties are derived directly from the file's structural elements, which encode metadata about samples and classes. These properties are:
- Number of samples: An integer indicating the total count of samples (corresponding to the sample columns in associated data files like .gct or .res). This is specified in the first line for categorical files.
- Number of classes: An integer indicating the number of distinct phenotype classes (for categorical files only). This is specified in the first line.
- File type: Either categorical (indicated by a first line of the form "<num_samples> <num_classes> 1", i.e., a trailing "1") or continuous (indicated by "#numeric" as the first line). This determines the interpretation of subsequent data.
- Class names: A list of strings naming each class (for categorical files). These appear on the second line, prefixed by "# ".
- Phenotype labels: A list of values assigned to each sample. For categorical files, this is a list of integers (starting from 0) on the third line, mapping each sample to a class index. For continuous files, this consists of one or more profiles, each with a name (prefixed by "# ") followed by a list of floating-point values (one per sample, potentially spanning multiple lines).
These properties form the foundational structure of the format, ensuring compatibility with associated expression data files. The format assumes UTF-8 encoding and space/tab delimiters, with no additional headers or footers.
2. Two Direct Download Links for .CLS Files
- https://datasets.genepattern.org/data/all_aml/all_aml_train.cls (Training class vector from the ALL-AML dataset, categorical format with 38 samples and 2 classes).
- https://datasets.genepattern.org/data/all_aml/all_aml_test.cls (Test class vector from the ALL-AML dataset, categorical format with 34 samples and 2 classes).
3. Ghost Blog-Embedded HTML JavaScript for Drag-and-Drop .CLS File Parsing
The following is a self-contained HTML snippet with embedded JavaScript, suitable for embedding in a Ghost blog post (e.g., via the HTML card or custom code block). It provides a drag-and-drop interface for uploading a .CLS file, parses it assuming a categorical format (as the primary use case), extracts the properties, and displays them on screen. Continuous formats are noted but not fully parsed for brevity. Place this in a Ghost post's HTML block.
This code uses the File API for drag-and-drop and parses the file client-side. It outputs the properties in a readable format below the drop zone.
4. Python Class for .CLS File Handling
The following Python class handles opening, reading, writing, and printing properties for a .CLS file, assuming categorical format. It uses built-in file I/O and string parsing.
import os
class CLSFile:
    """Read, write, and display categorical GenePattern/GSEA ``.cls`` files.

    A categorical ``.cls`` file has three non-blank lines::

        <num_samples> <num_classes> 1
        # <class_name> <class_name> ...
        <label> <label> ...

    Parsed values live in ``self.properties`` under the keys ``file_type``,
    ``num_samples``, ``num_classes``, ``class_names`` and
    ``phenotype_labels``.  When the third header field is not ``1`` the file
    is reported as ``'continuous or invalid'`` (with an explanatory ``note``
    key) and is not parsed any further.
    """

    def __init__(self, filename=None):
        """Create the object and, if *filename* is given, parse that file."""
        self.filename = filename
        self.properties = self._empty_properties()
        if filename:
            self.open(filename)

    @staticmethod
    def _empty_properties():
        """Return a fresh property dict with every field unset."""
        return {
            'file_type': None,
            'num_samples': None,
            'num_classes': None,
            'class_names': [],
            'phenotype_labels': [],
        }

    def _commit(self, parsed):
        """Replace the current properties with *parsed*, in place.

        The dict object itself is kept so that references handed out earlier
        by ``read()`` keep pointing at the live properties.
        """
        self.properties.clear()
        self.properties.update(parsed)

    def open(self, filename):
        """Open and parse the .CLS file.

        Raises:
            FileNotFoundError: if *filename* does not exist.
            ValueError: if the content is not a well-formed categorical file.
        """
        # Let open() itself report a missing file instead of checking first:
        # that avoids a race between the check and the actual open.
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()
        except FileNotFoundError:
            raise FileNotFoundError(f"File {filename} not found.") from None
        self._parse(content)
        self.filename = filename

    def _parse(self, content):
        """Parse *content* and store the result in ``self.properties``.

        The properties are only updated when parsing succeeds, and they are
        rebuilt from scratch each time, so nothing from a previous parse
        (counts, labels, or the ``note`` key) can leak into the new result.

        Raises:
            ValueError: on fewer than three non-blank lines, a malformed
                class-name line, a non-integer count or label, or a count
                that disagrees with the header.
        """
        lines = [line.strip() for line in content.strip().split('\n') if line.strip()]
        if len(lines) < 3:
            raise ValueError('Invalid CLS file: insufficient lines.')

        parsed = self._empty_properties()
        header = lines[0].split()
        if len(header) < 3 or header[2] != '1':
            parsed['file_type'] = 'continuous or invalid'
            parsed['note'] = 'Parser optimized for categorical.'
            self._commit(parsed)
            return

        parsed['file_type'] = 'categorical'
        parsed['num_samples'] = int(header[0])
        parsed['num_classes'] = int(header[1])

        names_line = lines[1]
        if not names_line.startswith('# '):
            raise ValueError('Invalid second line: expected # followed by class names.')
        parsed['class_names'] = names_line[2:].split()

        labels = lines[2].split()
        if len(labels) != parsed['num_samples']:
            raise ValueError('Mismatch in number of sample labels.')
        parsed['phenotype_labels'] = [int(label) for label in labels]

        if len(parsed['class_names']) != parsed['num_classes']:
            raise ValueError('Mismatch in number of class names.')
        self._commit(parsed)

    def read(self):
        """Re-read ``self.filename`` and return the property dict.

        Raises:
            ValueError: if no filename has been set.
        """
        if not self.filename:
            raise ValueError('No filename provided. Call open() first.')
        self.open(self.filename)
        return self.properties

    def write(self, filename=None, properties=None):
        """Write categorical properties to *filename* as a .CLS file.

        *filename* defaults to ``self.filename`` and *properties* defaults
        to ``self.properties``.  On success ``self.filename`` is updated.

        Raises:
            ValueError: if no filename is available or the properties are
                not of file type ``'categorical'``.
        """
        if filename is None:
            filename = self.filename
        if not filename:
            raise ValueError('Filename required for write.')
        if properties is None:
            properties = self.properties
        if properties['file_type'] != 'categorical':
            raise ValueError('Write supports categorical only.')
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"{properties['num_samples']} {properties['num_classes']} 1\n")
            f.write(f"# {' '.join(properties['class_names'])}\n")
            f.write(' '.join(str(label) for label in properties['phenotype_labels']) + '\n')
        self.filename = filename

    def print_properties(self):
        """Print all properties to console."""
        props = self.properties
        print(f"File Type: {props['file_type']}")
        if props['file_type'] == 'categorical':
            print(f"Number of Samples: {props['num_samples']}")
            print(f"Number of Classes: {props['num_classes']}")
            print(f"Class Names: {', '.join(props['class_names'])}")
            print(f"Phenotype Labels: [{', '.join(str(l) for l in props['phenotype_labels'])}]")
        else:
            print(f"Note: {props.get('note', 'Continuous format detected.')}")
Usage example: cls = CLSFile('example.cls'); cls.read(); cls.print_properties(); cls.write('output.cls').
5. Java Class for .CLS File Handling
The following Java class uses BufferedReader for reading and PrintWriter for writing. It assumes categorical format and requires Java 8+. Compile with javac CLSFile.java and run with java CLSFile <filename> (adapt for full usage).
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;
public class CLSFile {
private String filename;
private Map<String, Object> properties = new HashMap<>();
public CLSFile(String filename) {
this.filename = filename;
properties.put("file_type", null);
properties.put("num_samples", null);
properties.put("num_classes", null);
properties.put("class_names", new ArrayList<String>());
properties.put("phenotype_labels", new ArrayList<Integer>());
if (filename != null) {
open(filename);
}
}
public void open(String filename) {
this.filename = filename;
try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
parse(reader.lines().collect(Collectors.joining("\n")));
} catch (IOException e) {
throw new RuntimeException("File not found: " + filename, e);
}
}
private void parse(String content) {
String[] lines = content.split("\n");
lines = Arrays.stream(lines).map(String::trim).filter(s -> !s.isEmpty()).toArray(String[]::new);
if (lines.length < 3) {
throw new RuntimeException("Invalid CLS file: insufficient lines.");
}
String[] firstLine = lines[0].split("\\s+");
if (firstLine.length < 3 || !firstLine[2].equals("1")) {
properties.put("file_type", "continuous or invalid");
properties.put("note", "Parser optimized for categorical.");
return;
}
properties.put("file_type", "categorical");
properties.put("num_samples", Integer.parseInt(firstLine[0]));
properties.put("num_classes", Integer.parseInt(firstLine[1]));
String secondLine = lines[1];
if (!secondLine.startsWith("# ")) {
throw new RuntimeException("Invalid second line: expected # followed by class names.");
}
List<String> classNames = Arrays.stream(secondLine.substring(2).split("\\s+"))
.filter(name -> !name.isEmpty()).collect(Collectors.toList());
properties.put("class_names", classNames);
String[] thirdLine = lines[2].split("\\s+");
if (thirdLine.length != (int) properties.get("num_samples")) {
throw new RuntimeException("Mismatch in number of sample labels.");
}
List<Integer> labels = new ArrayList<>();
for (String label : thirdLine) {
if (!label.isEmpty()) {
labels.add(Integer.parseInt(label));
}
}
properties.put("phenotype_labels", labels);
if (classNames.size() != (int) properties.get("num_classes")) {
throw new RuntimeException("Mismatch in number of class names.");
}
}
public Map<String, Object> read() {
if (filename == null) {
throw new RuntimeException("No filename provided. Call open() first.");
}
open(filename);
return properties;
}
public void write(String filename, Map<String, Object> props) {
if (filename == null) {
filename = this.filename;
}
if (filename == null) {
throw new RuntimeException("Filename required for write.");
}
if (!"categorical".equals(props.get("file_type"))) {
throw new RuntimeException("Write supports categorical only.");
}
try (PrintWriter writer = new PrintWriter(new FileWriter(filename))) {
writer.println(props.get("num_samples") + " " + props.get("num_classes") + " 1");
writer.println("# " + String.join(" ", (List<String>) props.get("class_names")));
writer.println(String.join(" ", ((List<Integer>) props.get("phenotype_labels")).stream()
.map(String::valueOf).collect(Collectors.toList())));
} catch (IOException e) {
throw new RuntimeException("Write failed: " + e.getMessage(), e);
}
this.filename = filename;
}
public void printProperties() {
Map<String, Object> props = properties;
System.out.println("File Type: " + props.get("file_type"));
if ("categorical".equals(props.get("file_type"))) {
System.out.println("Number of Samples: " + props.get("num_samples"));
System.out.println("Number of Classes: " + props.get("num_classes"));
System.out.println("Class Names: " + String.join(", ", (List<String>) props.get("class_names")));
System.out.println("Phenotype Labels: [" + ((List<Integer>) props.get("phenotype_labels")).stream()
.map(String::valueOf).collect(Collectors.joining(", ")) + "]");
} else {
System.out.println("Note: " + props.getOrDefault("note", "Continuous format detected."));
}
}
public static void main(String[] args) {
if (args.length == 0) {
System.out.println("Usage: java CLSFile <filename>");
return;
}
CLSFile cls = new CLSFile(args[0]);
cls.read();
cls.printProperties();
}
}
6. JavaScript Class for .CLS File Handling (Node.js)
This Node.js class uses the fs module for file operations. Run with node clsfile.js <filename>. It assumes categorical format.
const fs = require('fs');
/**
 * Reader/writer for categorical GenePattern/GSEA .cls phenotype files.
 *
 * A categorical file has three non-blank lines: a header
 * "<num_samples> <num_classes> 1", a class-name line starting with "# ",
 * and one integer label per sample. Parsed values are kept in
 * `this.properties`. Files whose third header field is not "1" are reported
 * as 'continuous or invalid' and are not parsed further.
 */
class CLSFile {
  /**
   * @param {?string} filename Path to parse immediately, or null for none.
   */
  constructor(filename = null) {
    this.filename = filename;
    this.properties = CLSFile._emptyProperties();
    if (filename) {
      this.open(filename);
    }
  }

  /** Returns a fresh property object with every field unset. */
  static _emptyProperties() {
    return {
      file_type: null,
      num_samples: null,
      num_classes: null,
      class_names: [],
      phenotype_labels: []
    };
  }

  /**
   * Converts a token to an integer, rejecting anything parseInt would
   * silently turn into NaN or truncate (e.g. "abc" or "1x").
   * @throws {Error} If the token is not a plain decimal integer.
   */
  static _toInteger(token, what) {
    if (!/^[+-]?\d+$/.test(token)) {
      throw new Error(`Invalid ${what}: ${token}`);
    }
    return parseInt(token, 10);
  }

  /** Replaces the current properties with `parsed`, keeping the same object. */
  _commit(parsed) {
    for (const key of Object.keys(this.properties)) {
      delete this.properties[key];
    }
    Object.assign(this.properties, parsed);
  }

  /**
   * Reads and parses `filename`.
   * @throws {Error} If the file is missing or not a well-formed categorical file.
   */
  open(filename) {
    if (!fs.existsSync(filename)) {
      throw new Error(`File ${filename} not found.`);
    }
    const content = fs.readFileSync(filename, 'utf8');
    this._parse(content);
    this.filename = filename;
  }

  /**
   * Parses `content` into `this.properties`.
   *
   * The properties are rebuilt from scratch and only replaced once parsing
   * has succeeded, so values from an earlier parse never leak into the
   * result and a failed parse leaves the previous properties intact.
   * @throws {Error} On malformed content.
   */
  _parse(content) {
    const lines = content.trim().split('\n').map(line => line.trim()).filter(line => line.length > 0);
    if (lines.length < 3) {
      throw new Error('Invalid CLS file: insufficient lines.');
    }

    const parsed = CLSFile._emptyProperties();
    const firstLine = lines[0].split(/\s+/);
    if (firstLine.length < 3 || firstLine[2] !== '1') {
      parsed.file_type = 'continuous or invalid';
      parsed.note = 'Parser optimized for categorical.';
      this._commit(parsed);
      return;
    }

    parsed.file_type = 'categorical';
    parsed.num_samples = CLSFile._toInteger(firstLine[0], 'sample count');
    parsed.num_classes = CLSFile._toInteger(firstLine[1], 'class count');

    const secondLine = lines[1];
    if (!secondLine.startsWith('# ')) {
      throw new Error('Invalid second line: expected # followed by class names.');
    }
    parsed.class_names = secondLine.substring(2).split(/\s+/).filter(name => name.length > 0);

    const thirdLine = lines[2].split(/\s+/).filter(label => label.length > 0);
    if (thirdLine.length !== parsed.num_samples) {
      throw new Error('Mismatch in number of sample labels.');
    }
    parsed.phenotype_labels = thirdLine.map(label => CLSFile._toInteger(label, 'phenotype label'));

    if (parsed.class_names.length !== parsed.num_classes) {
      throw new Error('Mismatch in number of class names.');
    }
    this._commit(parsed);
  }

  /**
   * Re-reads the current file and returns the property object.
   * @throws {Error} If no filename has been set.
   */
  read() {
    if (!this.filename) {
      throw new Error('No filename provided. Call open() first.');
    }
    this.open(this.filename);
    return this.properties;
  }

  /**
   * Writes categorical properties to `filename`.
   * @param {?string} filename Target path; null means the current filename.
   * @param {?Object} properties Properties to write; null means this object's own.
   * @throws {Error} If no filename is available or the properties are not categorical.
   */
  write(filename = null, properties = null) {
    if (filename === null) {
      filename = this.filename;
    }
    if (!filename) {
      throw new Error('Filename required for write.');
    }
    if (properties === null) {
      properties = this.properties;
    }
    if (properties.file_type !== 'categorical') {
      throw new Error('Write supports categorical only.');
    }
    const content = `${properties.num_samples} ${properties.num_classes} 1\n# ${properties.class_names.join(' ')}\n${properties.phenotype_labels.join(' ')}\n`;
    fs.writeFileSync(filename, content);
    this.filename = filename;
  }

  /** Prints all properties to the console. */
  printProperties() {
    const props = this.properties;
    console.log(`File Type: ${props.file_type}`);
    if (props.file_type === 'categorical') {
      console.log(`Number of Samples: ${props.num_samples}`);
      console.log(`Number of Classes: ${props.num_classes}`);
      console.log(`Class Names: ${props.class_names.join(', ')}`);
      console.log(`Phenotype Labels: [${props.phenotype_labels.join(', ')}]`);
    } else {
      console.log(`Note: ${props.note || 'Continuous format detected.'}`);
    }
  }
}
// Command-line entry point: when this file is run directly, parse the file
// named by the first argument and print its properties.
if (require.main === module) {
  const [target] = process.argv.slice(2);
  if (target === undefined) {
    console.log('Usage: node clsfile.js <filename>');
    process.exit(1);
  }
  const parsed = new CLSFile(target);
  parsed.read();
  parsed.printProperties();
}

// Expose the class to code that loads this file with require().
module.exports = CLSFile;
7. C Implementation for .CLS File Handling
C lacks classes, so this is implemented as a struct with functions for opening, reading, writing, and printing. It uses standard I/O (stdio.h). Compile with gcc clsfile.c -o clsfile and run ./clsfile <filename>. Assumes categorical format.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_LINE 1024
#define MAX_CLASSES 100
#define MAX_SAMPLES 10000
/* Contents of a parsed .cls file, held in fixed-size buffers. */
typedef struct {
char filename[256]; /* path copied in by parse_file */
char file_type[32]; /* "categorical", "continuous or invalid", or "" after init_properties */
int num_samples; /* first number on the header line */
int num_classes; /* second number on the header line */
char class_names[MAX_CLASSES][256]; /* tokens from the "# ..." line, each NUL-terminated */
int num_class_names; /* number of entries filled in class_names */
int phenotype_labels[MAX_SAMPLES]; /* integer labels read from the third line */
int num_labels; /* number of entries filled in phenotype_labels */
} CLSProperties;
/* Put *props into its empty state: both strings empty, every count zero. */
void init_properties(CLSProperties *props) {
    props->filename[0] = '\0';
    props->file_type[0] = '\0';

    props->num_samples = 0;
    props->num_classes = 0;
    props->num_class_names = 0;
    props->num_labels = 0;
}
int parse_file(const char *filename, CLSProperties *props) {
FILE *file = fopen(filename, "r");
if (!file) {
perror("File not found");
return 0;
}
char line[MAX_LINE];
char *lines[3];
int line_count = 0;
// Read first three lines
while (line_count < 3 && fgets(line, sizeof(line), file)) {
line[strcspn(line, "\n")] = 0; // Remove newline
if (strlen(line) > 0) {
lines[line_count] = strdup(line);
line_count++;
}
}
fclose(file);
if (line_count < 3) {
fprintf(stderr, "Invalid CLS file: insufficient lines.\n");
return 0;
}
// Parse first line
int parts[3];
if (sscanf(lines[0], "%d %d %d", &parts[0], &parts[1], &parts[2]) != 3 || parts[2] != 1) {
strcpy(props->file_type, "continuous or invalid");
strcpy(props->filename, filename);
return 1; // Partial success
}
strcpy(props->file_type, "categorical");
props->num_samples = parts[0];
props->num_classes = parts[1];
// Parse second line
if (strncmp(lines[1], "# ", 2) != 0) {
fprintf(stderr, "Invalid second line.\n");
return 0;
}
char *name_start = lines[1] + 2;
props->num_class_names = 0;
char *token = strtok(name_start, " \t");
while (token && props->num_class_names < MAX_CLASSES) {
strncpy(props->class_names[props->num_class_names], token, 255);
props->class_names[props->num_class_names][255] = 0;
props->num_class_names++;
token = strtok(NULL, " \t");
}
// Parse third line
props->num_labels = 0;
char *label_start = lines[1]; // Reuse for simplicity; in full impl, read properly
token = strtok(lines[2], " \t");
while (token && props->num_labels < MAX_SAMPLES) {
props->phenotype_labels[props->num_labels] = atoi(token);
props->num_labels++;
token = strtok(NULL, " \t");
}
if (props->num_labels != props->num_samples || props->num_class_names != props->num_classes) {
fprintf(stderr, "Mismatch in counts.\n");
return 0;
}
strcpy(props->filename, filename);
return 1;
}
/*
 * Print the parsed properties to stdout. Categorical data gets the counts,
 * the class names and the labels; anything else gets a one-line note.
 */
void print_properties(const CLSProperties *props) {
    int idx;

    printf("File Type: %s\n", props->file_type);

    if (strcmp(props->file_type, "categorical") != 0) {
        printf("Note: Continuous format detected.\n");
        return;
    }

    printf("Number of Samples: %d\n", props->num_samples);
    printf("Number of Classes: %d\n", props->num_classes);

    /* Comma-separated class names: the separator goes before every item but the first. */
    printf("Class Names: ");
    for (idx = 0; idx < props->num_class_names; idx++) {
        if (idx > 0) {
            printf(", ");
        }
        printf("%s", props->class_names[idx]);
    }
    printf("\n");

    printf("Phenotype Labels: [");
    for (idx = 0; idx < props->num_labels; idx++) {
        if (idx > 0) {
            printf(", ");
        }
        printf("%d", props->phenotype_labels[idx]);
    }
    printf("]\n");
}
/*
 * Write categorical properties to `filename` in .cls format: the header
 * line, the "# ..." class-name line, then the space-separated labels.
 *
 * Returns 1 on success. Returns 0, after printing a message to stderr, when
 * the properties are not categorical, the file cannot be opened, or the
 * data could not be written completely.
 */
int write_file(const char *filename, const CLSProperties *props) {
    if (strcmp(props->file_type, "categorical") != 0) {
        fprintf(stderr, "Write supports categorical only.\n");
        return 0;
    }
    FILE *file = fopen(filename, "w");
    if (!file) {
        perror("Write failed");
        return 0;
    }
    fprintf(file, "%d %d 1\n", props->num_samples, props->num_classes);
    fprintf(file, "# ");
    for (int i = 0; i < props->num_class_names; i++) {
        fprintf(file, "%s", props->class_names[i]);
        if (i < props->num_class_names - 1) fprintf(file, " ");
    }
    fprintf(file, "\n");
    for (int i = 0; i < props->num_labels; i++) {
        fprintf(file, "%d", props->phenotype_labels[i]);
        if (i < props->num_labels - 1) fprintf(file, " ");
    }
    fprintf(file, "\n");

    /* Output is buffered, so a failure (e.g. a full disk) may only show up
     * in the stream's error flag or when fclose flushes the buffer. */
    int failed = ferror(file);
    if (fclose(file) != 0) {
        failed = 1;
    }
    if (failed) {
        fprintf(stderr, "Write failed: could not write %s\n", filename);
        return 0;
    }
    return 1;
}
/*
 * Command-line driver: parse the file named by the single argument, print
 * its properties, then write them back out to "output.cls".
 */
int main(int argc, char *argv[]) {
    CLSProperties cls;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
        return 1;
    }

    init_properties(&cls);
    if (parse_file(argv[1], &cls) == 0) {
        fprintf(stderr, "Parse failed.\n");
        return 1;
    }

    print_properties(&cls);

    /* Example write; its result does not affect the exit status. */
    (void)write_file("output.cls", &cls);
    return 0;
}
This implementation includes basic error handling and assumes reasonable limits on samples/classes. For production, expand buffer sizes and memory management.