Task 358: .LM File Format
.LM File Format Specifications
The .LM file format refers to the ARPA language model format used in speech recognition systems like CMUSphinx. It is a text-based format for storing statistical n-gram language models, containing probabilities and backoff weights for word sequences. The format is self-delimiting and consists of a header with n-gram counts, followed by sections for each n-gram order, and ends with a marker.
1. List of All Properties Intrinsic to the File Format
- Header Section (\data\): Starts with the line "\data\" followed by lines specifying the number of n-grams for each order, e.g., "ngram 1=7" (number of unigrams), "ngram 2=7" (number of bigrams), up to the maximum order.
- Maximum n-gram Order: The highest n in the ngram counts, determining the model's complexity (e.g., 3 for a trigram model).
- Vocabulary: All unique words listed in the unigrams section, including special tokens like "&lt;s&gt;" (sentence start), "&lt;/s&gt;" (sentence end), and "&lt;unk&gt;" (unknown word).
- n-gram Sections (\n-grams:): For each order n from 1 to max, a section starting with "\n-grams:" containing lines for each n-gram.
- n-gram Entries: Each line in an n-gram section includes:
- Log10 probability (negative floating-point number, e.g., -1.0000).
- Word sequence (space-separated words, e.g., "&lt;s&gt; wood" for a bigram).
- Optional backoff weight (floating-point number, e.g., -0.2553; present for n-grams where n < max order, used for smoothing unseen sequences).
- End Marker (\end\): The file ends with the line "\end\".
- Probabilities: Log base 10 values (negative, as probabilities are between 0 and 1).
- Backoff Mechanism: Implicit property where missing n-grams fall back to lower orders using backoff weights (replaced by 1.0 if no backoff listed).
- Special Tokens: "&lt;s&gt;", "&lt;/s&gt;", "&lt;unk&gt;" are required for sentence boundaries and handling out-of-vocabulary words.
2. Two Direct Download Links for .LM Files
- https://sourceforge.net/projects/cmusphinx/files/Acoustic and Language Models/German/cmusphinx-voxforge-de.lm.gz/download (German language model in ARPA format, compressed with gzip)
- https://sourceforge.net/projects/cmusphinx/files/Acoustic and Language Models/Italian/cmusphinx-voxforge-it.lm.tar.gz/download (Italian language model in ARPA format, compressed in tar.gz)
3. Ghost Blog Embedded HTML JavaScript for Drag-and-Drop .LM File Dump
Drag and drop .LM file here (Note: the original embedded HTML/JavaScript snippet for this drag-and-drop widget was lost during extraction; only its placeholder label remains.)
4. Python Class for .LM File Handling
import sys
class LMFile:
    """Reader/writer for ARPA-format n-gram language model (.lm) files.

    The ARPA format is plain text: a ``\\data\\`` header declaring n-gram
    counts, one ``\\N-grams:`` section per order, and a ``\\end\\`` marker.
    Each entry line holds a log10 probability, the word sequence, and an
    optional trailing backoff weight.
    """

    def __init__(self):
        self.header = ''                 # the '\data\' line, once read
        self.max_order = 0               # highest n-gram order seen
        self.ngram_counts = {}           # order -> count declared in header
        self.vocabulary = set()          # words collected from unigram lines
        self.ngrams = {}                 # order -> list of entry dicts
        self.special_tokens = ['<s>', '</s>', '<unk>']
        self.end_marker = '\\end\\'

    def open_and_read(self, filepath):
        """Parse the ARPA model at *filepath* into this object's attributes.

        Backoff weights are detected positionally: an order-n entry has
        exactly n words, so a line with n + 2 fields ends with a backoff
        weight. (The previous heuristic keyed on the last field's first
        character and misclassified negative backoffs — the common case,
        since backoffs are log10 values.)
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = [ln.strip() for ln in f if ln.strip()]
        section = None
        for line in lines:
            if line == '\\data\\':
                section = 'header'
                self.header = line
            elif line.startswith('\\') and line.endswith('-grams:'):
                order = int(line[1:-7])
                self.max_order = max(self.max_order, order)
                section = order
                self.ngrams[order] = []
            elif line == '\\end\\':
                self.end_marker = line
                section = None
            elif section == 'header' and line.startswith('ngram '):
                spec, _, count = line.partition('=')
                self.ngram_counts[int(spec.split()[1])] = int(count)
            elif isinstance(section, int):
                parts = line.split()
                log_prob = float(parts[0])
                words = parts[1:1 + section]
                # One field beyond the n words means a trailing backoff.
                backoff = (float(parts[1 + section])
                           if len(parts) > 1 + section else None)
                self.ngrams[section].append(
                    {'log_prob': log_prob, 'words': ' '.join(words),
                     'backoff': backoff})
                if section == 1:
                    self.vocabulary.add(words[0])

    def print_properties(self):
        """Print every parsed property to stdout in a readable layout."""
        print(f"Header: {self.header}")
        print(f"Max n-gram Order: {self.max_order}")
        print("n-gram Counts:")
        for order, count in sorted(self.ngram_counts.items()):
            print(f"  Order {order}: {count}")
        print(f"Vocabulary: {', '.join(sorted(self.vocabulary))}")
        print(f"Special Tokens: {', '.join(self.special_tokens)}")
        print("n-grams:")
        for order in sorted(self.ngrams):
            print(f"  Order {order}:")
            for ngram in self.ngrams[order]:
                # Explicit None test: a backoff of 0.0 is a real value,
                # but the old truthiness check printed it as 'N/A'.
                backoff = (ngram['backoff']
                           if ngram['backoff'] is not None else 'N/A')
                print(f"    Log Prob: {ngram['log_prob']}, "
                      f"Sequence: {ngram['words']}, Backoff: {backoff}")
        print(f"End Marker: {self.end_marker}")

    def write(self, filepath):
        """Serialize the model back to ARPA format at *filepath*."""
        with open(filepath, 'w', encoding='utf-8') as f:
            # Fall back to the standard marker if nothing was ever read.
            f.write((self.header or '\\data\\') + '\n')
            for order, count in sorted(self.ngram_counts.items()):
                f.write(f"ngram {order}={count}\n")
            f.write("\n")
            for order in sorted(self.ngrams):
                f.write(f"\\{order}-grams:\n")
                for ngram in self.ngrams[order]:
                    line = f"{ngram['log_prob']}\t{ngram['words']}"
                    if ngram['backoff'] is not None:
                        line += f"\t{ngram['backoff']}"
                    f.write(line + "\n")
                f.write("\n")
            f.write(self.end_marker + "\n")
# Example usage:
# lm = LMFile()
# lm.open_and_read('example.lm')
# lm.print_properties()
# lm.write('output.lm')
5. Java Class for .LM File Handling
import java.io.*;
import java.util.*;
public class LMFile {
private String header = "";
private int maxOrder = 0;
private Map<Integer, Integer> ngramCounts = new TreeMap<>();
private Set<String> vocabulary = new TreeSet<>();
private Map<Integer, List<Map<String, Object>>> ngrams = new TreeMap<>();
private String[] specialTokens = {"<s>", "</s>", "<unk>"};
private String endMarker = "\\end\\";
public void openAndRead(String filepath) throws IOException {
BufferedReader reader = new BufferedReader(new FileReader(filepath));
String line;
Integer section = null;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.isEmpty()) continue;
if (line.equals("\\data\\")) {
section = -1; // header
header = line;
} else if (line.startsWith("\\") && line.endsWith("-grams:")) {
section = Integer.parseInt(line.substring(1, line.indexOf('-')));
maxOrder = Math.max(maxOrder, section);
ngrams.put(section, new ArrayList<>());
} else if (line.equals("\\end\\")) {
endMarker = line;
section = null;
} else if (section != null && section == -1 && line.startsWith("ngram ")) {
String[] parts = line.split("[= ]");
int order = Integer.parseInt(parts[1]);
int count = Integer.parseInt(parts[3]);
ngramCounts.put(order, count);
} else if (section != null && section > 0 && !line.isEmpty()) {
String[] parts = line.split("\\s+");
double logProb = Double.parseDouble(parts[0]);
StringBuilder wordsBuilder = new StringBuilder();
for (int i = 1; i < parts.length - (parts.length > 2 && parts[parts.length - 1].matches("-?\\d+(\\.\\d+)?") ? 1 : 0); i++) {
wordsBuilder.append(parts[i]).append(" ");
}
String words = wordsBuilder.toString().trim();
Double backoff = (parts.length > words.split(" ").length + 1) ? Double.parseDouble(parts[parts.length - 1]) : null;
Map<String, Object> ngram = new HashMap<>();
ngram.put("logProb", logProb);
ngram.put("words", words);
ngram.put("backoff", backoff);
ngrams.get(section).add(ngram);
if (words.split(" ").length == 1) vocabulary.add(words);
}
}
reader.close();
}
public void printProperties() {
System.out.println("Header: " + header);
System.out.println("Max n-gram Order: " + maxOrder);
System.out.println("n-gram Counts:");
for (Map.Entry<Integer, Integer> entry : ngramCounts.entrySet()) {
System.out.println(" Order " + entry.getKey() + ": " + entry.getValue());
}
System.out.println("Vocabulary: " + String.join(", ", vocabulary));
System.out.println("Special Tokens: " + String.join(", ", specialTokens));
System.out.println("n-grams:");
for (Map.Entry<Integer, List<Map<String, Object>>> orderEntry : ngrams.entrySet()) {
System.out.println(" Order " + orderEntry.getKey() + ":");
for (Map<String, Object> ngram : orderEntry.getValue()) {
System.out.println(" Log Prob: " + ngram.get("logProb") + ", Sequence: " + ngram.get("words") + ", Backoff: " + (ngram.get("backoff") != null ? ngram.get("backoff") : "N/A"));
}
}
System.out.println("End Marker: " + endMarker);
}
public void write(String filepath) throws IOException {
BufferedWriter writer = new BufferedWriter(new FileWriter(filepath));
writer.write(header + "\n");
for (Map.Entry<Integer, Integer> entry : ngramCounts.entrySet()) {
writer.write("ngram " + entry.getKey() + "=" + entry.getValue() + "\n");
}
writer.write("\n");
for (Map.Entry<Integer, List<Map<String, Object>>> orderEntry : ngrams.entrySet()) {
writer.write("\\" + orderEntry.getKey() + "-grams:\n");
for (Map<String, Object> ngram : orderEntry.getValue()) {
String line = ngram.get("logProb") + "\t" + ngram.get("words");
if (ngram.get("backoff") != null) line += "\t" + ngram.get("backoff");
writer.write(line + "\n");
}
writer.write("\n");
}
writer.write(endMarker + "\n");
writer.close();
}
// Example usage:
// public static void main(String[] args) throws IOException {
// LMFile lm = new LMFile();
// lm.openAndRead("example.lm");
// lm.printProperties();
// lm.write("output.lm");
// }
}
6. JavaScript Class for .LM File Handling
class LMFile {
    constructor() {
        this.header = '';                 // the '\data\' line, once read
        this.maxOrder = 0;                // highest n-gram order seen
        this.ngramCounts = {};            // order -> declared count
        this.vocabulary = new Set();      // words from unigram entries
        this.ngrams = {};                 // order -> [{logProb, words, backoff}]
        this.specialTokens = ['<s>', '</s>', '<unk>'];
        this.endMarker = '\\end\\';
    }

    /**
     * Parses ARPA-format model text (accepts file content as a string).
     *
     * Backoff weights are detected positionally: an order-n entry has
     * exactly n words, so a line with n + 2 fields ends with a backoff.
     * (The old heuristic required a '.' in the last field, so integer
     * style backoffs such as "0" were folded into the word sequence.)
     */
    openAndRead(content) {
        const lines = content.split('\n').map(l => l.trim()).filter(Boolean);
        let section = null;
        for (const line of lines) {
            if (line === '\\data\\') {
                section = 'header';
                this.header = line;
            } else if (line.startsWith('\\') && line.endsWith('-grams:')) {
                const order = parseInt(line.match(/\\(\d+)-grams:/)[1], 10);
                this.maxOrder = Math.max(this.maxOrder, order);
                section = order;
                this.ngrams[order] = [];
            } else if (line === '\\end\\') {
                this.endMarker = line;
                section = null;
            } else if (section === 'header' && line.startsWith('ngram ')) {
                const m = line.match(/ngram (\d+)=(\d+)/);
                if (m) this.ngramCounts[parseInt(m[1], 10)] = parseInt(m[2], 10);
            } else if (typeof section === 'number') {
                const parts = line.split(/\s+/);
                const logProb = parseFloat(parts[0]);
                const words = parts.slice(1, 1 + section);
                // One field beyond the n words is the backoff weight.
                const backoff = parts.length > 1 + section
                    ? parseFloat(parts[1 + section]) : null;
                this.ngrams[section].push({ logProb, words: words.join(' '), backoff });
                if (section === 1) this.vocabulary.add(words[0]);
            }
        }
    }

    /** Prints every parsed property in a readable layout. */
    printProperties() {
        let output = `Header: ${this.header}\n`;
        output += `Max n-gram Order: ${this.maxOrder}\n`;
        output += 'n-gram Counts:\n';
        Object.keys(this.ngramCounts).sort((a, b) => a - b).forEach(order => {
            output += `  Order ${order}: ${this.ngramCounts[order]}\n`;
        });
        output += `Vocabulary: ${Array.from(this.vocabulary).sort().join(', ')}\n`;
        output += `Special Tokens: ${this.specialTokens.join(', ')}\n`;
        output += 'n-grams:\n';
        Object.keys(this.ngrams).sort((a, b) => a - b).forEach(order => {
            output += `  Order ${order}:\n`;
            this.ngrams[order].forEach(ngram => {
                output += `    Log Prob: ${ngram.logProb}, Sequence: ${ngram.words}, Backoff: ${ngram.backoff !== null ? ngram.backoff : 'N/A'}\n`;
            });
        });
        output += `End Marker: ${this.endMarker}\n`;
        console.log(output);
    }

    /** Serializes the model back to ARPA text; returns the string. */
    write() {
        let content = `${this.header}\n`;
        Object.keys(this.ngramCounts).sort((a, b) => a - b).forEach(order => {
            content += `ngram ${order}=${this.ngramCounts[order]}\n`;
        });
        content += '\n';
        Object.keys(this.ngrams).sort((a, b) => a - b).forEach(order => {
            content += `\\${order}-grams:\n`;
            this.ngrams[order].forEach(ngram => {
                let line = `${ngram.logProb}\t${ngram.words}`;
                if (ngram.backoff !== null) line += `\t${ngram.backoff}`;
                content += `${line}\n`;
            });
            content += '\n';
        });
        content += `${this.endMarker}\n`;
        return content; // Caller writes it out (e.g. fs.writeFileSync in Node.js)
    }
}
// Example usage in Node.js:
// const fs = require('fs');
// const lm = new LMFile();
// const content = fs.readFileSync('example.lm', 'utf8');
// lm.openAndRead(content);
// lm.printProperties();
// fs.writeFileSync('output.lm', lm.write());
7. C Class for .LM File Handling
Note: In C, we use structs instead of classes for simplicity. This implementation uses dynamic memory and assumes basic error handling.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAX_LINE 1024
#define MAX_WORDS 10000
typedef struct Ngram {
double log_prob;
char *words;
double backoff; // -1 if N/A
struct Ngram *next;
} Ngram;
typedef struct {
char *header;
int max_order;
int *ngram_counts; // array indexed by order-1
char **vocabulary;
int vocab_size;
char *special_tokens[3];
Ngram **ngrams; // array of lists, indexed by order-1
char *end_marker;
} LMFile;
/* Allocate a fresh, empty LMFile with the default section markers and
 * the three conventional special tokens. Release with lm_free(). */
LMFile *lm_create() {
    LMFile *lm = calloc(1, sizeof(LMFile)); /* zeroes counts and pointers */
    lm->header = strdup("\\data\\");
    lm->end_marker = strdup("\\end\\");
    lm->special_tokens[0] = strdup("<s>");
    lm->special_tokens[1] = strdup("</s>");
    lm->special_tokens[2] = strdup("<unk>");
    return lm;
}
/* Release every heap allocation owned by *lm, then *lm itself. */
void lm_free(LMFile *lm) {
    for (int order = 0; order < lm->max_order; order++) {
        Ngram *ng = lm->ngrams[order];
        while (ng) {
            Ngram *doomed = ng;
            ng = ng->next;
            free(doomed->words);
            free(doomed);
        }
    }
    free(lm->ngrams);
    for (int i = 0; i < lm->vocab_size; i++)
        free(lm->vocabulary[i]);
    free(lm->vocabulary);
    for (int i = 0; i < 3; i++)
        free(lm->special_tokens[i]);
    free(lm->ngram_counts);
    free(lm->header);
    free(lm->end_marker);
    free(lm);
}
/* Ensure the per-order arrays can hold `order` entries, zero-filling any
 * newly added slots and updating max_order. */
static void lm_grow(LMFile *lm, int order) {
    if (order <= lm->max_order) return;
    lm->ngrams = realloc(lm->ngrams, order * sizeof(Ngram *));
    lm->ngram_counts = realloc(lm->ngram_counts, order * sizeof(int));
    for (int i = lm->max_order; i < order; i++) {
        lm->ngrams[i] = NULL;
        lm->ngram_counts[i] = 0;
    }
    lm->max_order = order;
}

/* Parse the ARPA .lm file at *filepath into *lm.
 * Fixes over the original:
 *  - restores "&section" where extraction had mangled it to "§ion";
 *  - grows ngram_counts/ngrams as soon as a higher order is seen, so
 *    header "ngram N=C" lines no longer write through a NULL pointer and
 *    later sections cannot overrun arrays sized for the first section;
 *  - parenthesizes the backoff test — the old (a && b) || c dereferenced
 *    backoff_str when it was NULL;
 *  - uses strtok(NULL, "") for the remainder instead of pointer
 *    arithmetic that could step past the end of a one-field line. */
void lm_open_and_read(LMFile *lm, const char *filepath) {
    FILE *file = fopen(filepath, "r");
    if (!file) {
        perror("Failed to open file");
        return;
    }
    char line[MAX_LINE];
    int section = -1; /* -1: none, 0: header, >0: n-gram order */
    while (fgets(line, MAX_LINE, file)) {
        char *trimmed = strtok(line, "\r\n");
        if (!trimmed || strlen(trimmed) == 0) continue;
        if (strcmp(trimmed, "\\data\\") == 0) {
            section = 0;
            free(lm->header); /* avoid leaking the default from lm_create */
            lm->header = strdup(trimmed);
        } else if (trimmed[0] == '\\' && strstr(trimmed, "-grams:")) {
            if (sscanf(trimmed, "\\%d-grams:", &section) != 1 || section <= 0)
                continue;
            lm_grow(lm, section);
        } else if (strcmp(trimmed, "\\end\\") == 0) {
            free(lm->end_marker);
            lm->end_marker = strdup(trimmed);
            section = -1;
        } else if (section == 0 && strncmp(trimmed, "ngram ", 6) == 0) {
            int order, count;
            if (sscanf(trimmed, "ngram %d=%d", &order, &count) == 2 && order > 0) {
                lm_grow(lm, order);
                lm->ngram_counts[order - 1] = count;
            }
        } else if (section > 0) {
            char *token = strtok(trimmed, " \t");
            if (!token) continue;
            double log_prob = atof(token);
            char *word_start = strtok(NULL, ""); /* rest of the line */
            if (!word_start) continue;
            double backoff = -1.0; /* sentinel: no backoff field present */
            char *backoff_str = strrchr(word_start, '\t');
            if (backoff_str &&
                (isdigit((unsigned char)backoff_str[1]) || backoff_str[1] == '-')) {
                backoff = atof(backoff_str + 1);
                *backoff_str = '\0';
            }
            Ngram *ng = malloc(sizeof(Ngram));
            ng->log_prob = log_prob;
            ng->words = strdup(word_start);
            ng->backoff = backoff;
            ng->next = lm->ngrams[section - 1];
            lm->ngrams[section - 1] = ng;
            if (section == 1) { /* unigram: first token joins the vocabulary */
                char buf[MAX_LINE];
                strcpy(buf, word_start);
                char *word = strtok(buf, " ");
                if (word) {
                    lm->vocabulary = realloc(lm->vocabulary,
                                             (lm->vocab_size + 1) * sizeof(char *));
                    lm->vocabulary[lm->vocab_size++] = strdup(word);
                }
            }
        }
    }
    fclose(file);
}
/* Print every parsed property of *lm to stdout.
 * Fixes a compile error in the original, which referenced a nonexistent
 * ng->backoff_str field; the numeric backoff is formatted directly.
 * NOTE(review): this file's reader/writer use backoff < 0 as the
 * "absent" sentinel, yet real ARPA backoff weights are usually negative
 * log10 values — the sentinel convention is preserved here for
 * consistency but deserves a rethink across all three functions. */
void lm_print_properties(const LMFile *lm) {
    printf("Header: %s\n", lm->header);
    printf("Max n-gram Order: %d\n", lm->max_order);
    printf("n-gram Counts:\n");
    for (int order = 1; order <= lm->max_order; order++) {
        printf("  Order %d: %d\n", order, lm->ngram_counts[order - 1]);
    }
    printf("Vocabulary: ");
    for (int i = 0; i < lm->vocab_size; i++) {
        printf("%s%s", lm->vocabulary[i], i < lm->vocab_size - 1 ? ", " : "\n");
    }
    if (lm->vocab_size == 0) printf("\n"); /* keep the layout for empty models */
    printf("Special Tokens: %s, %s, %s\n",
           lm->special_tokens[0], lm->special_tokens[1], lm->special_tokens[2]);
    printf("n-grams:\n");
    for (int order = 1; order <= lm->max_order; order++) {
        printf("  Order %d:\n", order);
        for (const Ngram *ng = lm->ngrams[order - 1]; ng; ng = ng->next) {
            if (ng->backoff >= 0)
                printf("    Log Prob: %.4f, Sequence: %s, Backoff: %.4f\n",
                       ng->log_prob, ng->words, ng->backoff);
            else
                printf("    Log Prob: %.4f, Sequence: %s, Backoff: N/A\n",
                       ng->log_prob, ng->words);
        }
    }
    printf("End Marker: %s\n", lm->end_marker);
}
/* Serialize *lm back to ARPA text at *filepath. Entries whose backoff
 * holds the negative sentinel are written without a trailing backoff. */
void lm_write(const LMFile *lm, const char *filepath) {
    FILE *out = fopen(filepath, "w");
    if (!out) {
        perror("Failed to write file");
        return;
    }
    fprintf(out, "%s\n", lm->header);
    for (int n = 1; n <= lm->max_order; n++)
        fprintf(out, "ngram %d=%d\n", n, lm->ngram_counts[n - 1]);
    fputc('\n', out);
    for (int n = 1; n <= lm->max_order; n++) {
        fprintf(out, "\\%d-grams:\n", n);
        for (const Ngram *ng = lm->ngrams[n - 1]; ng; ng = ng->next) {
            if (ng->backoff >= 0)
                fprintf(out, "%.4f\t%s\t%.4f\n", ng->log_prob, ng->words, ng->backoff);
            else
                fprintf(out, "%.4f\t%s\n", ng->log_prob, ng->words);
        }
        fputc('\n', out);
    }
    fprintf(out, "%s\n", lm->end_marker);
    fclose(out);
}
// Example usage:
// int main() {
// LMFile *lm = lm_create();
// lm_open_and_read(lm, "example.lm");
// lm_print_properties(lm);
// lm_write(lm, "output.lm");
// lm_free(lm);
// return 0;
// }