Task 358: .LM File Format

.LM File Format Specifications

The .LM file format refers to the ARPA language model format used in speech recognition systems like CMUSphinx. It is a text-based format for storing statistical n-gram language models, containing probabilities and backoff weights for word sequences. The format is self-delimiting and consists of a header with n-gram counts, followed by sections for each n-gram order, and ends with a marker.

1. List of All Properties Intrinsic to the File Format

  • Header Section (\data\): Starts with the line "\data\" followed by lines specifying the number of n-grams for each order, e.g., "ngram 1=7" (number of unigrams), "ngram 2=7" (number of bigrams), up to the maximum order.
  • Maximum n-gram Order: The highest n in the ngram counts, determining the model's complexity (e.g., 3 for a trigram model).
  • Vocabulary: All unique words listed in the unigrams section, including special tokens like "&lt;s&gt;" (sentence start), "&lt;/s&gt;" (sentence end), and "&lt;unk&gt;" (unknown word).
  • n-gram Sections (\N-grams:): For each order N from 1 to the maximum, a section starting with the line "\N-grams:" (e.g., "\1-grams:") containing one line per n-gram.
  • n-gram Entries: Each line in an n-gram section includes:
  • Log10 probability (negative floating-point number, e.g., -1.0000).
  • Word sequence (space-separated words, e.g., "&lt;s&gt; wood" for a bigram).
  • Optional backoff weight (floating-point number, e.g., -0.2553; present for n-grams where n < max order, used for smoothing unseen sequences).
  • End Marker (\end\): The file ends with the line "\end\".
  • Probabilities: Log base 10 values (negative, as probabilities are between 0 and 1).
  • Backoff Mechanism: Implicit property where missing n-grams fall back to lower orders using backoff weights (replaced by 1.0 if no backoff listed).
  • Special Tokens: "&lt;s&gt;", "&lt;/s&gt;", "&lt;unk&gt;" are required for sentence boundaries and handling out-of-vocabulary words.

3. Ghost Blog Embedded HTML JavaScript for Drag-and-Drop .LM File Dump

.LM File Properties Dumper
Drag and drop .LM file here

    

4. Python Class for .LM File Handling

import sys

class LMFile:
    """Reader/writer for ARPA-format (.lm) n-gram language model files.

    The ARPA format is plain text: a "\\data\\" header declaring the number of
    n-grams per order, one "\\N-grams:" section per order (each line holding a
    log10 probability, N words, and an optional backoff weight), and a final
    "\\end\\" marker.
    """

    def __init__(self):
        self.header = ''                 # the literal "\data\" line once read
        self.max_order = 0               # highest n-gram order seen
        self.ngram_counts = {}           # order -> declared count from the header
        self.vocabulary = set()          # words collected from the unigram section
        self.ngrams = {}                 # order -> list of entry dicts
        self.special_tokens = ['<s>', '</s>', '<unk>']
        self.end_marker = '\\end\\'

    def open_and_read(self, filepath):
        """Parse the ARPA file at *filepath*, populating this object's fields."""
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        section = None  # None, 'header', or an int n-gram order
        for line in lines:
            if line == '\\data\\':
                section = 'header'
                self.header = line
            elif line.startswith('\\') and line.endswith('-grams:'):
                order = int(line[1:-len('-grams:')])
                self.max_order = max(self.max_order, order)
                section = order
                self.ngrams[section] = []
            elif line == '\\end\\':
                self.end_marker = line
                section = None
            elif section == 'header' and line.startswith('ngram '):
                spec, _, count = line.partition('=')
                self.ngram_counts[int(spec.split()[1])] = int(count)
            elif isinstance(section, int):
                parts = line.split()
                log_prob = float(parts[0])
                # An order-n entry has exactly n words after the probability;
                # any extra trailing field is the optional backoff weight.
                # Slicing by the known order avoids the old prefix heuristic,
                # which misread negative backoffs as words (and vice versa).
                words = parts[1:1 + section]
                backoff = float(parts[1 + section]) if len(parts) > 1 + section else None
                self.ngrams[section].append(
                    {'log_prob': log_prob, 'words': ' '.join(words), 'backoff': backoff})
                if section == 1:
                    self.vocabulary.add(words[0])

    def print_properties(self):
        """Print a human-readable summary of every parsed property."""
        print(f"Header: {self.header}")
        print(f"Max n-gram Order: {self.max_order}")
        print("n-gram Counts:")
        for order, count in sorted(self.ngram_counts.items()):
            print(f"  Order {order}: {count}")
        print(f"Vocabulary: {', '.join(sorted(self.vocabulary))}")
        print(f"Special Tokens: {', '.join(self.special_tokens)}")
        print("n-grams:")
        for order in sorted(self.ngrams):
            print(f"  Order {order}:")
            for ngram in self.ngrams[order]:
                # 0.0 is a legitimate backoff weight, so test against None
                # explicitly instead of relying on truthiness.
                backoff = 'N/A' if ngram['backoff'] is None else ngram['backoff']
                print(f"    Log Prob: {ngram['log_prob']}, Sequence: {ngram['words']}, Backoff: {backoff}")
        print(f"End Marker: {self.end_marker}")

    def write(self, filepath):
        """Write the model back out in ARPA format to *filepath*."""
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(f"{self.header}\n")
            for order, count in sorted(self.ngram_counts.items()):
                f.write(f"ngram {order}={count}\n")
            f.write("\n")
            for order in sorted(self.ngrams):
                f.write(f"\\{order}-grams:\n")
                for ngram in self.ngrams[order]:
                    line = f"{ngram['log_prob']}\t{ngram['words']}"
                    if ngram['backoff'] is not None:
                        line += f"\t{ngram['backoff']}"
                    f.write(f"{line}\n")
                # Blank line between sections, matching conventional ARPA
                # layout (and the Java/JS implementations in this file).
                f.write("\n")
            f.write("\\end\\\n")

# Example usage:
# lm = LMFile()
# lm.open_and_read('example.lm')
# lm.print_properties()
# lm.write('output.lm')

5. Java Class for .LM File Handling

import java.io.*;
import java.util.*;

public class LMFile {
    // In-memory representation of an ARPA-format .lm n-gram language model:
    // a "\data\" header with per-order counts, one "\N-grams:" section per
    // order, and a trailing "\end\" marker.
    private String header = "";
    private int maxOrder = 0;
    private Map<Integer, Integer> ngramCounts = new TreeMap<>();   // order -> declared count
    private Set<String> vocabulary = new TreeSet<>();              // words from the unigram section
    private Map<Integer, List<Map<String, Object>>> ngrams = new TreeMap<>();
    private String[] specialTokens = {"<s>", "</s>", "<unk>"};
    private String endMarker = "\\end\\";

    /**
     * Parses the ARPA file at {@code filepath}, populating this object's fields.
     *
     * @param filepath path to the .lm file to read
     * @throws IOException if the file cannot be read
     */
    public void openAndRead(String filepath) throws IOException {
        // try-with-resources guarantees the reader is closed even if parsing throws.
        try (BufferedReader reader = new BufferedReader(new FileReader(filepath))) {
            String line;
            Integer section = null; // null: none, -1: header, >0: n-gram order
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty()) continue;
                if (line.equals("\\data\\")) {
                    section = -1; // header
                    header = line;
                } else if (line.startsWith("\\") && line.endsWith("-grams:")) {
                    section = Integer.parseInt(line.substring(1, line.indexOf('-')));
                    maxOrder = Math.max(maxOrder, section);
                    ngrams.put(section, new ArrayList<>());
                } else if (line.equals("\\end\\")) {
                    endMarker = line;
                    section = null;
                } else if (section != null && section == -1 && line.startsWith("ngram ")) {
                    // "ngram N=count" splits on '=' / spaces into {"ngram", N, count}.
                    // The previous code read parts[3], which is out of bounds.
                    String[] parts = line.split("[= ]+");
                    ngramCounts.put(Integer.parseInt(parts[1]), Integer.parseInt(parts[2]));
                } else if (section != null && section > 0) {
                    String[] parts = line.split("\\s+");
                    double logProb = Double.parseDouble(parts[0]);
                    // An order-n entry has exactly n words after the probability;
                    // any extra trailing token is the optional backoff weight.
                    // Using the known order avoids mistaking numeric-looking
                    // words for backoff weights.
                    int order = section;
                    StringBuilder wordsBuilder = new StringBuilder();
                    for (int i = 1; i <= order && i < parts.length; i++) {
                        if (i > 1) wordsBuilder.append(' ');
                        wordsBuilder.append(parts[i]);
                    }
                    String words = wordsBuilder.toString();
                    Double backoff = parts.length > order + 1
                            ? Double.valueOf(parts[order + 1]) : null;
                    Map<String, Object> ngram = new HashMap<>();
                    ngram.put("logProb", logProb);
                    ngram.put("words", words);
                    ngram.put("backoff", backoff);
                    ngrams.get(section).add(ngram);
                    if (order == 1) vocabulary.add(words);
                }
            }
        }
    }

    /** Prints every parsed property of the model to standard output. */
    public void printProperties() {
        System.out.println("Header: " + header);
        System.out.println("Max n-gram Order: " + maxOrder);
        System.out.println("n-gram Counts:");
        for (Map.Entry<Integer, Integer> entry : ngramCounts.entrySet()) {
            System.out.println("  Order " + entry.getKey() + ": " + entry.getValue());
        }
        System.out.println("Vocabulary: " + String.join(", ", vocabulary));
        System.out.println("Special Tokens: " + String.join(", ", specialTokens));
        System.out.println("n-grams:");
        for (Map.Entry<Integer, List<Map<String, Object>>> orderEntry : ngrams.entrySet()) {
            System.out.println("  Order " + orderEntry.getKey() + ":");
            for (Map<String, Object> ngram : orderEntry.getValue()) {
                System.out.println("    Log Prob: " + ngram.get("logProb")
                        + ", Sequence: " + ngram.get("words")
                        + ", Backoff: " + (ngram.get("backoff") != null ? ngram.get("backoff") : "N/A"));
            }
        }
        System.out.println("End Marker: " + endMarker);
    }

    /**
     * Writes the model back out in ARPA format.
     *
     * @param filepath destination path
     * @throws IOException if the file cannot be written
     */
    public void write(String filepath) throws IOException {
        // try-with-resources guarantees the writer is flushed and closed.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filepath))) {
            writer.write(header + "\n");
            for (Map.Entry<Integer, Integer> entry : ngramCounts.entrySet()) {
                writer.write("ngram " + entry.getKey() + "=" + entry.getValue() + "\n");
            }
            writer.write("\n");
            for (Map.Entry<Integer, List<Map<String, Object>>> orderEntry : ngrams.entrySet()) {
                writer.write("\\" + orderEntry.getKey() + "-grams:\n");
                for (Map<String, Object> ngram : orderEntry.getValue()) {
                    String line = ngram.get("logProb") + "\t" + ngram.get("words");
                    if (ngram.get("backoff") != null) line += "\t" + ngram.get("backoff");
                    writer.write(line + "\n");
                }
                writer.write("\n");
            }
            writer.write(endMarker + "\n");
        }
    }

    // Example usage:
    // public static void main(String[] args) throws IOException {
    //     LMFile lm = new LMFile();
    //     lm.openAndRead("example.lm");
    //     lm.printProperties();
    //     lm.write("output.lm");
    // }
}

6. JavaScript Class for .LM File Handling

class LMFile {
    constructor() {
        this.header = '';
        this.maxOrder = 0;
        this.ngramCounts = {};
        this.vocabulary = new Set();
        this.ngrams = {};
        this.specialTokens = ['<s>', '</s>', '<unk>'];
        this.endMarker = '\\end\\';
    }

    openAndRead(content) {  // Accepts file content as string
        const lines = content.split('\n').map(line => line.trim()).filter(line => line);
        let section = null;
        lines.forEach(line => {
            if (line === '\\data\\') {
                section = 'header';
                this.header = line;
            } else if (line.startsWith('\\') && line.endsWith('-grams:')) {
                const order = parseInt(line.match(/\\(\d+)-grams:/)[1]);
                this.maxOrder = Math.max(this.maxOrder, order);
                section = order;
                this.ngrams[section] = [];
            } else if (line === '\\end\\') {
                this.endMarker = line;
                section = null;
            } else if (section === 'header' && line.startsWith('ngram ')) {
                const [, order, count] = line.match(/ngram (\d+)=(\d+)/);
                this.ngramCounts[parseInt(order)] = parseInt(count);
            } else if (typeof section === 'number' && line) {
                const parts = line.split(/\s+/);
                const logProb = parseFloat(parts[0]);
                const backoffIndex = parts.length - 1;
                const hasBackoff = !isNaN(parseFloat(parts[backoffIndex])) && parts[backoffIndex].includes('.');
                const words = parts.slice(1, hasBackoff ? backoffIndex : parts.length);
                const backoff = hasBackoff ? parseFloat(parts[backoffIndex]) : null;
                this.ngrams[section].push({ logProb, words: words.join(' '), backoff });
                if (words.length === 1) this.vocabulary.add(words[0]);
            }
        });
    }

    printProperties() {
        let output = `Header: ${this.header}\n`;
        output += `Max n-gram Order: ${this.maxOrder}\n`;
        output += 'n-gram Counts:\n';
        Object.keys(this.ngramCounts).sort((a, b) => a - b).forEach(order => {
            output += `  Order ${order}: ${this.ngramCounts[order]}\n`;
        });
        output += `Vocabulary: ${Array.from(this.vocabulary).sort().join(', ')}\n`;
        output += `Special Tokens: ${this.specialTokens.join(', ')}\n`;
        output += 'n-grams:\n';
        Object.keys(this.ngrams).sort((a, b) => a - b).forEach(order => {
            output += `  Order ${order}:\n`;
            this.ngrams[order].forEach(ngram => {
                output += `    Log Prob: ${ngram.logProb}, Sequence: ${ngram.words}, Backoff: ${ngram.backoff !== null ? ngram.backoff : 'N/A'}\n`;
            });
        });
        output += `End Marker: ${this.endMarker}\n`;
        console.log(output);
    }

    write() {
        let content = `${this.header}\n`;
        Object.keys(this.ngramCounts).sort((a, b) => a - b).forEach(order => {
            content += `ngram ${order}=${this.ngramCounts[order]}\n`;
        });
        content += '\n';
        Object.keys(this.ngrams).sort((a, b) => a - b).forEach(order => {
            content += `\\${order}-grams:\n`;
            this.ngrams[order].forEach(ngram => {
                let line = `${ngram.logProb}\t${ngram.words}`;
                if (ngram.backoff !== null) line += `\t${ngram.backoff}`;
                content += `${line}\n`;
            });
            content += '\n';
        });
        content += `${this.endMarker}\n`;
        return content;  // Return string for writing to file (e.g., via fs in Node.js)
    }
}

// Example usage in Node.js:
// const fs = require('fs');
// const lm = new LMFile();
// const content = fs.readFileSync('example.lm', 'utf8');
// lm.openAndRead(content);
// lm.printProperties();
// fs.writeFileSync('output.lm', lm.write());

7. C Class for .LM File Handling

Note: In C, we use structs instead of classes for simplicity. This implementation uses dynamic memory and assumes basic error handling.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAX_LINE 1024    /* maximum length of a single input line */
#define MAX_WORDS 10000  /* nominal vocabulary bound (not enforced below) */

/* One n-gram entry: a log10 probability, its word sequence, and an optional
 * backoff weight.  Entries of the same order form a singly linked list. */
typedef struct Ngram {
    double log_prob;     /* log10 probability of the word sequence */
    char *words;         /* space-separated words, heap-allocated */
    double backoff;  // -1 if N/A -- NOTE(review): real ARPA backoffs are
                     // usually negative, so -1 cannot distinguish "absent"
                     // from an actual weight of -1.0; confirm this sentinel.
    struct Ngram *next;  /* next entry of the same order */
} Ngram;

/* In-memory representation of an ARPA-format .lm language model file. */
typedef struct {
    char *header;            /* the "\data\" header line */
    int max_order;           /* highest n-gram order present */
    int *ngram_counts;  // array indexed by order-1
    char **vocabulary;       /* unigram words, heap-allocated strings */
    int vocab_size;          /* number of entries in vocabulary */
    char *special_tokens[3]; /* <s>, </s>, <unk> */
    Ngram **ngrams;  // array of lists, indexed by order-1
    char *end_marker;        /* the "\end\" trailer line */
} LMFile;

/* Allocate and initialise an empty model: default header and end marker,
 * the three standard special tokens, and no n-gram data yet. */
LMFile *lm_create() {
    LMFile *model = malloc(sizeof(*model));
    model->header = strdup("\\data\\");
    model->end_marker = strdup("\\end\\");
    model->max_order = 0;
    model->vocab_size = 0;
    model->ngram_counts = NULL;
    model->vocabulary = NULL;
    model->ngrams = NULL;
    model->special_tokens[0] = strdup("<s>");
    model->special_tokens[1] = strdup("</s>");
    model->special_tokens[2] = strdup("<unk>");
    return model;
}

/* Release every allocation owned by the model, then the model itself.
 * Assumes pointer fields are either valid heap blocks or NULL (free(NULL)
 * is a no-op), as guaranteed by lm_create(). */
void lm_free(LMFile *lm) {
    free(lm->header);
    free(lm->ngram_counts);
    for (int i = 0; i < lm->vocab_size; i++) free(lm->vocabulary[i]);
    free(lm->vocabulary);
    for (int i = 0; i < 3; i++) free(lm->special_tokens[i]);
    /* Walk each per-order linked list, freeing entries and their word strings.
     * When max_order > 0, lm->ngrams is expected to be allocated. */
    for (int order = 0; order < lm->max_order; order++) {
        Ngram *ng = lm->ngrams[order];
        while (ng) {
            Ngram *next = ng->next;
            free(ng->words);
            free(ng);
            ng = next;
        }
    }
    free(lm->ngrams);
    free(lm->end_marker);
    free(lm);
}

/* Grow the per-order arrays (ngram_counts, ngrams) so they can hold entries
 * for orders up to `order`, zero-filling any newly added slots. */
static void lm_ensure_order(LMFile *lm, int order) {
    if (order <= lm->max_order) return;
    lm->ngram_counts = realloc(lm->ngram_counts, order * sizeof(int));
    lm->ngrams = realloc(lm->ngrams, order * sizeof(Ngram *));
    for (int i = lm->max_order; i < order; i++) {
        lm->ngram_counts[i] = 0;
        lm->ngrams[i] = NULL;
    }
    lm->max_order = order;
}

/* Parse the ARPA file at `filepath` into `lm`.
 *
 * Fixes over the previous version:
 *  - "ngram N=count" header lines arrive BEFORE any "\N-grams:" section, so
 *    the count/list arrays must be allocated here (the old code wrote through
 *    a NULL ngram_counts pointer);
 *  - the arrays are grown as higher orders appear instead of being sized once
 *    from the first section encountered;
 *  - the old backoff test `a && b || c` grouped as `(a && b) || c` and
 *    dereferenced a NULL strrchr() result;
 *  - entries are parsed by token count using the known order, so the format
 *    no longer has to be tab-separated and numeric-looking words are not
 *    mistaken for backoff weights;
 *  - the header/end-marker strings from lm_create() are freed before being
 *    replaced (previously leaked).
 */
void lm_open_and_read(LMFile *lm, const char *filepath) {
    FILE *file = fopen(filepath, "r");
    if (!file) {
        perror("Failed to open file");
        return;
    }
    char line[MAX_LINE];
    int section = -1;  /* -1: none, 0: header, >0: n-gram order */
    while (fgets(line, MAX_LINE, file)) {
        char *trimmed = strtok(line, "\r\n");
        if (!trimmed || strlen(trimmed) == 0) continue;
        if (strcmp(trimmed, "\\data\\") == 0) {
            section = 0;
            free(lm->header);
            lm->header = strdup(trimmed);
        } else if (trimmed[0] == '\\' && strstr(trimmed, "-grams:")) {
            if (sscanf(trimmed, "\\%d-grams:", &section) == 1 && section > 0)
                lm_ensure_order(lm, section);
        } else if (strcmp(trimmed, "\\end\\") == 0) {
            free(lm->end_marker);
            lm->end_marker = strdup(trimmed);
            section = -1;
        } else if (section == 0 && strncmp(trimmed, "ngram ", 6) == 0) {
            int order, count;
            if (sscanf(trimmed, "ngram %d=%d", &order, &count) == 2 && order > 0) {
                lm_ensure_order(lm, order);
                lm->ngram_counts[order - 1] = count;
            }
        } else if (section > 0) {
            /* Tokenise the entry line: probability, `section` words, then an
             * optional trailing backoff weight. */
            char *tokens[MAX_LINE / 2];
            int ntok = 0;
            for (char *t = strtok(trimmed, " \t"); t && ntok < MAX_LINE / 2; t = strtok(NULL, " \t"))
                tokens[ntok++] = t;
            if (ntok < 1 + section) continue;  /* malformed line: skip it */
            char words[MAX_LINE];
            words[0] = '\0';
            for (int i = 1; i <= section; i++) {
                if (i > 1) strcat(words, " ");
                strcat(words, tokens[i]);
            }
            Ngram *ng = malloc(sizeof(Ngram));
            ng->log_prob = atof(tokens[0]);
            ng->words = strdup(words);
            /* -1 is the "no backoff" sentinel used elsewhere in this file. */
            ng->backoff = (ntok > 1 + section) ? atof(tokens[1 + section]) : -1.0;
            /* Prepend, matching the original list order (entries reversed). */
            ng->next = lm->ngrams[section - 1];
            lm->ngrams[section - 1] = ng;
            if (section == 1) {
                lm->vocabulary = realloc(lm->vocabulary, (lm->vocab_size + 1) * sizeof(char *));
                lm->vocabulary[lm->vocab_size++] = strdup(tokens[1]);
            }
        }
    }
    fclose(file);
}

/* Print every parsed property of the model in a human-readable layout. */
void lm_print_properties(const LMFile *lm) {
    printf("Header: %s\n", lm->header);
    printf("Max n-gram Order: %d\n", lm->max_order);
    printf("n-gram Counts:\n");
    for (int order = 1; order <= lm->max_order; order++) {
        printf("  Order %d: %d\n", order, lm->ngram_counts[order - 1]);
    }
    printf("Vocabulary: ");
    for (int i = 0; i < lm->vocab_size; i++) {
        printf("%s%s", lm->vocabulary[i], i < lm->vocab_size - 1 ? ", " : "\n");
    }
    if (lm->vocab_size == 0) printf("\n");  /* keep layout when vocabulary is empty */
    printf("Special Tokens: %s, %s, %s\n", lm->special_tokens[0], lm->special_tokens[1], lm->special_tokens[2]);
    printf("n-grams:\n");
    for (int order = 1; order <= lm->max_order; order++) {
        printf("  Order %d:\n", order);
        for (const Ngram *ng = lm->ngrams[order - 1]; ng; ng = ng->next) {
            /* Fix: the previous code referenced a nonexistent `backoff_str`
             * member (compile error); format the numeric backoff directly.
             * NOTE(review): -1 doubles as the "absent" sentinel and real ARPA
             * backoffs are usually negative, so present-but-negative weights
             * also print as N/A -- confirm the sentinel choice. */
            if (ng->backoff >= 0)
                printf("    Log Prob: %.4f, Sequence: %s, Backoff: %.4f\n",
                       ng->log_prob, ng->words, ng->backoff);
            else
                printf("    Log Prob: %.4f, Sequence: %s, Backoff: N/A\n",
                       ng->log_prob, ng->words);
        }
    }
    printf("End Marker: %s\n", lm->end_marker);
}

/* Write the model back out in ARPA format: header with counts, one section
 * per order (tab-separated probability, words, optional backoff), and the
 * end marker. */
void lm_write(const LMFile *lm, const char *filepath) {
    FILE *file = fopen(filepath, "w");
    if (!file) {
        perror("Failed to write file");
        return;
    }
    fprintf(file, "%s\n", lm->header);
    for (int order = 1; order <= lm->max_order; order++) {
        fprintf(file, "ngram %d=%d\n", order, lm->ngram_counts[order - 1]);
    }
    fprintf(file, "\n");
    for (int order = 1; order <= lm->max_order; order++) {
        fprintf(file, "\\%d-grams:\n", order);
        Ngram *ng = lm->ngrams[order - 1];
        while (ng) {
            fprintf(file, "%.4f\t%s", ng->log_prob, ng->words);
            /* NOTE(review): -1 is the "absent" sentinel, but real ARPA
             * backoffs are usually negative, so present-but-negative weights
             * are silently dropped here -- confirm the sentinel choice. */
            if (ng->backoff >= 0) fprintf(file, "\t%.4f", ng->backoff);
            fprintf(file, "\n");
            ng = ng->next;
        }
        fprintf(file, "\n");
    }
    fprintf(file, "%s\n", lm->end_marker);
    fclose(file);
}

// Example usage:
// int main() {
//     LMFile *lm = lm_create();
//     lm_open_and_read(lm, "example.lm");
//     lm_print_properties(lm);
//     lm_write(lm, "output.lm");
//     lm_free(lm);
//     return 0;
// }