Task 148: .DOCX File Format
The .DOCX file format adheres to the Office Open XML (OOXML) standard, as defined in ISO/IEC 29500 and ECMA-376, with extensions documented by Microsoft for Word-specific features. This format consists of a ZIP archive containing XML files and binary resources, enabling structured representation of word processing documents.
The properties intrinsic to the .DOCX file format, as defined in its core metadata (stored in the docProps/core.xml file within the ZIP structure), include the following:
- category: A categorization of the document content.
- contentStatus: The status of the document content (e.g., "Draft" or "Final").
- contentType: The type of content in the document.
- created: The date and time the document was created.
- creator: The name of the document's author or creator.
- description: A textual description or abstract of the document.
- identifier: A unique identifier for the document.
- keywords: Keywords associated with the document.
- language: The primary language of the document.
- lastModifiedBy: The name of the user who last modified the document.
- lastPrinted: The date and time the document was last printed.
- modified: The date and time the document was last modified.
- revision: The revision number of the document.
- subject: The subject or topic of the document.
- title: The title of the document.
- version: The version number of the document.
Two direct download links for sample .DOCX files are:
- https://calibre-ebook.com/downloads/demos/demo.docx
- https://www2.hu-berlin.de/stadtlabor/wp-content/uploads/2021/12/sample3.docx
The following is an embeddable HTML snippet with JavaScript suitable for a Ghost blog post. It enables drag-and-drop functionality for a .DOCX file, unzips it using JSZip (loaded via CDN), parses the core.xml file, and displays the properties on the screen. Include this in the blog's HTML embed block.
- The following Python class handles opening a .DOCX file, decoding (unzipping), reading and writing properties from docProps/core.xml, and printing them to the console. It uses standard libraries (zipfile and xml.etree.ElementTree). To write, it updates the XML and re-creates the ZIP with modifications.
import zipfile
import xml.etree.ElementTree as ET
from io import BytesIO
import os
from datetime import datetime
class DocxProperties:
    """Read, edit and write the core metadata of a .DOCX file.

    The metadata lives in ``docProps/core.xml`` inside the ZIP package.
    ``self.properties`` maps property names (e.g. ``'title'``) to the text
    of the matching XML element; only properties present in the file are
    loaded, but any known core property can be added with ``set_property``.
    """

    CORE_XML_PATH = 'docProps/core.xml'

    NAMESPACES = {
        'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'dcterms': 'http://purl.org/dc/terms/',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    }

    # Namespace prefix each core property is written under.
    PROPERTY_PREFIXES = {
        'category': 'cp',
        'contentStatus': 'cp',
        'contentType': 'cp',
        'keywords': 'cp',
        'lastModifiedBy': 'cp',
        'lastPrinted': 'cp',
        'revision': 'cp',
        'version': 'cp',
        'creator': 'dc',
        'description': 'dc',
        'identifier': 'dc',
        'language': 'dc',
        'subject': 'dc',
        'title': 'dc',
        'created': 'dcterms',
        'modified': 'dcterms',
    }

    def __init__(self, filepath):
        """Open ``filepath`` and load its core properties.

        Raises:
            ValueError: if the archive has no ``docProps/core.xml``.
        """
        self.filepath = filepath
        self.properties = self._read_properties()

    def _find_property(self, root, tag, prefix):
        """Return the element for ``tag``, or None when it is absent.

        The expected namespace is tried first, then the remaining ones so
        that files using a different namespace for a property still load.
        """
        candidates = [prefix] + [p for p in ('cp', 'dc', 'dcterms') if p != prefix]
        for candidate in candidates:
            elem = root.find(f'{candidate}:{tag}', self.NAMESPACES)
            # Compare against None explicitly: an Element without children
            # is falsy, so ``a or b`` chaining would discard a found element.
            if elem is not None:
                return elem
        return None

    def _read_properties(self):
        """Parse ``docProps/core.xml`` and return a ``{name: text}`` dict."""
        with zipfile.ZipFile(self.filepath, 'r') as z:
            if self.CORE_XML_PATH not in z.namelist():
                raise ValueError("No core.xml found in DOCX file.")
            with z.open(self.CORE_XML_PATH) as f:
                root = ET.parse(f).getroot()

        props = {}
        for tag, prefix in self.PROPERTY_PREFIXES.items():
            elem = self._find_property(root, tag, prefix)
            if elem is not None:
                props[tag] = elem.text
        return props

    def print_properties(self):
        """Print every loaded property as ``name: value`` to stdout."""
        for key, value in self.properties.items():
            print(f"{key}: {value}")

    def set_property(self, key, value):
        """Set a core property in memory (call ``write_properties`` to save).

        Any known core property may be set, including ones the file did not
        contain when it was read.

        Raises:
            ValueError: if ``key`` is not a known core property name.
        """
        if key not in self.PROPERTY_PREFIXES:
            raise ValueError(f"Invalid property: {key}")
        self.properties[key] = value

    def _build_core_xml(self):
        """Serialize ``self.properties`` to core.xml bytes (UTF-8)."""
        ns = self.NAMESPACES
        # Registering the prefixes makes ElementTree emit cp:/dc:/dcterms:/xsi:
        # instead of auto-generated ns0:, ns1:, ... prefixes.
        for prefix, uri in ns.items():
            ET.register_namespace(prefix, uri)

        root = ET.Element(f'{{{ns["cp"]}}}coreProperties')
        for key, value in self.properties.items():
            # Keys outside the known table fall back to the cp namespace.
            prefix = self.PROPERTY_PREFIXES.get(key, 'cp')
            elem = ET.SubElement(root, f'{{{ns[prefix]}}}{key}')
            if prefix == 'dcterms':
                elem.set(f'{{{ns["xsi"]}}}type', 'dcterms:W3CDTF')
            elem.text = value

        buffer = BytesIO()
        ET.ElementTree(root).write(buffer, encoding='utf-8', xml_declaration=True)
        return buffer.getvalue()

    def write_properties(self, output_path=None):
        """Write the package with an updated core.xml.

        Args:
            output_path: destination file; defaults to the source file, which
                is then overwritten.
        """
        if output_path is None:
            output_path = self.filepath

        core_xml = self._build_core_xml()

        # Assemble the new archive in memory first. Opening the destination
        # for writing while the source is still being read would truncate the
        # source whenever both paths are the same file.
        archive = BytesIO()
        with zipfile.ZipFile(self.filepath, 'r') as z_in:
            with zipfile.ZipFile(archive, 'w') as z_out:
                for item in z_in.infolist():
                    if item.filename == self.CORE_XML_PATH:
                        z_out.writestr(item.filename, core_xml,
                                       compress_type=zipfile.ZIP_DEFLATED)
                    else:
                        z_out.writestr(item, z_in.read(item.filename))

        with open(output_path, 'wb') as out:
            out.write(archive.getvalue())
- The following Java class performs similar operations using java.util.zip and javax.xml.parsers. It reads, writes, and prints properties. Compilation requires JDK; no external dependencies.
import java.io.*;
import java.util.zip.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.InputSource;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.util.HashMap;
import java.util.Map;
public class DocxProperties {
private String filepath;
private Map<String, String> properties;
public DocxProperties(String filepath) {
this.filepath = filepath;
this.properties = readProperties();
}
private Map<String, String> readProperties() {
Map<String, String> props = new HashMap<>();
try (ZipFile z = new ZipFile(filepath)) {
ZipEntry entry = z.getEntry("docProps/core.xml");
if (entry == null) {
throw new IOException("No core.xml found.");
}
try (InputStream is = z.getInputStream(entry)) {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
dbf.setNamespaceAware(true);
DocumentBuilder db = dbf.newDocumentBuilder();
Document doc = db.parse(new InputSource(is));
String[] tags = {"category", "contentStatus", "contentType", "created", "creator", "description",
"identifier", "keywords", "language", "lastModifiedBy", "lastPrinted", "modified",
"revision", "subject", "title", "version"};
for (String tag : tags) {
NodeList nl = doc.getElementsByTagNameNS("*", tag);
if (nl.getLength() > 0) {
props.put(tag, nl.item(0).getTextContent());
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
return props;
}
public void printProperties() {
for (Map.Entry<String, String> entry : properties.entrySet()) {
System.out.println(entry.getKey() + ": " + entry.getValue());
}
}
public void setProperty(String key, String value) {
if (properties.containsKey(key)) {
properties.put(key, value);
} else {
throw new IllegalArgumentException("Invalid property: " + key);
}
}
public void writeProperties(String outputPath) {
if (outputPath == null) {
outputPath = filepath;
}
try (ZipFile zIn = new ZipFile(filepath);
ZipOutputStream zOut = new ZipOutputStream(new FileOutputStream(outputPath))) {
for (java.util.Enumeration<? extends ZipEntry> entries = zIn.entries(); entries.hasMoreElements(); ) {
ZipEntry entry = entries.nextElement();
if (entry.getName().equals("docProps/core.xml")) {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
dbf.setNamespaceAware(true);
DocumentBuilder db = dbf.newDocumentBuilder();
Document doc = db.newDocument();
Element root = doc.createElementNS("http://schemas.openxmlformats.org/package/2006/metadata/core-properties", "cp:coreProperties");
root.setAttribute("xmlns:dc", "http://purl.org/dc/elements/1.1/");
root.setAttribute("xmlns:dcterms", "http://purl.org/dc/terms/");
root.setAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
doc.appendChild(root);
for (Map.Entry<String, String> prop : properties.entrySet()) {
String key = prop.getKey();
String nsUri = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties";
String prefix = "cp";
if (key.equals("created") || key.equals("modified")) {
nsUri = "http://purl.org/dc/terms/";
prefix = "dcterms";
Element elem = doc.createElementNS(nsUri, prefix + ":" + key);
elem.setAttributeNS("http://www.w3.org/2001/XMLSchema-instance", "xsi:type", "dcterms:W3CDTF");
elem.setTextContent(prop.getValue());
root.appendChild(elem);
} else if (key.equals("creator") || key.equals("description") || key.equals("identifier") ||
key.equals("keywords") || key.equals("language")) {
nsUri = "http://purl.org/dc/elements/1.1/";
prefix = "dc";
Element elem = doc.createElementNS(nsUri, prefix + ":" + key);
elem.setTextContent(prop.getValue());
root.appendChild(elem);
} else {
Element elem = doc.createElementNS(nsUri, prefix + ":" + key);
elem.setTextContent(prop.getValue());
root.appendChild(elem);
}
}
TransformerFactory tf = TransformerFactory.newInstance();
Transformer t = tf.newTransformer();
t.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
ByteArrayOutputStream baos = new ByteArrayOutputStream();
t.transform(new DOMSource(doc), new StreamResult(baos));
ZipEntry newEntry = new ZipEntry(entry.getName());
zOut.putNextEntry(newEntry);
zOut.write(baos.toByteArray());
zOut.closeEntry();
} else {
ZipEntry newEntry = new ZipEntry(entry.getName());
zOut.putNextEntry(newEntry);
try (InputStream is = zIn.getInputStream(entry)) {
byte[] buffer = new byte[1024];
int len;
while ((len = is.read(buffer)) > 0) {
zOut.write(buffer, 0, len);
}
}
zOut.closeEntry();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
- The following JavaScript class (ES6) handles similar operations in a browser or Node.js environment. It requires JSZip for unzipping and uses DOMParser for XML parsing; DOMParser is built into browsers, while Node.js needs a DOMParser-compatible substitute (for example, the @xmldom/xmldom package; not included). For writing, it rebuilds core.xml and generates a new Blob for download.
const JSZip = require('jszip'); // For Node; in browser, use script tag as in item 3
/**
 * Reads, edits and writes the core metadata (docProps/core.xml) of a .DOCX file.
 *
 * `properties` maps property names (e.g. "title") to the text of the matching
 * XML element. It is empty until `readProperties()` has been awaited.
 */
class DocxProperties {
  /**
   * @param {File|Blob|ArrayBuffer|Uint8Array|Buffer} file - the .docx content,
   *   in any form accepted by `JSZip.loadAsync`.
   */
  constructor(file) {
    this.file = file; // File object or Buffer in Node
    this.properties = {};
  }

  /** Namespace URIs used by core.xml, keyed by their conventional prefix. */
  static get NAMESPACES() {
    return {
      cp: 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
      dc: 'http://purl.org/dc/elements/1.1/',
      dcterms: 'http://purl.org/dc/terms/',
      xsi: 'http://www.w3.org/2001/XMLSchema-instance'
    };
  }

  /** Namespace prefix each known core property is written under. */
  static get PROPERTY_PREFIXES() {
    return {
      category: 'cp',
      contentStatus: 'cp',
      contentType: 'cp',
      keywords: 'cp',
      lastModifiedBy: 'cp',
      lastPrinted: 'cp',
      revision: 'cp',
      version: 'cp',
      creator: 'dc',
      description: 'dc',
      identifier: 'dc',
      language: 'dc',
      subject: 'dc',
      title: 'dc',
      created: 'dcterms',
      modified: 'dcterms'
    };
  }

  /**
   * Escapes text for use as XML element content.
   * @param {*} value - value to escape; null/undefined become an empty string.
   * @returns {string}
   */
  static escapeXml(value) {
    const text = value === null || value === undefined ? '' : String(value);
    return text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }

  /**
   * Unzips the file, parses docProps/core.xml and fills `this.properties`.
   * @returns {Promise<Object<string, string>>} the loaded properties.
   * @throws {Error} if the archive has no docProps/core.xml.
   */
  async readProperties() {
    const zip = await JSZip.loadAsync(this.file);
    const coreXml = await zip.file('docProps/core.xml')?.async('string');
    if (!coreXml) {
      throw new Error('No core.xml found.');
    }
    const xmlDoc = new DOMParser().parseFromString(coreXml, 'application/xml');
    const ns = DocxProperties.NAMESPACES;
    const root = xmlDoc.getElementsByTagNameNS(ns.cp, 'coreProperties')[0];
    if (root) {
      for (const [tag, prefix] of Object.entries(DocxProperties.PROPERTY_PREFIXES)) {
        // Try the expected namespace first, then the others, so files that
        // place a property in a different namespace still load.
        const order = [prefix, ...['cp', 'dc', 'dcterms'].filter(p => p !== prefix)];
        for (const candidate of order) {
          const elem = root.getElementsByTagNameNS(ns[candidate], tag)[0];
          if (elem) {
            this.properties[tag] = elem.textContent;
            break;
          }
        }
      }
    }
    return this.properties;
  }

  /** Logs every loaded property as `name: value`. */
  printProperties() {
    for (const [key, value] of Object.entries(this.properties)) {
      console.log(`${key}: ${value}`);
    }
  }

  /**
   * Sets a core property in memory; call `writeProperties()` to produce a file.
   * Any known core property may be set, including ones absent from the file.
   * @throws {Error} if `key` is not a known core property name.
   */
  setProperty(key, value) {
    if (!Object.prototype.hasOwnProperty.call(DocxProperties.PROPERTY_PREFIXES, key)) {
      throw new Error(`Invalid property: ${key}`);
    }
    this.properties[key] = value;
  }

  /**
   * Serializes `this.properties` to a complete core.xml document.
   * Values are escaped so that `&`, `<` and `>` cannot break the markup.
   * @returns {string}
   */
  _buildCoreXml() {
    const ns = DocxProperties.NAMESPACES;
    const prefixes = DocxProperties.PROPERTY_PREFIXES;
    const xmlHeader = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>';
    let coreXml = `${xmlHeader}<cp:coreProperties xmlns:cp="${ns.cp}" xmlns:dc="${ns.dc}" xmlns:dcterms="${ns.dcterms}" xmlns:xsi="${ns.xsi}">`;
    for (const [key, value] of Object.entries(this.properties)) {
      // Keys outside the known table fall back to the cp namespace.
      const prefix = prefixes[key] || 'cp';
      const typeAttr = prefix === 'dcterms' ? ' xsi:type="dcterms:W3CDTF"' : '';
      coreXml += `<${prefix}:${key}${typeAttr}>${DocxProperties.escapeXml(value)}</${prefix}:${key}>`;
    }
    coreXml += '</cp:coreProperties>';
    return coreXml;
  }

  /**
   * Rebuilds the archive with an updated docProps/core.xml.
   * @returns {Promise<Blob>} the new .docx content.
   */
  async writeProperties() {
    const zip = await JSZip.loadAsync(this.file);
    zip.file('docProps/core.xml', this._buildCoreXml());
    const newBlob = await zip.generateAsync({type: 'blob'});
    // In browser: const url = URL.createObjectURL(newBlob); then download
    return newBlob; // Or save in Node
  }
}
- The following C++ class (assuming C++11+) uses libzip (requires linking -lzip) and TinyXML2 (external library; include headers) for ZIP and XML handling. It reads, writes, and prints properties. Compile with appropriate flags.
#include <zip.h>
#include <tinyxml2.h>
#include <fstream>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>
class DocxProperties {
private:
std::string filepath;
std::map<std::string, std::string> properties;
void readProperties() {
zip_t* z = zip_open(filepath.c_str(), ZIP_RDONLY, nullptr);
if (!z) {
throw std::runtime_error("Failed to open ZIP.");
}
zip_file_t* f = zip_fopen(z, "docProps/core.xml", 0);
if (!f) {
zip_close(z);
throw std::runtime_error("No core.xml found.");
}
zip_stat_t sb;
zip_stat(z, "docProps/core.xml", 0, &sb);
std::vector<char> buffer(sb.size);
zip_fread(f, buffer.data(), sb.size);
zip_fclose(f);
zip_close(z);
tinyxml2::XMLDocument doc;
doc.Parse(buffer.data(), sb.size);
tinyxml2::XMLElement* root = doc.FirstChildElement("cp:coreProperties");
if (root) {
const char* ns[] = {"cp", "dc", "dcterms"};
std::string tags[] = {"category", "contentStatus", "contentType", "created", "creator", "description",
"identifier", "keywords", "language", "lastModifiedBy", "lastPrinted", "modified",
"revision", "subject", "title", "version"};
for (const auto& tag : tags) {
tinyxml2::XMLElement* elem = nullptr;
for (const auto& prefix : ns) {
std::string qualified = std::string(prefix) + ":" + tag;
elem = root->FirstChildElement(qualified.c_str());
if (elem) break;
}
if (elem && elem->GetText()) {
properties[tag] = elem->GetText();
}
}
}
}
public:
DocxProperties(const std::string& filepath) : filepath(filepath) {
readProperties();
}
void printProperties() {
for (const auto& pair : properties) {
std::cout << pair.first << ": " << pair.second << std::endl;
}
}
void setProperty(const std::string& key, const std::string& value) {
if (properties.find(key) != properties.end()) {
properties[key] = value;
} else {
throw std::invalid_argument("Invalid property: " + key);
}
}
void writeProperties(const std::string& outputPath) {
zip_t* zIn = zip_open(filepath.c_str(), ZIP_RDONLY, nullptr);
if (!zIn) {
throw std::runtime_error("Failed to open input ZIP.");
}
zip_t* zOut = zip_open(outputPath.c_str(), ZIP_CREATE | ZIP_TRUNCATE, nullptr);
if (!zOut) {
zip_close(zIn);
throw std::runtime_error("Failed to create output ZIP.");
}
int numEntries = zip_get_num_entries(zIn, 0);
for (int i = 0; i < numEntries; ++i) {
const char* name = zip_get_name(zIn, i, 0);
if (std::string(name) == "docProps/core.xml") {
tinyxml2::XMLDocument doc;
tinyxml2::XMLElement* root = doc.NewElement("cp:coreProperties");
root->SetAttribute("xmlns:cp", "http://schemas.openxmlformats.org/package/2006/metadata/core-properties");
root->SetAttribute("xmlns:dc", "http://purl.org/dc/elements/1.1/");
root->SetAttribute("xmlns:dcterms", "http://purl.org/dc/terms/");
root->SetAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
doc.InsertFirstChild(root);
for (const auto& pair : properties) {
const std::string& key = pair.first;
const std::string& value = pair.second;
tinyxml2::XMLElement* elem;
if (key == "created" || key == "modified") {
elem = doc.NewElement(("dcterms:" + key).c_str());
elem->SetAttribute("xsi:type", "dcterms:W3CDTF");
} else if (key == "creator" || key == "description" || key == "identifier" ||
key == "keywords" || key == "language") {
elem = doc.NewElement(("dc:" + key).c_str());
} else {
elem = doc.NewElement(("cp:" + key).c_str());
}
elem->SetText(value.c_str());
root->InsertEndChild(elem);
}
tinyxml2::XMLPrinter printer;
doc.Print(&printer);
std::string xmlStr = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n" + std::string(printer.CStr());
zip_source_t* src = zip_source_buffer(zOut, xmlStr.c_str(), xmlStr.size(), 0);
if (zip_file_add(zOut, name, src, ZIP_FL_OVERWRITE | ZIP_FL_ENC_UTF_8) < 0) {
zip_source_free(src);
throw std::runtime_error("Failed to add core.xml.");
}
} else {
zip_source_t* src = zip_source_zip(zOut, zIn, i, 0, 0, -1);
if (zip_file_add(zOut, name, src, ZIP_FL_ENC_GUESS) < 0) {
zip_source_free(src);
throw std::runtime_error("Failed to copy file.");
}
}
}
zip_close(zOut);
zip_close(zIn);
}
};