mammoth.js/lib/docx/body-reader.js
2016-05-12 22:10:57 +01:00

473 lines
15 KiB
JavaScript

exports.BodyReader = BodyReader;
var _ = require("underscore");
var documents = require("../documents");
var Result = require("../results").Result;
var warning = require("../results").warning;
/**
 * Creates a reader that turns parsed docx body XML into document elements.
 *
 * @param {Object} options
 * @param {Object} options.relationships - indexed by relationship ID; the `target` of an entry is read
 *     when resolving hyperlinks and images.
 * @param {Object} options.contentTypes - must provide `findContentType(path)`.
 * @param {Object} options.docxFile - must provide `read`; called with the zip path of embedded images.
 * @param {Object} options.files - must provide `read`; called with the target path of linked images.
 * @param {Object} options.numbering - must provide `findLevel(numId, level)`.
 * @param {Object} options.styles - must provide `findParagraphStyleById` and `findCharacterStyleById`.
 * @returns {Object} an object exposing `readXmlElement`, `readXmlElements` and `_readNumberingProperties`.
 */
function BodyReader(options) {
// Unpack the collaborators once so the nested reader functions can close over them.
var relationships = options.relationships;
var contentTypes = options.contentTypes;
var docxFile = options.docxFile;
var files = options.files;
var numbering = options.numbering;
var styles = options.styles;
/**
 * Reads every XML node in a list and merges the individual results into one.
 *
 * @param {Array} elements - XML nodes to read, in order.
 * @returns {ReadResult} the combined result of reading each node.
 */
function readXmlElements(elements) {
    return combineResults(elements.map(readXmlElement));
}
/**
 * Reads a single XML node.
 *
 * Nodes whose `type` is not "element" produce an empty result. Elements with a
 * registered reader are passed to that reader. Elements listed in `ignoreElements`
 * produce an empty result; any other element produces an empty result carrying an
 * "unrecognised element" warning.
 *
 * @param {Object} element - XML node with `type` and, for elements, `name`.
 * @returns {ReadResult} the result of reading the node.
 */
function readXmlElement(element) {
    if (element.type !== "element") {
        return emptyResult();
    }

    var hasOwn = Object.prototype.hasOwnProperty;
    var name = element.name;

    // Own-property check: element names come from the document, and a plain lookup
    // would resolve names such as "constructor" or "toString" to functions inherited
    // from Object.prototype, which would then be invoked as if they were readers.
    if (hasOwn.call(xmlElementReaders, name)) {
        var handler = xmlElementReaders[name];
        return handler(element);
    }

    if (hasOwn.call(ignoreElements, name)) {
        return emptyResult();
    }

    var message = warning("An unrecognised element was ignored: " + name);
    return emptyResultWithMessages([message]);
}
/**
 * Reads run properties (`w:rPr`): vertical alignment, bold, underline, italic,
 * strikethrough, and the run style.
 *
 * @param {Object} element - the run properties XML element.
 * @returns {ReadResult} result whose value is a "runProperties" object.
 */
function readRunProperties(element) {
    // A formatting flag is on whenever the corresponding child element is present.
    function hasChild(tagName) {
        return !!element.first(tagName);
    }

    var runProperties = {type: "runProperties"};

    var vertAlign = element.first("w:vertAlign");
    if (vertAlign) {
        runProperties.verticalAlignment = vertAlign.attributes["w:val"];
    }

    runProperties.isBold = hasChild("w:b");
    runProperties.isUnderline = hasChild("w:u");
    runProperties.isItalic = hasChild("w:i");
    runProperties.isStrikethrough = hasChild("w:strike");

    return readRunStyle(element).map(function(runStyle) {
        runProperties.styleId = runStyle.styleId;
        runProperties.styleName = runStyle.name;
        return runProperties;
    });
}
/**
 * Reads the paragraph style (`w:pStyle`) referenced by a properties element.
 *
 * @param {Object} element - XML element that may contain `w:pStyle`.
 * @returns {ReadResult} result whose value is `{styleId, name}`.
 */
function readParagraphStyle(element) {
    var findStyleById = styles.findParagraphStyleById;
    return readStyle(element, "w:pStyle", "Paragraph", findStyleById);
}
/**
 * Reads the character style (`w:rStyle`) referenced by a properties element.
 *
 * @param {Object} element - XML element that may contain `w:rStyle`.
 * @returns {ReadResult} result whose value is `{styleId, name}`.
 */
function readRunStyle(element) {
    var findStyleById = styles.findCharacterStyleById;
    return readStyle(element, "w:rStyle", "Run", findStyleById);
}
/**
 * Reads a style reference from a properties element and resolves its name.
 *
 * @param {Object} element - XML element that may contain the style tag.
 * @param {string} styleTagName - tag holding the style ID in its `w:val` attribute.
 * @param {string} styleType - label used in the warning for an undefined style.
 * @param {Function} findStyleById - looks up a style by ID; a falsy return means "not defined".
 * @returns {ReadResult} result whose value is `{styleId, name}`; it carries a warning
 *     when a style ID is present but cannot be resolved.
 */
function readStyle(element, styleTagName, styleType, findStyleById) {
    var styleElement = element.first(styleTagName);
    // Stays null when the tag is absent; takes the raw attribute value otherwise.
    var styleId = styleElement ? styleElement.attributes["w:val"] : null;
    var style = styleId ? findStyleById(styleId) : null;

    var messages = [];
    var name = null;
    if (style) {
        name = style.name;
    } else if (styleId) {
        messages.push(undefinedStyleWarning(styleType, styleId));
    }

    return elementResultWithMessages({styleId: styleId, name: name}, messages);
}
/**
 * Builds a reader for note reference elements of the given type.
 *
 * @param {string} noteType - stored on every NoteReference the reader creates.
 * @returns {Function} reader that takes the XML element and returns a result
 *     containing a NoteReference whose ID comes from the `w:id` attribute.
 */
function noteReferenceReader(noteType) {
    return function(element) {
        var reference = new documents.NoteReference({
            noteType: noteType,
            noteId: element.attributes["w:id"]
        });
        return elementResult(reference);
    };
}
/**
 * Reads the children of an element, discarding the element itself.
 *
 * @param {Object} element - XML element with a `children` list.
 * @returns {ReadResult} the combined result of reading the children.
 */
function readChildElements(element) {
    var children = element.children;
    return readXmlElements(children);
}
// Maps a qualified XML element name to the function that reads that element.
// Every reader receives the XML element and returns a read result.
var xmlElementReaders = {
    // Paragraph: the paragraph properties child (if any) is separated from the other
    // children, and elements held in "extra" by descendants are appended to the output.
    "w:p": function(element) {
        return readXmlElements(element.children)
            .map(function(children) {
                var properties = _.find(children, isParagraphProperties);
                return new documents.Paragraph(
                    children.filter(negate(isParagraphProperties)),
                    properties
                );
            })
            .insertExtra();
    },
    // Paragraph properties: alignment, numbering and paragraph style.
    "w:pPr": function(element) {
        var properties = {
            type: "paragraphProperties"
        };
        var alignElement = element.first("w:jc");
        if (alignElement) {
            properties.alignment = alignElement.attributes["w:val"];
        }
        properties.numbering = readNumberingProperties(element.firstOrEmpty("w:numPr"));
        return readParagraphStyle(element).map(function(style) {
            properties.styleId = style.styleId;
            properties.styleName = style.name;
            return properties;
        });
    },
    // Run: the run properties child (if any) is separated from the other children.
    "w:r": function(element) {
        return readXmlElements(element.children)
            .map(function(children) {
                var properties = _.find(children, isRunProperties);
                return new documents.Run(
                    children.filter(negate(isRunProperties)),
                    properties
                );
            });
    },
    "w:rPr": readRunProperties,
    "w:t": function(element) {
        return elementResult(new documents.Text(element.text()));
    },
    "w:tab": function(element) {
        return elementResult(new documents.Tab());
    },
    // Hyperlink: an `r:id` is resolved through the relationships to an href; otherwise
    // a `w:anchor` is used; with neither, the children are returned unwrapped.
    "w:hyperlink": function(element) {
        var relationshipId = element.attributes["r:id"];
        var anchor = element.attributes["w:anchor"];
        return readXmlElements(element.children).map(function(children) {
            if (relationshipId) {
                var href = relationships[relationshipId].target;
                return new documents.Hyperlink(children, {href: href});
            } else if (anchor) {
                return new documents.Hyperlink(children, {anchor: anchor});
            } else {
                return children;
            }
        });
    },
    "w:tbl": readTable,
    "w:tr": readTableRow,
    "w:tc": readTableCell,
    "w:footnoteReference": noteReferenceReader("footnote"),
    "w:endnoteReference": noteReferenceReader("endnote"),
    // Only breaks without a `w:type` are converted; any typed break yields a warning.
    "w:br": function(element) {
        var breakType = element.attributes["w:type"];
        if (breakType) {
            return emptyResultWithMessages([warning("Unsupported break type: " + breakType)]);
        } else {
            return elementResult(new documents.LineBreak());
        }
    },
    "w:bookmarkStart": function(element){
        var name = element.attributes["w:name"];
        // NOTE(review): "_GoBack" is skipped, presumably because it is an editor-internal
        // bookmark rather than document content -- confirm.
        if (name === "_GoBack") {
            return emptyResult();
        } else {
            return elementResult(new documents.BookmarkStart({name: name}));
        }
    },
    // Only the fallback branch is read. `mc:Fallback` is optional, so when it is absent
    // nothing is produced instead of dereferencing null.
    "mc:AlternateContent": function(element) {
        var fallback = element.first("mc:Fallback");
        return fallback ? readChildElements(fallback) : emptyResult();
    },
    // Wrapper elements: only their children contribute to the output.
    "w:ins": readChildElements,
    "w:smartTag": readChildElements,
    "w:drawing": readChildElements,
    // Content of `w:pict` is moved into "extra" so that it is emitted by the
    // enclosing paragraph's insertExtra() rather than in place.
    "w:pict": function(element) {
        return readChildElements(element).toExtra();
    },
    "v:roundrect": readChildElements,
    "v:shape": readChildElements,
    "v:textbox": readChildElements,
    "w:txbxContent": readChildElements,
    "wp:inline": readDrawingElement,
    "wp:anchor": readDrawingElement,
    "v:imagedata": readImageData
};
// Public surface of the reader. The functions referenced here are function
// declarations, so the ones defined after this statement are already available.
return {
readXmlElement: readXmlElement,
readXmlElements: readXmlElements,
// NOTE(review): the leading underscore suggests this is exposed for internal use
// (e.g. tests) rather than as public API -- confirm.
_readNumberingProperties: readNumberingProperties
};
/**
 * Resolves the numbering level referenced by a numbering properties element.
 *
 * @param {Object} element - element whose `w:ilvl` and `w:numId` children carry
 *     the level and numbering ID in their `w:val` attributes.
 * @returns {*} the value of `numbering.findLevel(numId, level)`, or null when either
 *     attribute is undefined.
 */
function readNumberingProperties(element) {
    var levelValue = element.firstOrEmpty("w:ilvl").attributes["w:val"];
    var numIdValue = element.firstOrEmpty("w:numId").attributes["w:val"];
    var isNumbered = levelValue !== undefined && numIdValue !== undefined;
    return isNumbered ? numbering.findLevel(numIdValue, levelValue) : null;
}
/**
 * Reads a table: reads its children, resolves vertical merges into row spans,
 * then wraps the rows in a Table.
 *
 * @param {Object} element - the table XML element.
 * @returns {ReadResult} result whose value is the table.
 */
function readTable(element) {
    var children = readXmlElements(element.children);
    var rowsWithSpans = children.flatMap(calculateRowSpans);
    return rowsWithSpans.map(documents.Table);
}
/**
 * Reads a table row by reading its children and wrapping them in a TableRow.
 *
 * @param {Object} element - the table row XML element.
 * @returns {ReadResult} result whose value is the row.
 */
function readTableRow(element) {
    var children = readXmlElements(element.children);
    return children.map(documents.TableRow);
}
/**
 * Reads a table cell, including its column span and vertical-merge marker.
 *
 * The column span comes from `w:tcPr/w:gridSpan@w:val`, parsed as a decimal integer.
 * A missing value, or one that does not parse to an integer of at least 1, falls back
 * to a span of 1 so that column positions computed from it stay valid numbers.
 *
 * @param {Object} element - the table cell XML element.
 * @returns {ReadResult} result whose value is the cell; the cell carries a temporary
 *     `_vMerge` property holding the value of `readVMerge`.
 */
function readTableCell(element) {
    return readXmlElements(element.children).map(function(children) {
        var properties = element.firstOrEmpty("w:tcPr");
        var gridSpan = properties.firstOrEmpty("w:gridSpan").attributes["w:val"];
        // Explicit radix: the attribute is decimal, never hexadecimal.
        var parsedSpan = gridSpan ? parseInt(gridSpan, 10) : 1;
        // NaN fails this comparison, so unparseable values also fall back to 1.
        var colSpan = parsedSpan >= 1 ? parsedSpan : 1;

        var cell = documents.TableCell(children, {colSpan: colSpan});
        cell._vMerge = readVMerge(properties);
        return cell;
    });
}
/**
 * Reads the vertical-merge marker from cell properties.
 *
 * @param {Object} properties - cell properties element.
 * @returns {?boolean} null when there is no `w:vMerge` child; otherwise true when its
 *     `w:val` is "continue" or empty/absent, and false for any other value.
 */
function readVMerge(properties) {
    var vMerge = properties.first("w:vMerge");
    if (!vMerge) {
        return null;
    }
    var mergeValue = vMerge.attributes["w:val"];
    return !mergeValue || mergeValue === "continue";
}
/**
 * Converts per-cell vertical-merge markers into row spans.
 *
 * A cell whose `_vMerge` is truthy, and which has a cell above it starting at the same
 * column index, is removed from its row and the `rowSpan` of that cell above is
 * incremented. The temporary `_vMerge` property is deleted from the cells that remain.
 * Rows and cells are modified in place.
 *
 * @param {Array} rows - children read from a table element.
 * @returns {ReadResult} result containing `rows`; it carries a warning (and leaves the
 *     rows untouched) when a child is not a table row or a row child is not a table cell.
 */
function calculateRowSpans(rows) {
    function isNotRow(row) {
        return row.type !== documents.types.tableRow;
    }

    function isNotCell(cell) {
        return cell.type !== documents.types.tableCell;
    }

    function hasNonCellChild(row) {
        return _.any(row.children, isNotCell);
    }

    function unchangedWithWarning(message) {
        return elementResultWithMessages(rows, [warning(message)]);
    }

    if (_.any(rows, isNotRow)) {
        return unchangedWithWarning(
            "unexpected non-row element in table, cell merging may be incorrect"
        );
    }
    if (_.any(rows, hasNonCellChild)) {
        return unchangedWithWarning(
            "unexpected non-cell element in table row, cell merging may be incorrect"
        );
    }

    // Most recent non-merged cell seen for each starting column index.
    var spanOwnerByColumn = {};
    rows.forEach(function(row) {
        var columnIndex = 0;
        row.children.forEach(function(cell) {
            var owner = spanOwnerByColumn[columnIndex];
            if (cell._vMerge && owner) {
                owner.rowSpan++;
            } else {
                // Either a fresh cell or a continuation with nothing above it:
                // it becomes the owner of any merges below.
                spanOwnerByColumn[columnIndex] = cell;
                cell._vMerge = false;
            }
            columnIndex += cell.colSpan;
        });
    });

    rows.forEach(function(row) {
        var keptCells = row.children.filter(function(cell) {
            return !cell._vMerge;
        });
        row.children = keptCells;
        keptCells.forEach(function(cell) {
            delete cell._vMerge;
        });
    });

    return elementResult(rows);
}
/**
 * Reads the images referenced by a drawing element.
 *
 * Follows the a:graphic / a:graphicData / pic:pic / pic:blipFill / a:blip chain
 * and reads one image per blip found.
 *
 * @param {Object} element - the drawing XML element.
 * @returns {ReadResult} the combined result of reading every blip.
 */
function readDrawingElement(element) {
    var graphics = element.getElementsByTagName("a:graphic");
    var graphicData = graphics.getElementsByTagName("a:graphicData");
    var pictures = graphicData.getElementsByTagName("pic:pic");
    var blipFills = pictures.getElementsByTagName("pic:blipFill");
    var blips = blipFills.getElementsByTagName("a:blip");

    // Each blip is read together with the drawing element.
    var readBlipOfElement = readBlip.bind(null, element);
    return combineResults(blips.map(readBlipOfElement));
}
/**
 * Reads the image referenced by a blip, taking the alt text from the `descr`
 * attribute of the drawing's `wp:docPr` child.
 *
 * @param {Object} element - the drawing element that contains the blip.
 * @param {Object} blip - the `a:blip` element referencing the image.
 * @returns {ReadResult} the result of reading the image.
 */
function readBlip(element, blip) {
    // firstOrEmpty: a drawing without wp:docPr simply has no alt text, rather than
    // failing on a null dereference.
    var altText = element.firstOrEmpty("wp:docPr").attributes.descr;
    return readImage(findBlipImageFile(blip), altText);
}
/**
 * Locates the image file a blip refers to.
 *
 * An `r:embed` relationship is resolved as an embedded image. Otherwise the `r:link`
 * relationship's target is used as the path and read through `files`.
 *
 * @param {Object} blip - the `a:blip` element.
 * @returns {{path: string, read: Function}} the image path and a function that reads it.
 */
function findBlipImageFile(blip) {
    var attributes = blip.attributes;
    var embedId = attributes["r:embed"];
    var linkId = attributes["r:link"];

    if (embedId) {
        return findEmbeddedImageFile(embedId);
    }

    var linkedPath = relationships[linkId].target;
    return {
        path: linkedPath,
        read: files.read.bind(files, linkedPath)
    };
}
/**
 * Reads the image referenced by a `v:imagedata` element.
 *
 * @param {Object} element - element whose `r:id` names the embedded image relationship
 *     and whose `o:title` is used as alt text.
 * @returns {ReadResult} the result of reading the image.
 */
function readImageData(element) {
    var imageFile = findEmbeddedImageFile(element.attributes["r:id"]);
    var altText = element.attributes["o:title"];
    return readImage(imageFile, altText);
}
/**
 * Locates an image stored inside the docx file.
 *
 * @param {string} relationshipId - ID of the relationship whose target is the image.
 * @returns {{path: string, read: Function}} the zip path (the target joined onto "word")
 *     and a function that reads that path from the docx file.
 */
function findEmbeddedImageFile(relationshipId) {
    var target = relationships[relationshipId].target;
    var zipPath = joinZipPath("word", target);
    var imageFile = {path: zipPath};
    imageFile.read = docxFile.read.bind(docxFile, zipPath);
    return imageFile;
}
/**
 * Builds an Image element for an image file and warns about content types that
 * are not listed in `supportedImageTypes`.
 *
 * @param {{path: string, read: Function}} imageFile - the image location and reader.
 * @param {string} [altText] - alternative text for the image.
 * @returns {ReadResult} result containing the image; its messages are always passed
 *     as an array, holding one warning when the content type is not supported.
 */
function readImage(imageFile, altText) {
    var contentType = contentTypes.findContentType(imageFile.path);

    var image = documents.Image({
        readImage: imageFile.read,
        altText: altText,
        contentType: contentType
    });

    // Messages must be a list, matching every other caller of elementResultWithMessages.
    var warnings = [];
    if (!supportedImageTypes[contentType]) {
        warnings.push(
            warning("Image of type " + contentType + " is unlikely to display in web browsers")
        );
    }
    return elementResultWithMessages(image, warnings);
}
/**
 * Creates the warning for a style that is referenced but not defined.
 *
 * @param {string} type - style kind used at the start of the message.
 * @param {string} styleId - the unresolved style ID.
 * @returns {*} the value produced by `warning` for the message.
 */
function undefinedStyleWarning(type, styleId) {
    var message = type + " style with ID " + styleId +
        " was referenced but not defined in the document";
    return warning(message);
}
}
// Lookup of image content types that do not trigger the
// "unlikely to display in web browsers" warning.
var supportedImageTypes = [
    "image/png",
    "image/gif",
    "image/jpeg",
    "image/svg+xml",
    "image/tiff"
].reduce(function(supported, contentType) {
    supported[contentType] = true;
    return supported;
}, {});
// Lookup of element names that are skipped without an "unrecognised element" warning.
var ignoreElements = [
    "office-word:wrap",
    "v:shadow",
    "v:shapetype",
    "w:bookmarkEnd",
    "w:sectPr",
    "w:proofErr",
    "w:lastRenderedPageBreak",
    "w:commentRangeStart",
    "w:commentRangeEnd",
    "w:commentReference",
    "w:del",
    "w:footnoteRef",
    "w:endnoteRef",
    "w:tblPr",
    "w:tblGrid",
    "w:tcPr"
].reduce(function(ignored, elementName) {
    ignored[elementName] = true;
    return ignored;
}, {});
/**
 * Tells whether a read element is a paragraph properties object.
 *
 * @param {Object} element - a value produced by one of the readers.
 * @returns {boolean} true when its `type` is "paragraphProperties".
 */
function isParagraphProperties(element) {
    var elementType = element.type;
    return elementType === "paragraphProperties";
}
/**
 * Tells whether a read element is a run properties object.
 *
 * @param {Object} element - a value produced by one of the readers.
 * @returns {boolean} true when its `type` is "runProperties".
 */
function isRunProperties(element) {
    var elementType = element.type;
    return elementType === "runProperties";
}
/**
 * Builds the logical complement of a one-argument predicate.
 *
 * @param {Function} predicate - function returning a truthy or falsy value.
 * @returns {Function} function returning true exactly when `predicate` returns a falsy value.
 */
function negate(predicate) {
    function negated(value) {
        var outcome = predicate(value);
        return !outcome;
    }
    return negated;
}
/**
 * Joins a base directory inside the zip and a relationship target into a zip entry name.
 *
 * A target that starts with "/" is taken as a path from the root of the package, so
 * the base is not prepended and the leading slash is dropped. Any other target is
 * appended to the base with a single "/" separator.
 *
 * @param {string} first - base directory within the zip, e.g. "word".
 * @param {string} second - target path, relative to `first` or starting with "/".
 * @returns {string} the zip entry name.
 */
function joinZipPath(first, second) {
    if (second.charAt(0) === "/") {
        return second.substr(1);
    }
    // In general, we should also check first for a trailing slash,
    // but in our specific case this seems to be sufficient
    return first + "/" + second;
}
/**
 * Creates a result that has no element and no extra, only messages.
 *
 * @param {Array} messages - messages to attach to the result.
 * @returns {ReadResult} the new result.
 */
function emptyResultWithMessages(messages) {
    var noElement = null;
    var noExtra = null;
    return new ReadResult(noElement, noExtra, messages);
}
/**
 * Creates a result with no element, no extra and no messages.
 *
 * @returns {ReadResult} the new result.
 */
function emptyResult() {
    var noElement = null;
    return new ReadResult(noElement);
}
/**
 * Wraps an element in a result that has no extra and no messages.
 *
 * @param {*} element - the element (or list of elements) that was read.
 * @returns {ReadResult} the new result.
 */
function elementResult(element) {
    var result = new ReadResult(element);
    return result;
}
/**
 * Wraps an element in a result that carries messages and has no extra.
 *
 * @param {*} element - the element (or list of elements) that was read.
 * @param {Array} messages - messages to attach to the result.
 * @returns {ReadResult} the new result.
 */
function elementResultWithMessages(element, messages) {
    var noExtra = null;
    return new ReadResult(element, noExtra, messages);
}
/**
 * Outcome of reading XML: the element(s) read, "extra" elements held aside until
 * insertExtra() appends them, and any messages.
 *
 * @param {*} element - the element(s) read; a falsy value is stored as an empty list.
 * @param {*} extra - elements held aside from `value`.
 * @param {Array} [messages] - messages produced while reading.
 */
function ReadResult(element, extra, messages) {
    var value = element || [];
    var wrapped = new Result({element: value, extra: extra}, messages);

    this.value = value;
    this.extra = extra;
    this._result = wrapped;
    this.messages = wrapped.messages;
}

// Moves the current value into extra (after any existing extra), leaving the value empty.
ReadResult.prototype.toExtra = function() {
    var movedToExtra = joinElements(this.extra, this.value);
    return new ReadResult(null, movedToExtra, this.messages);
};

// Appends extra to the value and clears it; returns this result unchanged when
// there is no extra to insert.
ReadResult.prototype.insertExtra = function() {
    var extra = this.extra;
    if (!extra || !extra.length) {
        return this;
    }
    var valueWithExtra = joinElements(this.value, extra);
    return new ReadResult(valueWithExtra, null, this.messages);
};

// Transforms the value with func, keeping extra as it is.
ReadResult.prototype.map = function(func) {
    function applyToElement(wrappedValue) {
        return func(wrappedValue.element);
    }
    var mapped = this._result.map(applyToElement);
    return new ReadResult(mapped.value, this.extra, mapped.messages);
};

// Transforms the value with func, which must return a ReadResult; the extra of
// that result is appended to this result's extra.
ReadResult.prototype.flatMap = function(func) {
    function applyToElement(wrappedValue) {
        return func(wrappedValue.element)._result;
    }
    var mapped = this._result.flatMap(applyToElement);
    var combinedExtra = joinElements(this.extra, mapped.value.extra);
    return new ReadResult(mapped.value.element, combinedExtra, mapped.messages);
};
/**
 * Merges several read results into one.
 *
 * Elements are flattened into a single list, extras are flattened with falsy
 * entries removed, and messages are those of the combined underlying result.
 *
 * @param {Array<ReadResult>} results - results to merge, in order.
 * @returns {ReadResult} the merged result.
 */
function combineResults(results) {
    function isPresent(extra) {
        return extra;
    }

    var combined = Result.combine(_.pluck(results, "_result"));
    var elements = _.flatten(_.pluck(combined.value, "element"));
    var extras = _.flatten(_.pluck(combined.value, "extra"));

    return new ReadResult(elements, _.filter(extras, isPresent), combined.messages);
}
/**
 * Joins two values, each either a single element or a list, into one flat list.
 *
 * @param {*} first - element or list placed first.
 * @param {*} second - element or list placed after `first`.
 * @returns {Array} the flattened list.
 */
function joinElements(first, second) {
    var pair = [first, second];
    return _.flatten(pair);
}