mirror of
https://github.com/foliojs/pdfkit.git
synced 2025-12-08 20:15:54 +00:00
* Fix further LineWrapper precision issues * add test of bounded text precision issue * add rowSpanning table example * add failure threshold * implement toContainText jest matcher * create a unit test for bounded text precision * remove round up rounding code path --------- Co-authored-by: Luiz Américo Pereira Câmara <blikblum@users.noreply.github.com>
122 lines
3.0 KiB
JavaScript
122 lines
3.0 KiB
JavaScript
/**
|
|
* @import PDFDocument from '../../lib/document';
|
|
*/
|
|
|
|
/**
|
|
* @typedef {object} TextStream
|
|
* @property {string} text
|
|
* @property {string} font
|
|
* @property {number} fontSize
|
|
*
|
|
* @typedef {string | Buffer} PDFDataItem
|
|
* @typedef {Array<PDFDataItem>} PDFData
|
|
*
|
|
* @typedef {object} PDFDataObject
|
|
* @property {PDFDataItem[]} items
|
|
*/
|
|
|
|
/**
|
|
* @param {PDFDocument} doc
|
|
* @return {PDFData}
|
|
*/
|
|
function logData(doc) {
|
|
const loggedData = [];
|
|
const originalMethod = doc._write;
|
|
doc._write = function (data) {
|
|
loggedData.push(data);
|
|
originalMethod.call(this, data);
|
|
};
|
|
return loggedData;
|
|
}
|
|
|
|
function escapeRegExp(string) {
|
|
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string
|
|
}
|
|
|
|
function joinTokens(...args) {
|
|
let a = args.map((i) => escapeRegExp(i));
|
|
let r = new RegExp('^' + a.join('\\s*') + '$');
|
|
return r;
|
|
}
|
|
|
|
/**
|
|
* @description
|
|
* Returns an array of objects from the PDF data. Object items are surrounded by /\d 0 obj/ and 'endobj'.
|
|
* @param {PDFData} data
|
|
* @return {Array<PDFDataObject>}
|
|
*/
|
|
function getObjects(data) {
|
|
const objects = [];
|
|
let currentObject = null;
|
|
for (const item of data) {
|
|
if (item instanceof Buffer) {
|
|
if (currentObject) {
|
|
currentObject.items.push(item);
|
|
}
|
|
} else if (typeof item === 'string') {
|
|
if (/^\d+\s0\sobj/.test(item)) {
|
|
currentObject = { items: [] };
|
|
objects.push(currentObject);
|
|
} else if (item === 'endobj') {
|
|
currentObject = null;
|
|
} else if (currentObject) {
|
|
currentObject.items.push(item);
|
|
}
|
|
}
|
|
}
|
|
return objects;
|
|
}
|
|
|
|
/**
|
|
* @param {Buffer} textStream
|
|
* @return {TextStream | undefined}
|
|
*/
|
|
function parseTextStream(textStream) {
|
|
const decodedStream = textStream.toString('utf8');
|
|
|
|
// Extract font and font size
|
|
const fontMatch = decodedStream.match(/\/([A-Za-z0-9]+)\s+(\d+)\s+Tf/);
|
|
|
|
if (!fontMatch) {
|
|
return undefined;
|
|
}
|
|
|
|
const font = fontMatch[1];
|
|
const fontSize = parseInt(fontMatch[2], 10);
|
|
|
|
// Extract hex strings inside TJ array
|
|
const tjMatch = decodedStream.match(/\[([^\]]+)\]\s+TJ/);
|
|
if (!tjMatch) {
|
|
return undefined;
|
|
}
|
|
let text = '';
|
|
|
|
// this is a simplified version
|
|
// the correct way is to retrieve the encoding from /Resources /Font dictionary and decode using it
|
|
// https://stackoverflow.com/a/29468049/5724645
|
|
|
|
// Match all hex strings like <...>
|
|
const hexMatches = [...tjMatch[1].matchAll(/<([0-9a-fA-F]+)>/g)];
|
|
for (const m of hexMatches) {
|
|
// Convert hex to string
|
|
const hex = m[1];
|
|
for (let i = 0; i < hex.length; i += 2) {
|
|
const code = parseInt(hex.substr(i, 2), 16);
|
|
let char = String.fromCharCode(code);
|
|
// Handle special cases
|
|
if (code === 0x0a) {
|
|
char = '\n'; // Newline
|
|
} else if (code === 0x0d) {
|
|
char = '\r'; // Carriage return
|
|
} else if (code === 0x85) {
|
|
char = '...';
|
|
}
|
|
text += char;
|
|
}
|
|
}
|
|
|
|
return { text, font, fontSize };
|
|
}
|
|
|
|
export { logData, joinTokens, parseTextStream, getObjects };
|