Generate ToUnicodeMap bfrange in multiple ranges (#1498) (#1499)

* Generate ToUnicodeMap bfrange in multiple ranges (#1498)

This resolves #1498.

* Add unit test for bfrange lines in toUnicodeMap

* Add changelog line
This commit is contained in:
Yeechan Lu 2024-02-26 21:18:47 +08:00 committed by GitHub
parent 485b7e6bee
commit 946f9cf6dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 55 additions and 2 deletions

View File

@ -7,6 +7,7 @@
- Fix for soft hyphen not being replaced by visible hyphen if necessary (#457)
- Optimize output files by ignoring identity transforms
- Fix for Acroforms - setting an option to false will still apply the flag (#1495)
- Fix for text extraction in PDFium-based viewers due to invalid ToUnicodeMap (#1498)
### [v0.14.0] - 2023-11-09

View File

@ -252,6 +252,15 @@ class EmbeddedFont extends PDFFont {
entries.push(`<${encoded.join(' ')}>`);
}
// Split `entries` into consecutive chunks of at most 256 items and emit one
// bfrange line per chunk. Because each chunk starts at a multiple of 256 and
// holds at most 256 codes, the first and last code of a line always share the
// same high byte, rather than one range spanning every entry.
const chunkSize = 256;
const chunks = Math.ceil(entries.length / chunkSize);
const ranges = [];
for (let i = 0; i < chunks; i++) {
// Codes covered by this chunk: [start, end), clamped to the number of entries.
const start = i * chunkSize;
const end = Math.min((i + 1) * chunkSize, entries.length);
// One line of the form: <first code> <last code> [entries for that span].
ranges.push(`<${toHex(start)}> <${toHex(end - 1)}> [${entries.slice(start, end).join(' ')}]`);
}
// NOTE(review): the CMap text written below still declares `1 beginbfrange`
// while `ranges` can now hold several lines — confirm whether that count
// should be `ranges.length`.
cmap.end(`\
/CIDInit /ProcSet findresource begin
12 dict begin
@ -267,7 +276,7 @@ begincmap
<0000><ffff>
endcodespacerange
1 beginbfrange
<0000> <${toHex(entries.length - 1)}> [${entries.join(' ')}]
${ranges.join('\n')}
endbfrange
endcmap
CMapName currentdict /CMap defineresource pop

View File

@ -1,5 +1,6 @@
import PDFFontFactory from '../../lib/font_factory';
import PDFDocument from '../../lib/document';
import PDFFontFactory from '../../lib/font_factory';
import { logData } from './helpers';
describe('EmbeddedFont', () => {
test('no fontLayoutCache option', () => {
@ -52,4 +53,46 @@ describe('EmbeddedFont', () => {
expect(dictionary.data.BaseFont).toBe('BAJJZZ+Roboto-Regular');
});
});
// Regression test for the generated ToUnicode CMap: every bfrange line must
// keep its first and last code inside the same block of 256 codes (identical
// high byte). Declared with plain `describe`; `describe.only` would make Jest
// skip every other test in this spec file.
describe('toUnicodeMap', () => {
  test('bfrange lines should not cross highcode boundary', () => {
    const doc = new PDFDocument({ compress: false });
    const font = PDFFontFactory.open(
      doc,
      'tests/fonts/Roboto-Regular.ttf',
      undefined,
      'F1099'
    );

    // 398 different glyphs — more than 256, so more than one bfrange line
    // is required.
    font.encode('ABCDEFGHIJKLMNOPQRSTUVWXYZ');
    font.encode('abcdefghijklmnopqrstuvwxyz');
    font.encode('ÁÀÂÄÅÃÆÇÐÉÈÊËÍÌÎÏÑÓÒÔÖÕØŒÞÚÙÛÜÝŸ');
    font.encode('áàâäãåæçðéèêëíìîïıñóòôöõøœßþúùûüýÿ');
    font.encode('ĀĂĄĆČĎĐĒĖĘĚĞĢĪĮİĶŁĹĻĽŃŅŇŌŐŔŖŘŠŚŞȘŢȚŤŪŮŰŲŽŹŻ');
    font.encode('āăąćčďđēėęěğģīįķłĺļľńņňōőŕŗřšśşșţțťūůűųžźż');
    font.encode('ΑΒΓ∆ΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΆΈΉΊΌΎΏΪΫ');
    font.encode('αβγδεζηθικλµνξοπρςστυφχψωάέήίόύώϊϋΐΰ');
    font.encode('АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ');
    font.encode('абвгдежзийклмнопрстуфхцчшщъыьэюя');
    font.encode('ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏҐӁҒҖҚҢҮҰҲҶҺӘӢӨӮ');
    font.encode('ѐёђѓєѕіїјљњћќѝўџґӂғҗқңүұҳҷһәӣөӯ');

    // Presumably starts capturing what the document writes, so it has to be
    // installed before the CMap is emitted — confirm in ./helpers.
    const docData = logData(doc);
    font.toUnicodeCmap();
    const text = docData.map((d) => d.toString("utf8")).join("");

    // Walk every `<low> <high> [` line inside each beginbfrange/endbfrange
    // section, counting the codes covered and checking the range boundaries.
    let glyphs = 0;
    for (const block of text.matchAll(/beginbfrange\n((?:.|\n)*?)\nendbfrange/g)) {
      for (const line of block[1].matchAll(/^<([0-9a-f]+)>\s+<([0-9a-f]+)>\s+\[/igm)) {
        const low = parseInt(line[1], 16);
        const high = parseInt(line[2], 16);
        glyphs += high - low + 1;
        // Masking off the low byte leaves the 256-code block the code is in.
        expect(high & 0xFFFFFF00).toBe(low & 0xFFFFFF00);
      }
    }

    // Presumably the extra entry is glyph 0 (.notdef) — TODO confirm.
    expect(glyphs).toBe(398 + 1);
  });
});
});