Skip to content

Commit

Permalink
Ignore comments when parsing (#49)
Browse files Browse the repository at this point in the history
* Fix xref trailer Size bug

* Remove comments between indirect objects

* Update trimArrayAndRemoveComments tests

* Handle comments in PDFArrays

* Handle comments when parsing PDFBooleans

* Handle comments when parsing PDFDictionaries

* Handle comments when parsing PDFHexStrings

* Handle comments when parsing PDFIndirectObjects

* Handle carriage return EOL markers in comments

* Handle comments when parsing PDFIndirectReference objects

* Handle comments when parsing PDFName objects

* Handle comments when parsing PDFNull objects

* Handle comments when parsing PDFNumber objects

* Handle comments when parsing PDFStream and PDFObjectStream objects

* Handle comments when parsing PDFString objects

* Handle comments when parsing PDFTrailer objects

* Add more PDF comments to parseDict and parseArray tests

* Run linter

* Reset accidental changes to integration tests
  • Loading branch information
Hopding committed Nov 23, 2018
1 parent 6de1293 commit 2a1d000
Show file tree
Hide file tree
Showing 28 changed files with 360 additions and 49 deletions.
38 changes: 37 additions & 1 deletion __tests__/core/pdf-parser/parseArray.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@ describe(`parseArray`, () => {
expect(res[1]).toEqual(typedArrayFor('<< /Key /Val >>'));
});

it(`allows leading comments before the PDF Array object`, () => {
  // The array is preceded by a comment line that itself contains a second '%'.
  const bytes = typedArrayFor('% This is a % comment\n [(foo)] \n');
  const parsed = parseArray(bytes, PDFObjectIndex.create());

  // Shape check first, so a failed parse is reported before destructuring.
  expect(parsed).toEqual([expect.any(PDFArray), expect.any(Uint8Array)]);

  const [pdfArray, remainder] = parsed;
  expect(pdfArray.array).toEqual([expect.any(PDFString)]);
  expect(remainder).toEqual(typedArrayFor(''));
});

it(`allows comments before the PDF Array object's closing bracket`, () => {
  // The comment sits between the last element and the closing ']'.
  const bytes = typedArrayFor('[(foo)% This is a comment!\n]');
  const parsed = parseArray(bytes, PDFObjectIndex.create());

  expect(parsed).toEqual([expect.any(PDFArray), expect.any(Uint8Array)]);

  const [pdfArray, remainder] = parsed;
  expect(pdfArray.array).toEqual([expect.any(PDFString)]);
  expect(remainder).toEqual(typedArrayFor(''));
});

it(`parses nested PDF Arrays`, () => {
const input = typedArrayFor('[[[]]]');
const res = parseArray(input, PDFObjectIndex.create());
Expand All @@ -75,7 +91,27 @@ describe(`parseArray`, () => {
PDF Null
]`, () => {
const input = typedArrayFor(
'[/Foo << /Key /Val >> [] (Bar) 21 0 R 0.56 <ABC123> true null]',
`[
% Comment
/Foo % Comment
% Comment
<< /Key /Val >> % Comment
% Comment
[] % Comment
% Comment
(Bar) % Comment
% Comment
21 0 R % Comment
% Comment
0.56 % Comment
% Comment
<ABC123> % Comment
% Comment
true % Comment
% Comment
null % Comment
% Comment
]`,
);
const res = parseArray(input, PDFObjectIndex.create());
expect(res).toEqual([expect.any(PDFArray), expect.any(Uint8Array)]);
Expand Down
8 changes: 8 additions & 0 deletions __tests__/core/pdf-parser/parseBool.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ describe(`parseBool`, () => {
expect(res[1]).toEqual(typedArrayFor('FOOBAR'));
});

it(`allows leading comments before the PDFBoolean object`, () => {
  // Leading newline + comment are consumed; the comment after `true` is
  // expected to be left untouched in the remainder.
  const bytes = typedArrayFor('\n% This is a % comment!\ntrue% Another one!');
  const parsed = parseBool(bytes);

  expect(parsed).toEqual([expect.any(PDFBoolean), expect.any(Uint8Array)]);

  const [pdfBool, remainder] = parsed;
  expect(pdfBool.boolean).toEqual(true);
  expect(remainder).toEqual(typedArrayFor('% Another one!'));
});

it(`returns undefined when leading input is not a PDFBoolean`, () => {
const input = typedArrayFor('FOOBARtrue');
const res = parseBool(input);
Expand Down
66 changes: 56 additions & 10 deletions __tests__/core/pdf-parser/parseDict.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,30 @@ describe(`parseDict`, () => {
it(`allows leading whitespace and line endings before & after the PDF Dictionary object`, () => {
const input = typedArrayFor(' \n \r\n << /Foo /Bar >> \r\n [(foo)]');
const res = parseDict(input, PDFObjectIndex.create());
expect(res).toEqual([expect.any(PDFDictionary), expect.any(Uint8Array)]);

expect(res).toEqual([expect.any(PDFDictionary), expect.any(Uint8Array)]);
expect(res[0].get('Foo')).toBe(PDFName.from('Bar'));
expect(res[1]).toEqual(typedArrayFor('[(foo)]'));
});

it(`handles leading comments before the PDFDictionary object`, () => {
  // A comment line precedes the dictionary; a trailing comment follows it
  // and is expected to remain in the unparsed remainder.
  const bytes = typedArrayFor('% This is a comment!\n<< /Foo /Bar >>% Stuff');
  const parsed = parseDict(bytes, PDFObjectIndex.create());

  expect(parsed).toEqual([expect.any(PDFDictionary), expect.any(Uint8Array)]);

  const [dict, remainder] = parsed;
  expect(dict.get('Foo')).toBe(PDFName.from('Bar'));
  expect(remainder).toEqual(typedArrayFor('% Stuff'));
});

it(`handles comments before the PDFDictionary object's closing brackets`, () => {
  // The comment sits between the last value and the closing '>>'.
  const bytes = typedArrayFor('<< /Foo /Bar % Stuff\n >>');
  const parsed = parseDict(bytes, PDFObjectIndex.create());

  expect(parsed).toEqual([expect.any(PDFDictionary), expect.any(Uint8Array)]);

  const [dict, remainder] = parsed;
  expect(dict.get('Foo')).toBe(PDFName.from('Bar'));
  expect(remainder).toEqual(typedArrayFor(''));
});

it(`parses nested PDF Dictionaries`, () => {
const input = typedArrayFor(
'<< /First << /Second << /Third (Foo) >> >> >>',
Expand Down Expand Up @@ -93,15 +111,43 @@ describe(`parseDict`, () => {
]`, () => {
const input = typedArrayFor(`
<<
/PDFName /Foo
/PDFDictionary << /Key /Val >>
/PDFArray [1 (2)]
/PDFString (Look, a string!)
/PDFIndirectReference 21 0 R
/PDFNumber -.123
/PDFHexString <ABC123>
/PDFBoolean true
/PDFNull null
% Entry 1
/PDFName % Key
/Foo % Value
% Entry 2
/PDFDictionary % Key
<< /Key /Val >> % Value
% Entry 3
/PDFArray % Key
[1 (2)] % Value
% Entry 4
/PDFString % Key
(Look, a string!) % Value
% Entry 5
/PDFIndirectReference % Key
21 0 R % Value
% Entry 6
/PDFNumber % Key
-.123 % Value
% Entry 7
/PDFHexString % Key
<ABC123> % Value
% Entry 8
/PDFBoolean % Key
true % Value
% Entry 9
/PDFNull % Key
null % Value
% End
>>
`);
const res = parseDict(input, PDFObjectIndex.create());
Expand Down
9 changes: 9 additions & 0 deletions __tests__/core/pdf-parser/parseHexString.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,13 @@ describe(`parseHexString`, () => {
const res = parseHexString(input);
expect(res).toBeUndefined();
});

it(`handles leading comments before the PDFHexString object`, () => {
  // Input starts with a NUL byte followed by a comment line.
  const bytes = typedArrayFor('\u0000% This is a comment!\n<ABC123>');
  const parsed = parseHexString(bytes);

  expect(parsed).toEqual([expect.any(PDFHexString), expect.any(Uint8Array)]);

  const [hexString, remainder] = parsed;
  expect(hexString.string).toEqual('ABC123');
  expect(remainder).toEqual(typedArrayFor(''));
});
});
42 changes: 42 additions & 0 deletions __tests__/core/pdf-parser/parseIndirectObj.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,46 @@ describe(`parseIndirectObj`, () => {
const res = parseIndirectObj(input, PDFObjectIndex.create());
expect(res).toBeUndefined();
});

it(`handles leading comments before the PDFIndirectObject`, () => {
  // The comment line comes before the "0 1 obj" header.
  const source = `% This is a comment!\n0 1 obj\n(I'm a little teapot)\nendobj`;
  const parsed = parseIndirectObj(
    typedArrayFor(source),
    PDFObjectIndex.create(),
  );

  expect(parsed).toEqual([
    expect.any(PDFIndirectObject),
    expect.any(Uint8Array),
  ]);

  const [indirectObj] = parsed;
  expect(indirectObj.pdfObject).toEqual(expect.any(PDFString));
  expect(indirectObj.pdfObject.string).toEqual(`I'm a little teapot`);
  expect(indirectObj.reference).toEqual(PDFIndirectReference.forNumbers(0, 1));
});

it(`handles comments after the reference numbers of the PDFIndirectObject`, () => {
  // The comment line sits between the "0 1 obj" header and the object body.
  const source = `0 1 obj\n% This is a comment!\n(I'm a little teapot)\nendobj`;
  const parsed = parseIndirectObj(
    typedArrayFor(source),
    PDFObjectIndex.create(),
  );

  expect(parsed).toEqual([
    expect.any(PDFIndirectObject),
    expect.any(Uint8Array),
  ]);

  const [indirectObj] = parsed;
  expect(indirectObj.pdfObject).toEqual(expect.any(PDFString));
  expect(indirectObj.pdfObject.string).toEqual(`I'm a little teapot`);
  expect(indirectObj.reference).toEqual(PDFIndirectReference.forNumbers(0, 1));
});

it(`handles comments before the "endobj" keyword of the PDFIndirectObject`, () => {
  // The comment line sits between the object body and the "endobj" keyword.
  const source = `0 1 obj\n(I'm a little teapot)\n% This is a comment!\nendobj`;
  const parsed = parseIndirectObj(
    typedArrayFor(source),
    PDFObjectIndex.create(),
  );

  expect(parsed).toEqual([
    expect.any(PDFIndirectObject),
    expect.any(Uint8Array),
  ]);

  const [indirectObj] = parsed;
  expect(indirectObj.pdfObject).toEqual(expect.any(PDFString));
  expect(indirectObj.pdfObject.string).toEqual(`I'm a little teapot`);
  expect(indirectObj.reference).toEqual(PDFIndirectReference.forNumbers(0, 1));
});
});
11 changes: 11 additions & 0 deletions __tests__/core/pdf-parser/parseIndirectRef.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,15 @@ describe(`parseIndirectRef`, () => {
const res = parseIndirectRef(input);
expect(res).toBeUndefined();
});

it(`handles leading comments before the PDFIndirectReference object`, () => {
  // The comment is terminated by a bare carriage return rather than "\n".
  const bytes = typedArrayFor(`% This is a comment!\r1 1 RFoo`);
  const parsed = parseIndirectRef(bytes);

  expect(parsed).toEqual([
    expect.any(PDFIndirectReference),
    expect.any(Uint8Array),
  ]);

  const [reference, remainder] = parsed;
  expect(reference).toEqual(PDFIndirectReference.forNumbers(1, 1));
  expect(remainder).toEqual(typedArrayFor(`Foo`));
});
});
8 changes: 8 additions & 0 deletions __tests__/core/pdf-parser/parseName.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,12 @@ describe(`parseName`, () => {
});
});
});

it(`handles leading comments before the PDFName object`, () => {
  // Two consecutive comment lines, ended by "\n" and "\r\n" respectively.
  const bytes = typedArrayFor(
    '% This is a comment\n% And so is this!\r\n/Foo',
  );
  const parsed = parseName(bytes);

  const expected = [PDFName.from('Foo'), typedArrayFor('')];
  expect(parsed).toEqual(expected);
});
});
6 changes: 6 additions & 0 deletions __tests__/core/pdf-parser/parseNull.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,10 @@ describe(`parseNull`, () => {
const res = parseNull(input);
expect(res).toEqual([PDFNull.instance, typedArrayFor(' \r\n (foo)')]);
});

it(`handles leading comments before the PDF Null object`, () => {
  // Everything after the `null` keyword is expected back as the remainder.
  const bytes = typedArrayFor('% This is a comment!\r\n null \r\n (foo)');
  const parsed = parseNull(bytes);

  const expected = [PDFNull.instance, typedArrayFor(' \r\n (foo)')];
  expect(parsed).toEqual(expected);
});
});
8 changes: 8 additions & 0 deletions __tests__/core/pdf-parser/parseNumber.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,12 @@ describe(`parseNumber`, () => {
expect(res[0].number).toEqual(0.123);
expect(res[1]).toEqual(typedArrayFor('-.123'));
});

it(`handles leading comments before the PDFNumber object`, () => {
  // The comment ends with a bare "\r"; two numbers follow back to back and
  // only the first is expected to be consumed.
  const bytes = typedArrayFor('% This is a comment!\r+.123-.123');
  const parsed = parseNumber(bytes);

  expect(parsed).toEqual([expect.any(PDFNumber), expect.any(Uint8Array)]);

  const [pdfNumber, remainder] = parsed;
  expect(pdfNumber.number).toEqual(0.123);
  expect(remainder).toEqual(typedArrayFor('-.123'));
});
});
23 changes: 23 additions & 0 deletions __tests__/core/pdf-parser/parseStream.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,17 @@ describe(`parseStream`, () => {
typedArrayFor(`endobjstream\n...OTHER STUFF...\nendstreamendobj`),
);
});

it(`handles leading comments before the PDFStream object`, () => {
  // A "\r"-terminated comment precedes the "stream" keyword.
  const bytes = typedArrayFor(
    `% This is a comment\rstream\n...STUFF AND THINGZ...\nendstreamendobj`,
  );
  const objectIndex = PDFObjectIndex.create();
  const parsed = parseStream(
    bytes,
    PDFDictionary.from({}, objectIndex),
    objectIndex,
  );

  expect(parsed).toEqual([expect.any(PDFRawStream), expect.any(Uint8Array)]);

  const [rawStream, remainder] = parsed;
  expect(rawStream.content).toEqual(typedArrayFor(`...STUFF AND THINGZ...`));
  expect(remainder).toEqual(typedArrayFor('endobj'));
});
});

describe(`when parsing "object" streams`, () => {
Expand Down Expand Up @@ -177,5 +188,17 @@ describe(`parseStream`, () => {
);
expect(() => parseStream(input, errDict, index)).toThrowError();
});

it(`handles leading comments before the PDFObjectStream object`, () => {
  // Prefix the enclosing suite's `input` with a "\r"-terminated comment.
  const commentPrefix = typedArrayFor('% This is a comment!\r');
  const parsed = parseStream(
    mergeUint8Arrays(commentPrefix, input),
    dict,
    index,
  );

  expect(parsed).toEqual([
    expect.any(PDFObjectStream),
    expect.any(Uint8Array),
  ]);
});
});
});
8 changes: 8 additions & 0 deletions __tests__/core/pdf-parser/parseString.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,12 @@ describe(`parseString`, () => {
const res = parseString(input);
expect(res).toBeUndefined();
});

it(`handles leading comments before the PDFString object`, () => {
  // The '%' inside the parentheses belongs to the string and must be kept;
  // only the comment ahead of the opening '(' is skipped.
  const bytes = typedArrayFor('% This is a comment!\r(FOO%Bar\n)');
  const parsed = parseString(bytes);

  expect(parsed).toEqual([expect.any(PDFString), expect.any(Uint8Array)]);

  const [pdfString, remainder] = parsed;
  expect(pdfString.string).toEqual('FOO%Bar\n');
  expect(remainder).toEqual(typedArrayFor(''));
});
});
21 changes: 20 additions & 1 deletion __tests__/core/pdf-parser/parseTrailer.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import {
parseTrailerWithoutDict,
} from 'core/pdf-parser/parseTrailer';
import { PDFTrailer } from 'core/pdf-structures';
import { typedArrayFor } from 'utils';
import { arrayToString, typedArrayFor } from 'utils';

describe(`parseTrailer`, () => {
it(`parses a single PDF Trailer object from its input array`, () => {
Expand Down Expand Up @@ -114,4 +114,23 @@ describe(`parseTrailerWithoutDict`, () => {
expect.any(PDFTrailer),
);
});

it(`handles leading comments before the PDFTrailer object`, () => {
// Verifies that parseTrailer accepts input whose "trailer" keyword is
// preceded by a newline and a comment line.
const input = typedArrayFor(`
% This is a comment!
trailer
<< /Root 1 0 R /Size 5 >>
startxref
565
%%EOF
`);
const res = parseTrailer(input, PDFObjectIndex.create());
// The result is a [PDFTrailer, remaining bytes] pair.
expect(res).toEqual([expect.any(PDFTrailer), expect.any(Uint8Array)]);
// 565 is the number that follows "startxref" in the input above.
expect(res[0].offset).toEqual(565);
expect(res[0].dictionary).toEqual(expect.any(PDFDictionary));
// "/Root 1 0 R" in the trailer dictionary should resolve to reference (1, 0).
expect(res[0].dictionary.get('Root')).toEqual(
PDFIndirectReference.forNumbers(1, 0),
);
// NOTE(review): the expected remainder depends on the exact whitespace that
// follows "%%EOF" inside the template literal — confirm against the
// indentation used in the original file.
expect(res[1]).toEqual(typedArrayFor('\n '));
});
});
47 changes: 47 additions & 0 deletions __tests__/utils/index.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import {
arrayToString,
trimArrayAndRemoveComments,
typedArrayFor,
} from 'utils';

describe(`trimArrayAndRemoveComments`, () => {
  // Encodes `text` as bytes and runs it through the function under test.
  const trim = (text: string) =>
    trimArrayAndRemoveComments(typedArrayFor(text));

  it(`removes leading PDF comments from its input`, () => {
    // Two back-to-back comment lines precede the real content.
    const trimmed = trim(
      '% I am a comment!\n%I am a comment too!\nThis is not a comment. \n ',
    );
    expect(trimmed).toEqual(typedArrayFor('This is not a comment. \n '));
  });

  it(`removes leading whitespace and PDF comments from its input`, () => {
    // Whitespace and a comment are mixed ahead of the real content.
    const trimmed = trim(
      ' \n %I am a comment too!\nThis is not a comment. \n ',
    );
    expect(trimmed).toEqual(typedArrayFor('This is not a comment. \n '));
  });

  it(`removes leading whitespace from its input`, () => {
    // No comments at all: only the leading whitespace is dropped.
    const trimmed = trim(' \n \nThis is not a comment. \n ');
    expect(trimmed).toEqual(typedArrayFor('This is not a comment. \n '));
  });

  it(`returns its input when there are no leading PDF comments or whitespace`, () => {
    // Trailing whitespace is not touched.
    const trimmed = trim('This is not a comment. \n ');
    expect(trimmed).toEqual(typedArrayFor('This is not a comment. \n '));
  });

  it(`returns its input when the comment's newline is missing`, () => {
    // A '%' run with no EOL marker is returned unchanged.
    const trimmed = trim('% This is not a complete comment');
    expect(trimmed).toEqual(typedArrayFor('% This is not a complete comment'));
  });

  it(`handles "\\n" and "\\r" EOL markers`, () => {
    // Comments terminated by "\n", "\r" and "\r\n" in sequence.
    const trimmed = trim('% First\n%Second\r% Third\r\nFoo');
    expect(trimmed).toEqual(typedArrayFor('Foo'));
  });
});
Loading

0 comments on commit 2a1d000

Please sign in to comment.