Ignore comments when parsing (#49)

* Fix xref trailer Size bug
* Remove comments between indirect objects
* Update trimArrayAndRemoveComments tests
* Handle comments in PDFArrays
* Handle comments when parsing PDFBooleans
* Handle comments when parsing PDFDictionaries
* Handle comments when parsing PDFHexStrings
* Handle comments when parsing PDFIndirectObjects
* Handle carriage return EOL markers in comments
* Handle comments when parsing PDFIndirectReference objects
* Handle comments when parsing PDFName objects
* Handle comments when parsing PDFNull objects
* Handle comments when parsing PDFNumber objects
* Handle comments when parsing PDFStream and PDFObjectStream objects
* Handle comments when parsing PDFString objects
* Handle comments when parsing PDFTrailer objects
* Add more PDF comments to parseDict and parseArray tests
* Run linter
* Reset accidental changes to integration tests
Hopding · Nov 23, 2018 · 2a1d000 · 2a1d000
1 parent 6de1293
commit 2a1d000
Show file tree

Hide file tree

Showing 28 changed files with 360 additions and 49 deletions.
diff --git a/__tests__/core/pdf-parser/parseArray.spec.ts b/__tests__/core/pdf-parser/parseArray.spec.ts
@@ -50,6 +50,22 @@ describe(`parseArray`, () => {
     expect(res[1]).toEqual(typedArrayFor('<< /Key /Val >>'));
   });
 
+  it(`allows leading comments before the PDF Array object`, () => {
+    const input = typedArrayFor('% This is a % comment\n [(foo)] \n');
+    const res = parseArray(input, PDFObjectIndex.create());
+    expect(res).toEqual([expect.any(PDFArray), expect.any(Uint8Array)]);
+    expect(res[0].array).toEqual([expect.any(PDFString)]);
+    expect(res[1]).toEqual(typedArrayFor(''));
+  });
+
+  it(`allows comments before the PDF Array object's closing bracket`, () => {
+    const input = typedArrayFor('[(foo)% This is a comment!\n]');
+    const res = parseArray(input, PDFObjectIndex.create());
+    expect(res).toEqual([expect.any(PDFArray), expect.any(Uint8Array)]);
+    expect(res[0].array).toEqual([expect.any(PDFString)]);
+    expect(res[1]).toEqual(typedArrayFor(''));
+  });
+
   it(`parses nested PDF Arrays`, () => {
     const input = typedArrayFor('[[[]]]');
     const res = parseArray(input, PDFObjectIndex.create());
@@ -75,7 +91,27 @@ describe(`parseArray`, () => {
       PDF Null
     ]`, () => {
     const input = typedArrayFor(
-      '[/Foo << /Key /Val >> [] (Bar) 21 0 R 0.56 <ABC123> true null]',
+      `[
+        % Comment
+        /Foo % Comment
+        % Comment
+        << /Key /Val >> % Comment
+        % Comment
+        [] % Comment
+        % Comment
+        (Bar) % Comment
+        % Comment
+        21 0 R % Comment
+        % Comment
+        0.56 % Comment
+        % Comment
+        <ABC123> % Comment
+        % Comment
+        true % Comment
+        % Comment
+        null % Comment
+        % Comment
+      ]`,
     );
     const res = parseArray(input, PDFObjectIndex.create());
     expect(res).toEqual([expect.any(PDFArray), expect.any(Uint8Array)]);

diff --git a/__tests__/core/pdf-parser/parseBool.spec.ts b/__tests__/core/pdf-parser/parseBool.spec.ts
@@ -19,6 +19,14 @@ describe(`parseBool`, () => {
     expect(res[1]).toEqual(typedArrayFor('FOOBAR'));
   });
 
+  it(`allows leading comments before the PDFBoolean object`, () => {
+    const input = typedArrayFor('\n% This is a % comment!\ntrue% Another one!');
+    const res = parseBool(input);
+    expect(res).toEqual([expect.any(PDFBoolean), expect.any(Uint8Array)]);
+    expect(res[0].boolean).toEqual(true);
+    expect(res[1]).toEqual(typedArrayFor('% Another one!'));
+  });
+
   it(`returns undefined when leading input is not a PDFBoolean`, () => {
     const input = typedArrayFor('FOOBARtrue');
     const res = parseBool(input);

diff --git a/__tests__/core/pdf-parser/parseDict.spec.ts b/__tests__/core/pdf-parser/parseDict.spec.ts
@@ -51,12 +51,30 @@ describe(`parseDict`, () => {
   it(`allows leading whitespace and line endings before & after the PDF Dictionary object`, () => {
     const input = typedArrayFor(' \n \r\n << /Foo /Bar >> \r\n [(foo)]');
     const res = parseDict(input, PDFObjectIndex.create());
-    expect(res).toEqual([expect.any(PDFDictionary), expect.any(Uint8Array)]);
 
+    expect(res).toEqual([expect.any(PDFDictionary), expect.any(Uint8Array)]);
     expect(res[0].get('Foo')).toBe(PDFName.from('Bar'));
     expect(res[1]).toEqual(typedArrayFor('[(foo)]'));
   });
 
+  it(`handles leading comments before the PDFDictionary object`, () => {
+    const input = typedArrayFor('% This is a comment!\n<< /Foo /Bar >>% Stuff');
+    const res = parseDict(input, PDFObjectIndex.create());
+
+    expect(res).toEqual([expect.any(PDFDictionary), expect.any(Uint8Array)]);
+    expect(res[0].get('Foo')).toBe(PDFName.from('Bar'));
+    expect(res[1]).toEqual(typedArrayFor('% Stuff'));
+  });
+
+  it(`handles comments before the PDFDictionary object's closing brackets`, () => {
+    const input = typedArrayFor('<< /Foo /Bar % Stuff\n >>');
+    const res = parseDict(input, PDFObjectIndex.create());
+
+    expect(res).toEqual([expect.any(PDFDictionary), expect.any(Uint8Array)]);
+    expect(res[0].get('Foo')).toBe(PDFName.from('Bar'));
+    expect(res[1]).toEqual(typedArrayFor(''));
+  });
+
   it(`parses nested PDF Dictionaries`, () => {
     const input = typedArrayFor(
       '<< /First << /Second << /Third (Foo) >> >> >>',
@@ -93,15 +111,43 @@ describe(`parseDict`, () => {
     ]`, () => {
     const input = typedArrayFor(`
       <<
-        /PDFName /Foo
-        /PDFDictionary << /Key /Val >>
-        /PDFArray [1 (2)]
-        /PDFString (Look, a string!)
-        /PDFIndirectReference 21 0 R
-        /PDFNumber -.123
-        /PDFHexString <ABC123>
-        /PDFBoolean true
-        /PDFNull null
+        % Entry 1
+        /PDFName % Key
+        /Foo     % Value
+
+        % Entry 2
+        /PDFDictionary  % Key
+        << /Key /Val >> % Value
+
+        % Entry 3
+        /PDFArray % Key
+        [1 (2)]   % Value
+
+        % Entry 4
+        /PDFString        % Key
+        (Look, a string!) % Value
+
+        % Entry 5
+        /PDFIndirectReference % Key
+        21 0 R                % Value
+
+        % Entry 6
+        /PDFNumber % Key
+        -.123      % Value
+
+        % Entry 7
+        /PDFHexString % Key
+        <ABC123>      % Value
+
+        % Entry 8
+        /PDFBoolean % Key
+        true        % Value
+
+        % Entry 9
+        /PDFNull % Key
+        null     % Value
+
+        % End
       >>
     `);
     const res = parseDict(input, PDFObjectIndex.create());

diff --git a/__tests__/core/pdf-parser/parseHexString.spec.ts b/__tests__/core/pdf-parser/parseHexString.spec.ts
@@ -43,4 +43,13 @@ describe(`parseHexString`, () => {
     const res = parseHexString(input);
     expect(res).toBeUndefined();
   });
+
+  it(`handles leading comments before the PDFHexString object`, () => {
+    const input = typedArrayFor('\u0000% This is a comment!\n<ABC123>');
+    const res = parseHexString(input);
+
+    expect(res).toEqual([expect.any(PDFHexString), expect.any(Uint8Array)]);
+    expect(res[0].string).toEqual('ABC123');
+    expect(res[1]).toEqual(typedArrayFor(''));
+  });
 });
diff --git a/__tests__/core/pdf-parser/parseIndirectObj.spec.ts b/__tests__/core/pdf-parser/parseIndirectObj.spec.ts
@@ -153,4 +153,46 @@ describe(`parseIndirectObj`, () => {
     const res = parseIndirectObj(input, PDFObjectIndex.create());
     expect(res).toBeUndefined();
   });
+
+  it(`handles leading comments before the PDFIndirectObject`, () => {
+    const input = typedArrayFor(
+      `% This is a comment!\n0 1 obj\n(I'm a little teapot)\nendobj`,
+    );
+    const res = parseIndirectObj(input, PDFObjectIndex.create());
+    expect(res).toEqual([
+      expect.any(PDFIndirectObject),
+      expect.any(Uint8Array),
+    ]);
+    expect(res[0].pdfObject).toEqual(expect.any(PDFString));
+    expect(res[0].pdfObject.string).toEqual(`I'm a little teapot`);
+    expect(res[0].reference).toEqual(PDFIndirectReference.forNumbers(0, 1));
+  });
+
+  it(`handles comments after the reference numbers of the PDFIndirectObject`, () => {
+    const input = typedArrayFor(
+      `0 1 obj\n% This is a comment!\n(I'm a little teapot)\nendobj`,
+    );
+    const res = parseIndirectObj(input, PDFObjectIndex.create());
+    expect(res).toEqual([
+      expect.any(PDFIndirectObject),
+      expect.any(Uint8Array),
+    ]);
+    expect(res[0].pdfObject).toEqual(expect.any(PDFString));
+    expect(res[0].pdfObject.string).toEqual(`I'm a little teapot`);
+    expect(res[0].reference).toEqual(PDFIndirectReference.forNumbers(0, 1));
+  });
+
+  it(`handles comments before the "endobj" keyword of the PDFIndirectObject`, () => {
+    const input = typedArrayFor(
+      `0 1 obj\n(I'm a little teapot)\n% This is a comment!\nendobj`,
+    );
+    const res = parseIndirectObj(input, PDFObjectIndex.create());
+    expect(res).toEqual([
+      expect.any(PDFIndirectObject),
+      expect.any(Uint8Array),
+    ]);
+    expect(res[0].pdfObject).toEqual(expect.any(PDFString));
+    expect(res[0].pdfObject.string).toEqual(`I'm a little teapot`);
+    expect(res[0].reference).toEqual(PDFIndirectReference.forNumbers(0, 1));
+  });
 });
diff --git a/__tests__/core/pdf-parser/parseIndirectRef.spec.ts b/__tests__/core/pdf-parser/parseIndirectRef.spec.ts
@@ -59,4 +59,15 @@ describe(`parseIndirectRef`, () => {
     const res = parseIndirectRef(input);
     expect(res).toBeUndefined();
   });
+
+  it(`handles leading comments before the PDFIndirectReference object`, () => {
+    const input = typedArrayFor(`% This is a comment!\r1 1 RFoo`);
+    const res = parseIndirectRef(input);
+    expect(res).toEqual([
+      expect.any(PDFIndirectReference),
+      expect.any(Uint8Array),
+    ]);
+    expect(res[0]).toEqual(PDFIndirectReference.forNumbers(1, 1));
+    expect(res[1]).toEqual(typedArrayFor(`Foo`));
+  });
 });
diff --git a/__tests__/core/pdf-parser/parseName.spec.ts b/__tests__/core/pdf-parser/parseName.spec.ts
@@ -87,4 +87,12 @@ describe(`parseName`, () => {
       });
     });
   });
+
+  it(`handles leading comments before the PDFName object`, () => {
+    const input = typedArrayFor(
+      '% This is a comment\n% And so is this!\r\n/Foo',
+    );
+    const res = parseName(input);
+    expect(res).toEqual([PDFName.from('Foo'), typedArrayFor('')]);
+  });
 });
diff --git a/__tests__/core/pdf-parser/parseNull.spec.ts b/__tests__/core/pdf-parser/parseNull.spec.ts
@@ -29,4 +29,10 @@ describe(`parseNull`, () => {
     const res = parseNull(input);
     expect(res).toEqual([PDFNull.instance, typedArrayFor(' \r\n (foo)')]);
   });
+
+  it(`handles leading comments before the PDF Null object`, () => {
+    const input = typedArrayFor('% This is a comment!\r\n null \r\n (foo)');
+    const res = parseNull(input);
+    expect(res).toEqual([PDFNull.instance, typedArrayFor(' \r\n (foo)')]);
+  });
 });
diff --git a/__tests__/core/pdf-parser/parseNumber.spec.ts b/__tests__/core/pdf-parser/parseNumber.spec.ts
@@ -51,4 +51,12 @@ describe(`parseNumber`, () => {
     expect(res[0].number).toEqual(0.123);
     expect(res[1]).toEqual(typedArrayFor('-.123'));
   });
+
+  it(`handles leading comments before the PDFNumber object`, () => {
+    const input = typedArrayFor('% This is a comment!\r+.123-.123');
+    const res = parseNumber(input);
+    expect(res).toEqual([expect.any(PDFNumber), expect.any(Uint8Array)]);
+    expect(res[0].number).toEqual(0.123);
+    expect(res[1]).toEqual(typedArrayFor('-.123'));
+  });
 });
diff --git a/__tests__/core/pdf-parser/parseStream.spec.ts b/__tests__/core/pdf-parser/parseStream.spec.ts
@@ -124,6 +124,17 @@ describe(`parseStream`, () => {
         typedArrayFor(`endobjstream\n...OTHER STUFF...\nendstreamendobj`),
       );
     });
+
+    it(`handles leading comments before the PDFStream object`, () => {
+      const input = typedArrayFor(
+        `% This is a comment\rstream\n...STUFF AND THINGZ...\nendstreamendobj`,
+      );
+      const index = PDFObjectIndex.create();
+      const res = parseStream(input, PDFDictionary.from({}, index), index);
+      expect(res).toEqual([expect.any(PDFRawStream), expect.any(Uint8Array)]);
+      expect(res[0].content).toEqual(typedArrayFor(`...STUFF AND THINGZ...`));
+      expect(res[1]).toEqual(typedArrayFor('endobj'));
+    });
   });
 
   describe(`when parsing "object" streams`, () => {
@@ -177,5 +188,17 @@ describe(`parseStream`, () => {
       );
       expect(() => parseStream(input, errDict, index)).toThrowError();
     });
+
+    it(`handles leading comments before the PDFObjectStream object`, () => {
+      const commentedInput = mergeUint8Arrays(
+        typedArrayFor('% This is a comment!\r'),
+        input,
+      );
+      const res = parseStream(commentedInput, dict, index);
+      expect(res).toEqual([
+        expect.any(PDFObjectStream),
+        expect.any(Uint8Array),
+      ]);
+    });
   });
 });
diff --git a/__tests__/core/pdf-parser/parseString.spec.ts b/__tests__/core/pdf-parser/parseString.spec.ts
@@ -67,4 +67,12 @@ describe(`parseString`, () => {
     const res = parseString(input);
     expect(res).toBeUndefined();
   });
+
+  it(`handles leading comments before the PDFString object`, () => {
+    const input = typedArrayFor('% This is a comment!\r(FOO%Bar\n)');
+    const res = parseString(input);
+    expect(res).toEqual([expect.any(PDFString), expect.any(Uint8Array)]);
+    expect(res[0].string).toEqual('FOO%Bar\n');
+    expect(res[1]).toEqual(typedArrayFor(''));
+  });
 });
diff --git a/__tests__/core/pdf-parser/parseTrailer.spec.ts b/__tests__/core/pdf-parser/parseTrailer.spec.ts
@@ -5,7 +5,7 @@ import {
   parseTrailerWithoutDict,
 } from 'core/pdf-parser/parseTrailer';
 import { PDFTrailer } from 'core/pdf-structures';
-import { typedArrayFor } from 'utils';
+import { arrayToString, typedArrayFor } from 'utils';
 
 describe(`parseTrailer`, () => {
   it(`parses a single PDF Trailer object from its input array`, () => {
@@ -114,4 +114,23 @@ describe(`parseTrailerWithoutDict`, () => {
       expect.any(PDFTrailer),
     );
   });
+
+  it(`handles leading comments before the PDFTrailer object`, () => {
+    const input = typedArrayFor(`
+      % This is a comment!
+      trailer
+      << /Root 1 0 R /Size 5 >>
+      startxref
+      565
+      %%EOF
+    `);
+    const res = parseTrailer(input, PDFObjectIndex.create());
+    expect(res).toEqual([expect.any(PDFTrailer), expect.any(Uint8Array)]);
+    expect(res[0].offset).toEqual(565);
+    expect(res[0].dictionary).toEqual(expect.any(PDFDictionary));
+    expect(res[0].dictionary.get('Root')).toEqual(
+      PDFIndirectReference.forNumbers(1, 0),
+    );
+    expect(res[1]).toEqual(typedArrayFor('\n    '));
+  });
 });
diff --git a/__tests__/utils/index.spec.ts b/__tests__/utils/index.spec.ts
@@ -0,0 +1,47 @@
+import {
+  arrayToString,
+  trimArrayAndRemoveComments,
+  typedArrayFor,
+} from 'utils';
+
+describe(`trimArrayAndRemoveComments`, () => {
+  it(`removes leading PDF comments from its input`, () => {
+    const input = typedArrayFor(
+      '% I am a comment!\n%I am a comment too!\nThis is not a comment. \n ',
+    );
+    const res = trimArrayAndRemoveComments(input);
+    expect(res).toEqual(typedArrayFor('This is not a comment. \n '));
+  });
+
+  it(`removes leading whitespace and PDF comments from its input`, () => {
+    const input = typedArrayFor(
+      '   \n  %I am a comment too!\nThis is not a comment. \n ',
+    );
+    const res = trimArrayAndRemoveComments(input);
+    expect(res).toEqual(typedArrayFor('This is not a comment. \n '));
+  });
+
+  it(`removes leading whitespace from its input`, () => {
+    const input = typedArrayFor('   \n \nThis is not a comment. \n ');
+    const res = trimArrayAndRemoveComments(input);
+    expect(res).toEqual(typedArrayFor('This is not a comment. \n '));
+  });
+
+  it(`returns its input when there are no leading PDF comments or whitespace`, () => {
+    const input = typedArrayFor('This is not a comment. \n ');
+    const res = trimArrayAndRemoveComments(input);
+    expect(res).toEqual(typedArrayFor('This is not a comment. \n '));
+  });
+
+  it(`returns its input when the comment's newline is missing`, () => {
+    const input = typedArrayFor('% This is not a complete comment');
+    const res = trimArrayAndRemoveComments(input);
+    expect(res).toEqual(typedArrayFor('% This is not a complete comment'));
+  });
+
+  it(`handles "\\n" and "\\r" EOL markers`, () => {
+    const input = typedArrayFor('% First\n%Second\r% Third\r\nFoo');
+    const res = trimArrayAndRemoveComments(input);
+    expect(res).toEqual(typedArrayFor('Foo'));
+  });
+});