Skip to content

Commit

Permalink
- fix issues with stream being truncated
Browse files Browse the repository at this point in the history
  • Loading branch information
Tiefseetauchner committed Sep 14, 2024
1 parent 2f56b5b commit a14425e
Show file tree
Hide file tree
Showing 14 changed files with 100 additions and 72 deletions.
30 changes: 30 additions & 0 deletions Pdfer/EncodingDetector.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace Pdfer;

/// <summary>
/// Detects the text encoding of a stream by inspecting a leading byte order mark (BOM).
/// </summary>
public class EncodingDetector : IEncodingDetector
{
    // Number of leading bytes inspected; the longest BOM checked here (UTF-8) is three bytes.
    private const int ProbeLength = 3;

    /// <summary>
    /// Reads three bytes from the current position of <paramref name="stream"/> and maps a
    /// recognised BOM to its encoding.
    /// </summary>
    /// <remarks>
    /// The stream is not rewound: after this call it is positioned past the three probed bytes.
    /// </remarks>
    /// <param name="stream">The stream to probe, positioned where a BOM would start.</param>
    /// <returns>
    /// <see cref="Encoding.UTF8"/> for <c>EF BB BF</c>, <see cref="Encoding.BigEndianUnicode"/> for
    /// <c>FE FF</c>, <see cref="Encoding.Unicode"/> for <c>FF FE</c>, otherwise
    /// <see cref="Encoding.Default"/>.
    /// </returns>
    /// <exception cref="IOException">The stream ended before three bytes could be read.</exception>
    public async Task<Encoding> DetectEncoding(Stream stream)
    {
        var bytes = new byte[ProbeLength];
        var filled = 0;

        // A single ReadAsync call may return fewer bytes than requested without the stream
        // being exhausted, so keep reading until the probe buffer is full. Only a read that
        // returns zero bytes signals the real end of the stream.
        while (filled < bytes.Length)
        {
            var read = await stream.ReadAsync(bytes, filled, bytes.Length - filled);
            if (read == 0)
                throw new IOException("Unexpected end of stream");

            filled += read;
        }

        if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF)
            return Encoding.UTF8;

        // The UTF-16 marks are two bytes long; the third probed byte is ignored for them.
        if (bytes[0] == 0xFE && bytes[1] == 0xFF)
            return Encoding.BigEndianUnicode;
        if (bytes[0] == 0xFF && bytes[1] == 0xFE)
            return Encoding.Unicode;

        // No recognised BOM.
        return Encoding.Default;
    }
}

/// <summary>
/// Abstraction for components that determine the text encoding of a stream.
/// </summary>
public interface IEncodingDetector
{
/// <summary>
/// Asynchronously determines the encoding of the data in <paramref name="stream"/>.
/// </summary>
/// <param name="stream">The stream to inspect.</param>
/// <returns>A task that yields the detected <see cref="Encoding"/>.</returns>
// NOTE(review): whether the stream position is advanced or preserved is up to the
// implementation; callers should confirm before relying on it.
Task<Encoding> DetectEncoding(Stream stream);
}
2 changes: 1 addition & 1 deletion Pdfer/ObjectIdentifier.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public static ObjectIdentifier ParseReference(string objectIdentifier)
}

public byte[] GetHeaderBytes() =>
Encoding.ASCII.GetBytes(GetHeaderString());
Encoding.UTF8.GetBytes(GetHeaderString());

public string GetHeaderString() =>
$"{ObjectNumber} {Generation} obj\n";
Expand Down
13 changes: 4 additions & 9 deletions Pdfer/Objects/ArrayObjectSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,10 @@ namespace Pdfer.Objects;

public class ArrayObjectSerializer(IPdfArrayHelper pdfArrayHelper) : IDocumentObjectSerializer<ArrayObject>
{
public async Task<byte[]> Serialize(ArrayObject documentObject)
public async Task Serialize(Stream stream, ArrayObject documentObject)
{
using var memoryStream = new MemoryStream();

await memoryStream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await pdfArrayHelper.WriteArray(memoryStream, documentObject.Value);
await memoryStream.WriteAsync("\nendobj"u8.ToArray());

return memoryStream.ToArray();

await stream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await pdfArrayHelper.WriteArray(stream, documentObject.Value);
await stream.WriteAsync("\nendobj"u8.ToArray());
}
}
12 changes: 4 additions & 8 deletions Pdfer/Objects/DictionaryObjectSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,10 @@ namespace Pdfer.Objects;

public class DictionaryObjectSerializer(IPdfDictionaryHelper pdfDictionaryHelper) : IDocumentObjectSerializer<DictionaryObject>
{
public async Task<byte[]> Serialize(DictionaryObject documentObject)
public async Task Serialize(Stream stream, DictionaryObject documentObject)
{
using var memoryStream = new MemoryStream();

await memoryStream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await pdfDictionaryHelper.WriteDictionary(memoryStream, documentObject.Value);
await memoryStream.WriteAsync("\nendobj"u8.ToArray());

return memoryStream.ToArray();
await stream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await pdfDictionaryHelper.WriteDictionary(stream, documentObject.Value);
await stream.WriteAsync("\nendobj"u8.ToArray());
}
}
3 changes: 2 additions & 1 deletion Pdfer/Objects/IDocumentObjectSerializer.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
using System.IO;
using System.Threading.Tasks;

namespace Pdfer.Objects;

public interface IDocumentObjectSerializer<T> where T : DocumentObject
{
Task<byte[]> Serialize(T documentObject);
Task Serialize(Stream stream, T documentObject);
}
12 changes: 4 additions & 8 deletions Pdfer/Objects/NameObjectSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,10 @@ namespace Pdfer.Objects;

public class NameObjectSerializer : IDocumentObjectSerializer<NameObject>
{
public async Task<byte[]> Serialize(NameObject documentObject)
public async Task Serialize(Stream stream, NameObject documentObject)
{
using var memoryStream = new MemoryStream();

await memoryStream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await memoryStream.WriteAsync(Encoding.ASCII.GetBytes(documentObject.Value));
await memoryStream.WriteAsync("\nendobj"u8.ToArray());

return memoryStream.ToArray();
await stream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await stream.WriteAsync(Encoding.UTF8.GetBytes(documentObject.Value));
await stream.WriteAsync("\nendobj"u8.ToArray());
}
}
14 changes: 5 additions & 9 deletions Pdfer/Objects/NumberObjectSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,20 @@ namespace Pdfer.Objects;

public class NumberObjectSerializer() : IDocumentObjectSerializer<NumberObject>
{
public async Task<byte[]> Serialize(NumberObject documentObject)
public async Task Serialize(Stream stream, NumberObject documentObject)
{
using var memoryStream = new MemoryStream();

await memoryStream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await stream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());

switch (documentObject)
{
case FloatObject floatObject:
await memoryStream.WriteAsync(Encoding.ASCII.GetBytes(floatObject.Value.ToString(CultureInfo.InvariantCulture)));
await stream.WriteAsync(Encoding.UTF8.GetBytes(floatObject.Value.ToString(CultureInfo.InvariantCulture)));
break;
case IntegerObject integerObject:
await memoryStream.WriteAsync(Encoding.ASCII.GetBytes(integerObject.Value.ToString()));
await stream.WriteAsync(Encoding.UTF8.GetBytes(integerObject.Value.ToString()));
break;
}

await memoryStream.WriteAsync("\nendobj"u8.ToArray());

return memoryStream.ToArray();
await stream.WriteAsync("\nendobj"u8.ToArray());
}
}
2 changes: 1 addition & 1 deletion Pdfer/Objects/PdfObjectReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public async Task<DocumentObject> Read(Stream stream, XRefEntry xRefEntry, Objec

var buffer = new byte[7];
_ = await stream.ReadAsync(buffer);
var contentAfterDictionary = Encoding.ASCII.GetString(buffer);
var contentAfterDictionary = Encoding.UTF8.GetString(buffer);
stream.Position = streamPositionAfterObjectStart;

if (contentAfterDictionary.Trim().StartsWith("stream"))
Expand Down
3 changes: 3 additions & 0 deletions Pdfer/Objects/StreamObjectReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ public async Task<StreamObject> Read(Stream stream, IObjectRepository objectRepo
?? throw new ArgumentException("Invalid length of stream object");
stream.Position = oldPosition;

await streamHelper.ReadStreamTo("stream", stream);
await streamHelper.SkipWhiteSpaceCharacters(stream);

var buffer = new byte[length];
var bytesRead = await stream.ReadAsync(buffer);

Expand Down
17 changes: 7 additions & 10 deletions Pdfer/Objects/StreamObjectSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,13 @@ namespace Pdfer.Objects;

public class StreamObjectSerializer(PdfDictionaryHelper pdfDictionaryHelper) : IDocumentObjectSerializer<StreamObject>
{
public async Task<byte[]> Serialize(StreamObject documentObject)
public async Task Serialize(Stream stream, StreamObject documentObject)
{
using var memoryStream = new MemoryStream();

await memoryStream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await pdfDictionaryHelper.WriteDictionary(memoryStream, documentObject.Dictionary);
await memoryStream.WriteAsync(documentObject.Value);
await memoryStream.WriteAsync("\nendstream"u8.ToArray());
await memoryStream.WriteAsync("\nendobj"u8.ToArray());

return memoryStream.ToArray();
await stream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await pdfDictionaryHelper.WriteDictionary(stream, documentObject.Dictionary);
await stream.WriteAsync("\nstream\n"u8.ToArray());
await stream.WriteAsync(documentObject.Value);
await stream.WriteAsync("\nendstream"u8.ToArray());
await stream.WriteAsync("\nendobj"u8.ToArray());
}
}
14 changes: 5 additions & 9 deletions Pdfer/Objects/StringObjectSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,11 @@ namespace Pdfer.Objects;

public class StringObjectSerializer : IDocumentObjectSerializer<StringObject>
{
public async Task<byte[]> Serialize(StringObject documentObject)
public async Task Serialize(Stream stream, StringObject documentObject)
{
using var memoryStream = new MemoryStream();

await memoryStream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await memoryStream.WriteAsync("\n"u8.ToArray());
await memoryStream.WriteAsync(Encoding.UTF8.GetBytes(documentObject.Value));
await memoryStream.WriteAsync("\nendobj"u8.ToArray());

return memoryStream.ToArray();
await stream.WriteAsync(documentObject.ObjectIdentifier.GetHeaderBytes());
await stream.WriteAsync("\n"u8.ToArray());
await stream.WriteAsync(Encoding.UTF8.GetBytes(documentObject.Value));
await stream.WriteAsync("\nendobj"u8.ToArray());
}
}
2 changes: 1 addition & 1 deletion Pdfer/PdfArrayHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ public async Task WriteArray(Stream stream, string[] array)

foreach (var value in array)
{
await stream.WriteAsync(Encoding.ASCII.GetBytes(value));
await stream.WriteAsync(Encoding.UTF8.GetBytes(value));
await stream.WriteAsync(" "u8.ToArray());
}

Expand Down
8 changes: 6 additions & 2 deletions Pdfer/PdfDictionaryHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,13 @@ public class PdfDictionaryHelper(IStreamHelper streamHelper) : IPdfDictionaryHel
switch (character)
{
case '<' when bufferStringBuilder.Length == 0 || bufferStringBuilder[^1] != '\\':
case '[' when bufferStringBuilder.Length == 0 || bufferStringBuilder[^1] != '\\':
case '(' when bufferStringBuilder.Length == 0 || bufferStringBuilder[^1] != '\\':
bracketDepth++;
break;
case '>' when bufferStringBuilder.Length == 0 || bufferStringBuilder[^1] != '\\':
case ']' when bufferStringBuilder.Length == 0 || bufferStringBuilder[^1] != '\\':
case ')' when bufferStringBuilder.Length == 0 || bufferStringBuilder[^1] != '\\':
bracketDepth--;
break;
}
Expand Down Expand Up @@ -104,9 +108,9 @@ public async Task WriteDictionary(Stream stream, Dictionary<string, string> dict
foreach (var (key, value) in dictionary)
{
await stream.WriteAsync("\n"u8.ToArray());
await stream.WriteAsync(Encoding.ASCII.GetBytes(key));
await stream.WriteAsync(Encoding.UTF8.GetBytes(key));
await stream.WriteAsync(" "u8.ToArray());
await stream.WriteAsync(Encoding.ASCII.GetBytes(value));
await stream.WriteAsync(Encoding.UTF8.GetBytes(value));
}

await stream.WriteAsync(">>"u8.ToArray());
Expand Down
40 changes: 27 additions & 13 deletions Pdfer/PdfDocumentWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,20 +71,34 @@ private async Task<XRefTable> WriteBody(Stream stream, Body pdfDocumentBody)
return xRefTable;
}

// TODO (lena): Write to stream directly
private async Task WriteObject(Stream stream, DocumentObject value)
{
var bytes = value switch
switch (value)
{
DictionaryObject dictionaryObject => await dictionaryObjectSerializer.Serialize(dictionaryObject),
NumberObject numberObject => await numberObjectSerializer.Serialize(numberObject),
StreamObject streamObject => await streamObjectSerializer.Serialize(streamObject),
StringObject stringObject => await stringObjectSerializer.Serialize(stringObject),
NameObject nameObject => await nameObjectSerializer.Serialize(nameObject),
ArrayObject arrayObject => await arrayObjectSerializer.Serialize(arrayObject),
_ => throw new InvalidOperationException($"Unknown object type '{value.GetType()}'")
};
case DictionaryObject dictionaryObject:
await dictionaryObjectSerializer.Serialize(stream, dictionaryObject);
break;
case NumberObject numberObject:
await numberObjectSerializer.Serialize(stream, numberObject);
break;
case StreamObject streamObject:
await streamObjectSerializer.Serialize(stream, streamObject);
break;
case StringObject stringObject:
await stringObjectSerializer.Serialize(stream, stringObject);
break;
case NameObject nameObject:
await nameObjectSerializer.Serialize(stream, nameObject);
break;
case ArrayObject arrayObject:
await arrayObjectSerializer.Serialize(stream, arrayObject);
break;
default:
throw new InvalidOperationException($"Unknown object type '{value.GetType()}'");
}

await stream.WriteAsync(bytes);
await stream.FlushAsync();
}

private long WriteXrefTable(Stream stream, XRefTable xRefTable)
Expand Down Expand Up @@ -125,10 +139,10 @@ private void WriteXrefTableSection(Stream stream, List<string> xRefTableSection,
if (xRefTableSection.Count == 0)
return;

stream.Write(Encoding.ASCII.GetBytes($"{firstObjectNumberInSection} {xRefTableSection.Count.ToString()}\n"));
stream.Write(Encoding.UTF8.GetBytes($"{firstObjectNumberInSection} {xRefTableSection.Count.ToString()}\n"));

var xRefTableEntries = string.Concat(xRefTableSection);
stream.Write(Encoding.ASCII.GetBytes(xRefTableEntries));
stream.Write(Encoding.UTF8.GetBytes(xRefTableEntries));
}

// TODO (lena): Change Size in Trailer
Expand All @@ -140,7 +154,7 @@ private async Task WriteTrailer(Stream stream, Trailer pdfDocumentTrailer, long

await stream.WriteAsync("\n"u8.ToArray());
await stream.WriteAsync("startxref\n"u8.ToArray());
await stream.WriteAsync(Encoding.ASCII.GetBytes(xRefTableOffset.ToString()));
await stream.WriteAsync(Encoding.UTF8.GetBytes(xRefTableOffset.ToString()));
await stream.WriteAsync("\n"u8.ToArray());
await stream.WriteAsync("%%EOF\n"u8.ToArray());
}
Expand Down

0 comments on commit a14425e

Please sign in to comment.