Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions src/UglyToad.PdfPig.Core/ReadHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,12 @@ public static class ReadHelper
/// </summary>
public const byte AsciiCarriageReturn = 13;

/// <summary>
/// The tab '\t' character.
/// </summary>
public const byte AsciiTab = 9;

private static readonly HashSet<int> EndOfNameCharacters =
[
' ',
AsciiCarriageReturn,
AsciiLineFeed,
AsciiTab,
9,
'>',
'<',
'[',
Expand Down
11 changes: 0 additions & 11 deletions src/UglyToad.PdfPig.Core/StreamInputBytes.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,6 @@ public bool IsAtEnd()
/// <inheritdoc />
public void Seek(long position)
{
var current = CurrentOffset;
if (position == current)
{
return;
}
else if (peekByte.HasValue && position == current + 1)
{
MoveNext();
return;
}

isAtEnd = false;
peekByte = null;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public sealed class Type1ArrayTokenizer : ITokenizer
/// <inheritdoc />
public bool ReadsNextByte { get; } = false;

private static readonly char[] Space = [' '];
private static readonly string[] Space = [" "];

/// <inheritdoc />
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
Expand Down
5 changes: 0 additions & 5 deletions src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,6 @@ public static Type1Font Parse(IInputBytes inputBytes, int length1, int length2)
{
int offset = 0;

while (inputBytes.Peek() is { } b && ReadHelper.IsWhitespace(b))
{
inputBytes.MoveNext();
}

while (inputBytes.MoveNext())
{
if (inputBytes.CurrentByte == (byte)ClearToMark[offset])
Expand Down
65 changes: 22 additions & 43 deletions src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
{
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Text;
using Core;
Expand Down Expand Up @@ -42,43 +41,35 @@ private Type1Token ReadNextToken()
do
{
skip = false;
while (bytes.Peek() is { } b)
while (bytes.MoveNext())
{
var b = bytes.CurrentByte;
var c = (char)b;

switch (c)
{
case '%':
bytes.MoveNext();
comments.Add(ReadComment());
break;
case '(':
bytes.MoveNext();
return ReadString();
case ')':
throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
case '[':
bytes.MoveNext();
return new Type1Token(c, Type1Token.TokenType.StartArray);
case ']':
bytes.MoveNext();
return new Type1Token(c, Type1Token.TokenType.EndArray);
case '{':
bytes.MoveNext();
return new Type1Token(c, Type1Token.TokenType.StartProc);
case '}':
bytes.MoveNext();
return new Type1Token(c, Type1Token.TokenType.EndProc);
case '/':
{
bytes.MoveNext();
TryReadLiteral(out var name);
Debug.Assert(name != null);
var name = ReadLiteral();
return new Type1Token(name, Type1Token.TokenType.Literal);
}
case '<':
{
bytes.MoveNext();
var following = bytes.Peek();
if (following == '<')
{
Expand All @@ -90,7 +81,6 @@ private Type1Token ReadNextToken()
}
case '>':
{
bytes.MoveNext();
var following = bytes.Peek();
if (following == '>')
{
Expand All @@ -104,24 +94,23 @@ private Type1Token ReadNextToken()
{
if (ReadHelper.IsWhitespace(b))
{
bytes.MoveNext();
skip = true;
break;
}

if (b == 0)
{
bytes.MoveNext();
skip = true;
break;
}

if (TryReadNumber(out var number))
if (TryReadNumber(c, out var number))
{
return number;
}

if (!TryReadLiteral(out var name))
var name = ReadLiteral(c);
if (name == null)
{
throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
}
Expand Down Expand Up @@ -208,21 +197,12 @@ char GetNext()
return null;
}

private bool TryReadNumber(out Type1Token numberToken)
private bool TryReadNumber(char c, out Type1Token numberToken)
{
char GetNext()
{
bytes.MoveNext();
return (char)(bytes.Peek() ?? 0);
}

char c = (char)(bytes.Peek() ?? 0);

if (!((c >= '0' && c <= '9') || c is '+' or '-'))
{
// Easy out. Not a valid number
numberToken = null;
return false;
return (char)bytes.CurrentByte;
}

numberToken = null;
Expand Down Expand Up @@ -271,6 +251,8 @@ char GetNext()
else
{
// integer
bytes.Seek(bytes.CurrentOffset - 1);

numberToken = new Type1Token(sb.ToString(), Type1Token.TokenType.Integer);
return true;
}
Expand Down Expand Up @@ -327,6 +309,7 @@ char GetNext()
}
}

bytes.Seek(bytes.CurrentOffset - 1);
if (radix != null)
{
var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture));
Expand All @@ -340,9 +323,14 @@ char GetNext()
return true;
}

private bool TryReadLiteral(out string? value)
private string ReadLiteral(char? previousCharacter = null)
{
literalBuffer.Clear();
if (previousCharacter.HasValue)
{
literalBuffer.Append(previousCharacter);
}

do
{
var b = bytes.Peek();
Expand All @@ -362,16 +350,8 @@ private bool TryReadLiteral(out string? value)
literalBuffer.Append(c);
} while (bytes.MoveNext());

if (literalBuffer.Length > 0)
{
value = literalBuffer.ToString();
return true;
}
else
{
value = null;
return false;
}
var literal = literalBuffer.ToString();
return literal.Length == 0 ? null : literal;
}

private string ReadComment()
Expand All @@ -395,10 +375,9 @@ private string ReadComment()
private Type1DataToken ReadCharString(int length)
{
// Skip preceding space.
if (bytes.Peek() is { } ws && ReadHelper.IsWhitespace(ws))
{
bytes.MoveNext();
}
bytes.MoveNext();
// TODO: may be wrong
// bytes.MoveNext();

byte[] data = new byte[length];
for (int i = 0; i < length; i++)
Expand Down
33 changes: 33 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,43 @@
using DocumentLayoutAnalysis.PageSegmenter;
using DocumentLayoutAnalysis.WordExtractor;
using PdfPig.Core;
using PdfPig.Tokens;
using SkiaSharp;

public class GithubIssuesTests
{
/// <summary>
/// Opens "GHOSTSCRIPT-699488-0.pdf" with lenient parsing and checks page 1:
/// it must expose 9 images, 66 paths and 2685 letters, and every image whose
/// filter name does not contain "DCT" must be convertible to PNG.
/// </summary>
[Fact]
public void Revert_e11dc6b()
{
    var path = IntegrationHelpers.GetDocumentPath("GHOSTSCRIPT-699488-0.pdf");

    // The using declaration disposes the document when the test method exits.
    using var document = PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true });

    var page = document.GetPage(1);

    var images = page.GetImages().ToArray();
    Assert.Equal(9, images.Length);

    foreach (var image in images)
    {
        // Images whose filter name mentions "DCT" are excluded from the PNG check.
        var hasDctFilter = image.ImageDictionary.TryGet(NameToken.Filter, out var filterToken)
                           && filterToken is NameToken filterName
                           && filterName.Data.Contains("DCT");

        if (hasDctFilter)
        {
            continue;
        }

        Assert.True(image.TryGetPng(out _));
    }

    Assert.Equal(66, page.Paths.Count);
    Assert.Equal(2685, page.Letters.Count);
}

[Fact]
public void Issue1199()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,7 @@ public void OnlyParsesNumberPart()
Assert.True(result);
Assert.Equal(135.6654, AssertNumericToken(token).Data);

if (tokenizer.ReadsNextByte)
Assert.Equal('/', (char)input.Bytes.CurrentByte);
else
Assert.Equal('4', (char)input.Bytes.CurrentByte);
Assert.Equal('/', (char)input.Bytes.CurrentByte);
}

[Fact]
Expand Down
2 changes: 1 addition & 1 deletion src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ internal sealed class ArrayTokenizer : ITokenizer
{
private readonly bool usePdfDocEncoding;

public bool ReadsNextByte => false;
public bool ReadsNextByte { get; } = false;

public ArrayTokenizer(bool usePdfDocEncoding)
{
Expand Down
7 changes: 3 additions & 4 deletions src/UglyToad.PdfPig.Tokenization/CommentTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

internal sealed class CommentTokenizer : ITokenizer
{
public bool ReadsNextByte => false;
public bool ReadsNextByte { get; } = true;

public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
Expand All @@ -17,11 +17,10 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
return false;
}

using var builder = new ValueStringBuilder(stackalloc char[32]);
using var builder = new ValueStringBuilder();

while (inputBytes.Peek() is { } c && !ReadHelper.IsEndOfLine(c))
while (inputBytes.MoveNext() && !ReadHelper.IsEndOfLine(inputBytes.CurrentByte))
{
inputBytes.MoveNext();
builder.Append((char) inputBytes.CurrentByte);
}

Expand Down
2 changes: 1 addition & 1 deletion src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ internal class DictionaryTokenizer : ITokenizer
private readonly IReadOnlyList<NameToken> requiredKeys;
private readonly bool useLenientParsing;

public bool ReadsNextByte => false;
public bool ReadsNextByte { get; } = false;

/// <summary>
/// Create a new <see cref="DictionaryTokenizer"/>.
Expand Down
2 changes: 1 addition & 1 deletion src/UglyToad.PdfPig.Tokenization/EndOfLineTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
public sealed class EndOfLineTokenizer : ITokenizer
{
/// <inheritdoc />
public bool ReadsNextByte => false;
public bool ReadsNextByte { get; } = false;

/// <inheritdoc />
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
Expand Down
2 changes: 1 addition & 1 deletion src/UglyToad.PdfPig.Tokenization/HexTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

internal sealed class HexTokenizer : ITokenizer
{
public bool ReadsNextByte => false;
public bool ReadsNextByte { get; } = false;

public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
Expand Down
Loading
Loading