diff --git a/src/UglyToad.PdfPig.Tests/Filters/BitStreamTests.cs b/src/UglyToad.PdfPig.Tests/Filters/BitStreamTests.cs
index 98fd49ef6..a7064c2fb 100644
--- a/src/UglyToad.PdfPig.Tests/Filters/BitStreamTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Filters/BitStreamTests.cs
@@ -1,6 +1,6 @@
 namespace UglyToad.PdfPig.Tests.Filters
 {
-    using PdfPig.Filters;
+    using PdfPig.Filters.Lzw;
 
     public class BitStreamTests
     {
diff --git a/src/UglyToad.PdfPig.Tests/Integration/FilterTests.cs b/src/UglyToad.PdfPig.Tests/Integration/FilterTests.cs
new file mode 100644
index 000000000..735b6ae10
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/FilterTests.cs
@@ -0,0 +1,154 @@
+namespace UglyToad.PdfPig.Tests.Integration
+{
+    using PdfPig.Filters;
+    using PdfPig.Tokens;
+    using System;
+    using System.Collections.Generic;
+    using System.Linq;
+
+    /// <summary>
+    /// Opens the integration documents with a filter provider whose CCITT fax, DCT, JBIG2 and JPX
+    /// entries are replaced by an unsupported stub filter.
+    /// </summary>
+    public class FilterTests
+    {
+        private static readonly Lazy<string> DocumentFolder = new Lazy<string>(() => Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents")));
+
+        // File names (not full paths) of the documents left out of the theory data.
+        private static readonly HashSet<string> _documentsToIgnore =
+        [
+            "issue_671.pdf",
+            "GHOSTSCRIPT-698363-0.pdf",
+            "ErcotFacts.pdf"
+        ];
+
+        /// <summary>
+        /// Asserts that PNG conversion fails for every image whose <c>Filter</c> entry is a single name
+        /// other than Flate or LZW when the document is opened with <see cref="MyFilterProvider"/>.
+        /// </summary>
+        /// <param name="documentName">File name of a document inside <see cref="DocumentFolder"/>.</param>
+        [Theory]
+        [MemberData(nameof(GetAllDocuments))]
+        public void NoImageDecoding(string documentName)
+        {
+            // Add the full path back on, we removed it so we could see it in the test explorer.
+            documentName = Path.Combine(DocumentFolder.Value, documentName);
+
+            var parsingOptions = new ParsingOptions
+            {
+                UseLenientParsing = true,
+                FilterProvider = MyFilterProvider.Instance
+            };
+
+            using (var document = PdfDocument.Open(documentName, parsingOptions))
+            {
+                for (var i = 0; i < document.NumberOfPages; i++)
+                {
+                    var page = document.GetPage(i + 1);
+
+                    foreach (var pdfImage in page.GetImages())
+                    {
+                        // Images without a single-name Filter entry are not checked.
+                        if (!pdfImage.ImageDictionary.TryGet(NameToken.Filter, out NameToken filter))
+                        {
+                            continue;
+                        }
+
+                        // Flate and LZW keep their real filters in MyFilterProvider, so skip those images.
+                        if (filter.Data.Equals(NameToken.FlateDecode.Data) ||
+                            filter.Data.Equals(NameToken.FlateDecodeAbbreviation.Data) ||
+                            filter.Data.Equals(NameToken.LzwDecode.Data) ||
+                            filter.Data.Equals(NameToken.LzwDecodeAbbreviation.Data))
+                        {
+                            continue;
+                        }
+
+                        Assert.False(pdfImage.TryGetPng(out _));
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Stub filter that reports itself as unsupported and throws when asked to decode.
+        /// </summary>
+        public sealed class NoFilter : IFilter
+        {
+            /// <inheritdoc />
+            public bool IsSupported => false;
+
+            /// <inheritdoc />
+            public ReadOnlyMemory<byte> Decode(ReadOnlySpan<byte> input, DictionaryToken streamDictionary, int filterIndex)
+            {
+                throw new NotImplementedException();
+            }
+        }
+
+        /// <summary>
+        /// Filter provider that keeps the ASCII85, ASCIIHex, Flate, RunLength and LZW filters and maps
+        /// CCITT fax, DCT, JBIG2 and JPX to <see cref="NoFilter"/>.
+        /// </summary>
+        public class MyFilterProvider : BaseFilterProvider
+        {
+            /// <summary>
+            /// The single instance of this provider.
+            /// </summary>
+            public static readonly IFilterProvider Instance = new MyFilterProvider();
+
+            /// <summary>
+            /// Creates the provider from the dictionary built by <see cref="GetDictionary"/>.
+            /// </summary>
+            protected MyFilterProvider() : base(GetDictionary())
+            {
+            }
+
+            private static Dictionary<string, IFilter> GetDictionary()
+            {
+                var ascii85 = new Ascii85Filter();
+                var asciiHex = new AsciiHexDecodeFilter();
+                var flate = new FlateFilter();
+                var runLength = new RunLengthFilter();
+                var lzw = new LzwFilter();
+
+                var noFilter = new NoFilter();
+
+                return new Dictionary<string, IFilter>
+                {
+                    { NameToken.Ascii85Decode.Data, ascii85 },
+                    { NameToken.Ascii85DecodeAbbreviation.Data, ascii85 },
+                    { NameToken.AsciiHexDecode.Data, asciiHex },
+                    { NameToken.AsciiHexDecodeAbbreviation.Data, asciiHex },
+                    { NameToken.CcittfaxDecode.Data, noFilter },
+                    { NameToken.CcittfaxDecodeAbbreviation.Data, noFilter },
+                    { NameToken.DctDecode.Data, noFilter },
+                    { NameToken.DctDecodeAbbreviation.Data, noFilter },
+                    { NameToken.FlateDecode.Data, flate },
+                    { NameToken.FlateDecodeAbbreviation.Data, flate },
+                    { NameToken.Jbig2Decode.Data, noFilter },
+                    { NameToken.JpxDecode.Data, noFilter },
+                    { NameToken.RunLengthDecode.Data, runLength },
+                    { NameToken.RunLengthDecodeAbbreviation.Data, runLength },
+                    { NameToken.LzwDecode.Data, lzw },
+                    { NameToken.LzwDecodeAbbreviation.Data, lzw }
+                };
+            }
+        }
+
+        /// <summary>
+        /// Theory data for <see cref="NoImageDecoding"/>: one file name per PDF in the documents folder, minus the ignored ones.
+        /// </summary>
+        public static IEnumerable<object[]> GetAllDocuments
+        {
+            get
+            {
+                var files = Directory.GetFiles(DocumentFolder.Value, "*.pdf");
+
+                // Return the shortname so we can see it in the test explorer.
+                // Exact file-name lookup in the set instead of a culture-sensitive suffix scan over every entry.
+                return files
+                    .Where(x => !_documentsToIgnore.Contains(Path.GetFileName(x)))
+                    .Select(x => new object[] { Path.GetFileName(x) });
+            }
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index dc387e6e8..364bb53b3 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -97,10 +97,20 @@ public void OnlyExposedApiIsPublic()
                 "UglyToad.PdfPig.CrossReference.CrossReferenceType",
                 "UglyToad.PdfPig.CrossReference.TrailerDictionary",
                 "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
+                "UglyToad.PdfPig.Filters.BaseFilterProvider",
                 "UglyToad.PdfPig.Filters.DefaultFilterProvider",
                 "UglyToad.PdfPig.Filters.IFilter",
                 "UglyToad.PdfPig.Filters.IFilterProvider",
                 "UglyToad.PdfPig.Filters.ILookupFilterProvider",
+                "UglyToad.PdfPig.Filters.Ascii85Filter",
+                "UglyToad.PdfPig.Filters.AsciiHexDecodeFilter",
+                "UglyToad.PdfPig.Filters.CcittFaxDecodeFilter",
+                "UglyToad.PdfPig.Filters.DctDecodeFilter",
+                "UglyToad.PdfPig.Filters.FlateFilter",
+                "UglyToad.PdfPig.Filters.Jbig2DecodeFilter",
+                "UglyToad.PdfPig.Filters.JpxDecodeFilter",
+                "UglyToad.PdfPig.Filters.LzwFilter",
+                "UglyToad.PdfPig.Filters.RunLengthFilter",
                 "UglyToad.PdfPig.Functions.FunctionTypes",
                 "UglyToad.PdfPig.Functions.PdfFunction",
                 "UglyToad.PdfPig.PdfFonts.CharacterBoundingBox",
diff --git a/src/UglyToad.PdfPig.sln.DotSettings b/src/UglyToad.PdfPig.sln.DotSettings
index 9ae00f926..5ecaf9dd4 100644
--- a/src/UglyToad.PdfPig.sln.DotSettings
+++ b/src/UglyToad.PdfPig.sln.DotSettings
@@ -10,7 +10,10 @@
     XY
     <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" />
     <Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" />
+    <Policy><Descriptor Staticness="Instance" AccessRightKinds="Private" Description="Instance fields (private)"><ElementKinds><Kind Name="FIELD" /><Kind Name="READONLY_FIELD" /></ElementKinds></Descriptor><Policy Inspect="True" Prefix="" Suffix="" Style="aaBb"
/></Policy>
+    <Policy><Descriptor Staticness="Static" AccessRightKinds="Private" Description="Static fields (private)"><ElementKinds><Kind Name="FIELD" /></ElementKinds></Descriptor><Policy Inspect="True" Prefix="" Suffix="" Style="aaBb" /></Policy>
     True
     True
     True
-    True
\ No newline at end of file
+    True
+    True
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs b/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs
index c0657a63f..e5a16240b 100644
--- a/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs
+++ b/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs
@@ -4,11 +4,10 @@
     using Core;
     using Tokens;
 
-    ///
     ///
     /// ASCII 85 (Base85) is a binary to text encoding using 5 ASCII characters per 4 bytes of data.
     ///
-    internal sealed class Ascii85Filter : IFilter
+    public sealed class Ascii85Filter : IFilter
     {
         private const byte EmptyBlock = (byte)'z';
         private const byte Offset = (byte)'!';
diff --git a/src/UglyToad.PdfPig/Filters/AsciiHexDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/AsciiHexDecodeFilter.cs
index d61fdd9b4..c6558b7f2 100644
--- a/src/UglyToad.PdfPig/Filters/AsciiHexDecodeFilter.cs
+++ b/src/UglyToad.PdfPig/Filters/AsciiHexDecodeFilter.cs
@@ -4,11 +4,10 @@
     using Core;
     using Tokens;
 
-    ///
     ///
     /// Encodes/decodes data using the ASCII hexadecimal encoding where each byte is represented by two ASCII characters.
     ///
-    internal sealed class AsciiHexDecodeFilter : IFilter
+    public sealed class AsciiHexDecodeFilter : IFilter
     {
         private static readonly short[] ReverseHex =
         [
diff --git a/src/UglyToad.PdfPig/Filters/BaseFilterProvider.cs b/src/UglyToad.PdfPig/Filters/BaseFilterProvider.cs
new file mode 100644
index 000000000..0e51f4cff
--- /dev/null
+++ b/src/UglyToad.PdfPig/Filters/BaseFilterProvider.cs
@@ -0,0 +1,104 @@
+namespace UglyToad.PdfPig.Filters
+{
+    using Core;
+    using System;
+    using System.Collections.Generic;
+    using System.Linq;
+    using Tokens;
+    using Util;
+
+    /// <summary>
+    /// Base abstract class for FilterProvider.
+    /// Resolves filter names against the dictionary of filter instances supplied at construction.
+    /// </summary>
+    public abstract class BaseFilterProvider : IFilterProvider
+    {
+        /// <summary>
+        /// Dictionary of filters, keyed by filter name.
+        /// </summary>
+        protected readonly IReadOnlyDictionary<string, IFilter> FilterInstances;
+
+        /// <summary>
+        /// Create a new <see cref="BaseFilterProvider"/> with the given filters.
+        /// </summary>
+        /// <param name="filterInstances">The filters to serve, keyed by filter name.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="filterInstances"/> is null.</exception>
+        protected BaseFilterProvider(IReadOnlyDictionary<string, IFilter> filterInstances)
+        {
+            // Fail at construction rather than with a NullReferenceException on the first lookup.
+            FilterInstances = filterInstances ?? throw new ArgumentNullException(nameof(filterInstances));
+        }
+
+        /// <inheritdoc />
+        public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
+        {
+            if (dictionary is null)
+            {
+                throw new ArgumentNullException(nameof(dictionary));
+            }
+
+            var token = dictionary.GetObjectOrDefault(NameToken.Filter, NameToken.F);
+            if (token is null)
+            {
+                return Array.Empty<IFilter>();
+            }
+
+            switch (token)
+            {
+                case ArrayToken filters:
+                    var result = new IFilter[filters.Data.Count];
+                    for (var i = 0; i < filters.Data.Count; i++)
+                    {
+                        // Report a non-name entry as a format error instead of an InvalidCastException.
+                        if (filters.Data[i] is not NameToken filterName)
+                        {
+                            throw new PdfDocumentFormatException($"The filter array for the stream contained an invalid entry. Expected name, instead got: {filters.Data[i]}.");
+                        }
+
+                        result[i] = GetFilterStrict(filterName.Data);
+                    }
+
+                    return result;
+                case NameToken name:
+                    return new[] { GetFilterStrict(name.Data) };
+                default:
+                    throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
+            }
+        }
+
+        /// <inheritdoc />
+        public IReadOnlyList<IFilter> GetNamedFilters(IReadOnlyList<string> names)
+        {
+            if (names is null)
+            {
+                throw new ArgumentNullException(nameof(names));
+            }
+
+            var result = new List<IFilter>(names.Count);
+
+            foreach (var name in names)
+            {
+                result.Add(GetFilterStrict(name));
+            }
+
+            return result;
+        }
+
+        // Looks the name up in FilterInstances; an unknown name is an error, never a null result.
+        private IFilter GetFilterStrict(string name)
+        {
+            if (!FilterInstances.TryGetValue(name, out var filter))
+            {
+                throw new NotSupportedException($"The filter with the name {name} is not supported yet. Please raise an issue.");
+            }
+
+            return filter;
+        }
+
+        /// <inheritdoc />
+        public IReadOnlyList<IFilter> GetAllFilters()
+        {
+            return FilterInstances.Values.Distinct().ToList();
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Filters/CcittFaxCompressionType.cs b/src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxCompressionType.cs
similarity index 86%
rename from src/UglyToad.PdfPig/Filters/CcittFaxCompressionType.cs
rename to src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxCompressionType.cs
index f0f372ded..723494116 100644
--- a/src/UglyToad.PdfPig/Filters/CcittFaxCompressionType.cs
+++ b/src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxCompressionType.cs
@@ -1,25 +1,25 @@
-namespace UglyToad.PdfPig.Filters
-{
-    ///
-    /// Specifies the compression type to use with .
-    ///
-    internal enum CcittFaxCompressionType
-    {
-        ///
-        /// Modified Huffman (MH) - Group 3 variation (T2)
-        ///
-        ModifiedHuffman,
-        ///
-        /// Modified Huffman (MH) - Group 3 (T4)
-        ///
-        Group3_1D,
-        ///
-        /// Modified Read (MR) - Group 3 (T4)
-        ///
-        Group3_2D,
-        ///
-        /// Modified Modified Read (MMR) - Group 4 (T6)
-        ///
-        Group4_2D
-    }
-}
+namespace UglyToad.PdfPig.Filters.CcittFax
+{
+    ///
+    /// Specifies the compression type to use with .
+ /// + internal enum CcittFaxCompressionType : byte + { + /// + /// Modified Huffman (MH) - Group 3 variation (T2) + /// + ModifiedHuffman, + /// + /// Modified Huffman (MH) - Group 3 (T4) + /// + Group3_1D, + /// + /// Modified Read (MR) - Group 3 (T4) + /// + Group3_2D, + /// + /// Modified Modified Read (MMR) - Group 4 (T6) + /// + Group4_2D + } +} diff --git a/src/UglyToad.PdfPig/Filters/CcittFaxDecoderStream.cs b/src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxDecoderStream.cs similarity index 94% rename from src/UglyToad.PdfPig/Filters/CcittFaxDecoderStream.cs rename to src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxDecoderStream.cs index 12aa8fa9a..24a30b6fe 100644 --- a/src/UglyToad.PdfPig/Filters/CcittFaxDecoderStream.cs +++ b/src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxDecoderStream.cs @@ -1,773 +1,774 @@ -namespace UglyToad.PdfPig.Filters -{ - using System; - using System.IO; - using IO; - - /// - /// CCITT Modified Huffman RLE, Group 3 (T4) and Group 4 (T6) fax compression. - /// - /// Ported from https://github.com/apache/pdfbox/blob/e644c29279e276bde14ce7a33bdeef0cb1001b3e/pdfbox/src/main/java/org/apache/pdfbox/filter/CCITTFaxDecoderStream.java - /// - internal sealed class CcittFaxDecoderStream : StreamWrapper - { - // See TIFF 6.0 Specification, Section 10: "Modified Huffman Compression", page 43. - - private readonly int columns; - private readonly byte[] decodedRow; - - private readonly bool optionByteAligned; - - private readonly CcittFaxCompressionType type; - - private int decodedLength; - private int decodedPos; - - private int[] changesReferenceRow; - private int[] changesCurrentRow; - private int changesReferenceRowCount; - private int changesCurrentRowCount; - - private int lastChangingElement; - - private int buffer = -1; - private int bufferPos = -1; - - /// - /// Creates a CCITTFaxDecoderStream. - /// This constructor may be used for CCITT streams embedded in PDF files, - /// which use EncodedByteAlign. 
- /// - public CcittFaxDecoderStream(Stream stream, int columns, CcittFaxCompressionType type, bool byteAligned) - : base(stream) - { - this.columns = columns; - this.type = type; - - // We know this is only used for b/w (1 bit) - decodedRow = new byte[(columns + 7) / 8]; - changesReferenceRow = new int[columns + 2]; - changesCurrentRow = new int[columns + 2]; - - optionByteAligned = byteAligned; - } - - private void Fetch() - { - if (decodedPos >= decodedLength) - { - decodedLength = 0; - - try - { - DecodeRow(); - } - catch (InvalidOperationException) - { - if (decodedLength != 0) - { - throw; - } - - // ..otherwise, just let client code try to read past the - // end of stream - decodedLength = -1; - } - - decodedPos = 0; - } - } - - private void Decode1D() - { - var index = 0; - var white = true; - changesCurrentRowCount = 0; - - do - { - var completeRun = white ? DecodeRun(WhiteRunTree) : DecodeRun(BlackRunTree); - index += completeRun; - changesCurrentRow[changesCurrentRowCount++] = index; - - // Flip color for next run - white = !white; - } while (index < columns); - } - - private void Decode2D() - { - changesReferenceRowCount = changesCurrentRowCount; - var tmp = changesCurrentRow; - changesCurrentRow = changesReferenceRow; - changesReferenceRow = tmp; - - var white = true; - var index = 0; - changesCurrentRowCount = 0; - - mode: while (index < columns) - { - var node = CodeTree.Root; - - while (true) - { - node = node.Walk(ReadBit()); - - if (node is null) - { - goto mode; - } - else if (node.IsLeaf) - { - switch (node.Value) - { - case VALUE_HMODE: - var runLength = DecodeRun(white ? WhiteRunTree : BlackRunTree); - index += runLength; - changesCurrentRow[changesCurrentRowCount++] = index; - - runLength = DecodeRun(white ? 
BlackRunTree : WhiteRunTree); - index += runLength; - changesCurrentRow[changesCurrentRowCount++] = index; - break; - - case VALUE_PASSMODE: - var pChangingElement = GetNextChangingElement(index, white) + 1; - - if (pChangingElement >= changesReferenceRowCount) - { - index = columns; - } - else - { - index = changesReferenceRow[pChangingElement]; - } - - break; - - default: - // Vertical mode (-3 to 3) - var vChangingElement = GetNextChangingElement(index, white); - - if (vChangingElement >= changesReferenceRowCount || vChangingElement == -1) - { - index = columns + node.Value; - } - else - { - index = changesReferenceRow[vChangingElement] + node.Value; - } - - changesCurrentRow[changesCurrentRowCount] = index; - changesCurrentRowCount++; - white = !white; - - break; - } - - goto mode; - } - } - } - } - - private int GetNextChangingElement(int a0, bool white) - { - var start = (int)(lastChangingElement & 0xFFFF_FFFE) + (white ? 0 : 1); - if (start > 2) - { - start -= 2; - } - - if (a0 == 0) - { - return start; - } - - for (var i = start; i < changesReferenceRowCount; i += 2) - { - if (a0 < changesReferenceRow[i]) - { - lastChangingElement = i; - return i; - } - } - - return -1; - } - - private void DecodeRowType2() - { - if (optionByteAligned) - { - ResetBuffer(); - } - - Decode1D(); - } - - private void DecodeRowType4() - { - if (optionByteAligned) - { - ResetBuffer(); - } - - eof: while (true) - { - // read till next EOL code - var node = EolOnlyTree.Root; - - while (true) - { - node = node.Walk(ReadBit()); - - if (node is null) - { - goto eof; - } - - if (node.IsLeaf) - { - goto done; - } - } - } - - done: - if (type == CcittFaxCompressionType.Group3_1D || ReadBit()) - { - Decode1D(); - } - else - { - Decode2D(); - } - } - - private void DecodeRowType6() - { - if (optionByteAligned) - { - ResetBuffer(); - } - - Decode2D(); - } - - private void DecodeRow() - { - switch (type) - { - case CcittFaxCompressionType.ModifiedHuffman: - DecodeRowType2(); - break; - case 
CcittFaxCompressionType.Group3_1D: - case CcittFaxCompressionType.Group3_2D: - DecodeRowType4(); - break; - case CcittFaxCompressionType.Group4_2D: - DecodeRowType6(); - break; - default: - throw new InvalidOperationException(type + " is not a supported compression type."); - } - - var index = 0; - var white = true; - - lastChangingElement = 0; - for (var i = 0; i <= changesCurrentRowCount; i++) - { - var nextChange = columns; - - if (i != changesCurrentRowCount) - { - nextChange = changesCurrentRow[i]; - } - - if (nextChange > columns) - { - nextChange = columns; - } - - var byteIndex = index / 8; - - while (index % 8 != 0 && (nextChange - index) > 0) - { - decodedRow[byteIndex] |= (byte)(white ? 0 : 1 << (7 - ((index) % 8))); - index++; - } - - if (index % 8 == 0) - { - byteIndex = index / 8; - var value = (byte)(white ? 0x00 : 0xff); - - while ((nextChange - index) > 7) - { - decodedRow[byteIndex] = value; - index += 8; - ++byteIndex; - } - } - - while ((nextChange - index) > 0) - { - if (index % 8 == 0) - { - decodedRow[byteIndex] = 0; - } - - decodedRow[byteIndex] |= (byte)(white ? 
0 : 1 << (7 - ((index) % 8))); - index++; - } - - white = !white; - } - - if (index != columns) - { - throw new InvalidOperationException($"Sum of run-lengths does not equal scan line width: {index} > {columns}"); - } - - decodedLength = (index + 7) / 8; - } - - private int DecodeRun(Tree tree) - { - var total = 0; - - var node = tree.Root; - - while (true) - { - var bit = ReadBit(); - node = node.Walk(bit); - - if (node is null) - { - throw new InvalidOperationException("Unknown code in Huffman RLE stream"); - } - - if (node.IsLeaf) - { - total += node.Value; - if (node.Value >= 64) - { - node = tree.Root; - } - else if (node.Value >= 0) - { - return total; - } - else - { - return columns; - } - } - } - } - - private void ResetBuffer() - { - bufferPos = -1; - } - - private bool ReadBit() - { - if (bufferPos < 0 || bufferPos > 7) - { - buffer = Stream.ReadByte(); - - if (buffer == -1) - { - throw new InvalidOperationException("Unexpected end of Huffman RLE stream"); - } - - bufferPos = 0; - } - - var isSet = ((buffer >> (7 - bufferPos)) & 1) == 1; - - bufferPos++; - - if (bufferPos > 7) - { - bufferPos = -1; - } - - return isSet; - } - - public override int ReadByte() - { - if (decodedLength < 0) - { - return 0x0; - } - - if (decodedPos >= decodedLength) - { - Fetch(); - - if (decodedLength < 0) - { - return 0x0; - } - } - - return decodedRow[decodedPos++] & 0xff; - } - - public override int Read(byte[] b, int off, int len) - { - if (decodedLength < 0) - { - b.AsSpan(off, len).Fill(0x0); - return len; - } - - if (decodedPos >= decodedLength) - { - Fetch(); - - if (decodedLength < 0) - { - b.AsSpan(off, len).Fill(0x0); - return len; - } - } - - var read = Math.Min(decodedLength - decodedPos, len); - Array.Copy(decodedRow, decodedPos, b, off, read); - decodedPos += read; - - return read; - } - - private class Node - { - public Node? Left { get; set; } - public Node? 
Right { get; set; } - - public int Value { get; set; } - - public bool CanBeFill { get; set; } - public bool IsLeaf { get; set; } - - public void Set(bool next, Node node) - { - if (!next) - { - Left = node; - } - else - { - Right = node; - } - } - - public Node Walk(bool next) - { - return next ? Right! : Left!; - } - - public override string ToString() - { - return $"[{nameof(IsLeaf)}={IsLeaf}, {nameof(Value)}={Value}, {nameof(CanBeFill)}={CanBeFill}]"; - } - } - - private class Tree - { - public Node Root { get; } = new Node(); - - public void Fill(int depth, int path, int value) - { - var current = Root; - - for (var i = 0; i < depth; i++) - { - var bitPos = depth - 1 - i; - var isSet = ((path >> bitPos) & 1) == 1; - var next = current.Walk(isSet); - - if (next is null) - { - next = new Node(); - - if (i == depth - 1) - { - next.Value = value; - next.IsLeaf = true; - } - - if (path == 0) - { - next.CanBeFill = true; - } - - current.Set(isSet, next); - } - else if (next.IsLeaf) - { - throw new InvalidOperationException("node is leaf, no other following"); - } - - current = next; - } - } - - public void Fill(int depth, int path, Node node) - { - var current = Root; - - for (var i = 0; i < depth; i++) - { - var bitPos = depth - 1 - i; - var isSet = ((path >> bitPos) & 1) == 1; - var next = current.Walk(isSet); - - if (next is null) - { - if (i == depth - 1) - { - next = node; - } - else - { - next = new Node(); - } - - if (path == 0) - { - next.CanBeFill = true; - } - - current.Set(isSet, next); - } - else if (next.IsLeaf) - { - throw new InvalidOperationException("node is leaf, no other following"); - } - - current = next; - } - } - } - - private static readonly short[][] BLACK_CODES = new short[][] { - new short[]{ // 2 bits - 0x2, 0x3, - }, - new short[]{ // 3 bits - 0x2, 0x3, - }, - new short[]{ // 4 bits - 0x2, 0x3, - }, - new short[]{ // 5 bits - 0x3, - }, - new short[]{ // 6 bits - 0x4, 0x5, - }, - new short[]{ // 7 bits - 0x4, 0x5, 0x7, - }, - new short[]{ 
// 8 bits - 0x4, 0x7, - }, - new short[]{ // 9 bits - 0x18, - }, - new short[]{ // 10 bits - 0x17, 0x18, 0x37, 0x8, 0xf, - }, - new short[]{ // 11 bits - 0x17, 0x18, 0x28, 0x37, 0x67, 0x68, 0x6c, 0x8, 0xc, 0xd, - }, - new short[]{ // 12 bits - 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f, 0x24, 0x27, 0x28, 0x2b, 0x2c, 0x33, - 0x34, 0x35, 0x37, 0x38, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x64, 0x65, - 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xd2, 0xd3, - 0xd4, 0xd5, 0xd6, 0xd7, 0xda, 0xdb, - }, - new short[]{ // 13 bits - 0x4a, 0x4b, 0x4c, 0x4d, 0x52, 0x53, 0x54, 0x55, 0x5a, 0x5b, 0x64, 0x65, 0x6c, 0x6d, 0x72, 0x73, - 0x74, 0x75, 0x76, 0x77, - } - }; - - private static readonly short[][] BLACK_RUN_LENGTHS = new short[][]{ - new short[]{ // 2 bits - 3, 2, - }, - new short[]{ // 3 bits - 1, 4, - }, - new short[]{ // 4 bits - 6, 5, - }, - new short[]{ // 5 bits - 7, - }, - new short[]{ // 6 bits - 9, 8, - }, - new short[]{ // 7 bits - 10, 11, 12, - }, - new short[]{ // 8 bits - 13, 14, - }, - new short[]{ // 9 bits - 15, - }, - new short[]{ // 10 bits - 16, 17, 0, 18, 64, - }, - new short[]{ // 11 bits - 24, 25, 23, 22, 19, 20, 21, 1792, 1856, 1920, - }, - new short[]{ // 12 bits - 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 52, 55, 56, 59, 60, 320, 384, 448, 53, - 54, 50, 51, 44, 45, 46, 47, 57, 58, 61, 256, 48, 49, 62, 63, 30, 31, 32, 33, 40, 41, 128, 192, 26, - 27, 28, 29, 34, 35, 36, 37, 38, 39, 42, 43, - }, - new short[]{ // 13 bits - 640, 704, 768, 832, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 512, 576, 896, 960, 1024, 1088, - 1152, 1216, - } - }; - - private static readonly short[][] WHITE_CODES = new short[][]{ - new short[]{ // 4 bits - 0x7, 0x8, 0xb, 0xc, 0xe, 0xf, - }, - new short[]{ // 5 bits - 0x12, 0x13, 0x14, 0x1b, 0x7, 0x8, - }, - new short[]{ // 6 bits - 0x17, 0x18, 0x2a, 0x2b, 0x3, 0x34, 0x35, 0x7, 0x8, - }, - new short[]{ // 7 bits - 0x13, 0x17, 
0x18, 0x24, 0x27, 0x28, 0x2b, 0x3, 0x37, 0x4, 0x8, 0xc, - }, - new short[]{ // 8 bits - 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1a, 0x1b, 0x2, 0x24, 0x25, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, - 0x3, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x4, 0x4a, 0x4b, 0x5, 0x52, 0x53, 0x54, 0x55, 0x58, 0x59, - 0x5a, 0x5b, 0x64, 0x65, 0x67, 0x68, 0xa, 0xb, - }, - new short[]{ // 9 bits - 0x98, 0x99, 0x9a, 0x9b, 0xcc, 0xcd, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, - }, - new short[]{ // 10 bits - }, - new short[]{ // 11 bits - 0x8, 0xc, 0xd, - }, - new short[]{ // 12 bits - 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f, - } - }; - - private static readonly short[][] WHITE_RUN_LENGTHS = new short[][]{ - new short[]{ // 4 bits - 2, 3, 4, 5, 6, 7, - }, - new short[]{ // 5 bits - 128, 8, 9, 64, 10, 11, - }, - new short[]{ // 6 bits - 192, 1664, 16, 17, 13, 14, 15, 1, 12, - }, - new short[]{ // 7 bits - 26, 21, 28, 27, 18, 24, 25, 22, 256, 23, 20, 19, - }, - new short[]{ // 8 bits - 33, 34, 35, 36, 37, 38, 31, 32, 29, 53, 54, 39, 40, 41, 42, 43, 44, 30, 61, 62, 63, 0, 320, 384, 45, - 59, 60, 46, 49, 50, 51, 52, 55, 56, 57, 58, 448, 512, 640, 576, 47, 48, - }, - new short[]{ // 9 bits - 1472, 1536, 1600, 1728, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, - }, - new short[]{ // 10 bits - }, - new short[]{ // 11 bits - 1792, 1856, 1920, - }, - new short[]{ // 12 bits - 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, - } - }; - - private static readonly Node EOL; - private static readonly Node FILL; - private static readonly Tree BlackRunTree; - private static readonly Tree WhiteRunTree; - private static readonly Tree EolOnlyTree; - private static readonly Tree CodeTree; - - const int VALUE_EOL = -2000; - const int VALUE_FILL = -1000; - const int VALUE_PASSMODE = -3000; - const int VALUE_HMODE = -4000; - - static CcittFaxDecoderStream() - { - EOL = new Node - { - IsLeaf = true, - Value = VALUE_EOL - }; - FILL = new Node - { - 
Value = VALUE_FILL - }; - FILL.Left = FILL; - FILL.Right = EOL; - - EolOnlyTree = new Tree(); - EolOnlyTree.Fill(12, 0, FILL); - EolOnlyTree.Fill(12, 1, EOL); - - BlackRunTree = new Tree(); - for (var i = 0; i < BLACK_CODES.Length; i++) - { - for (var j = 0; j < BLACK_CODES[i].Length; j++) - { - BlackRunTree.Fill(i + 2, BLACK_CODES[i][j], BLACK_RUN_LENGTHS[i][j]); - } - } - BlackRunTree.Fill(12, 0, FILL); - BlackRunTree.Fill(12, 1, EOL); - - WhiteRunTree = new Tree(); - - for (var i = 0; i < WHITE_CODES.Length; i++) - { - for (var j = 0; j < WHITE_CODES[i].Length; j++) - { - WhiteRunTree.Fill(i + 4, WHITE_CODES[i][j], WHITE_RUN_LENGTHS[i][j]); - } - } - - WhiteRunTree.Fill(12, 0, FILL); - WhiteRunTree.Fill(12, 1, EOL); - - CodeTree = new Tree(); - CodeTree.Fill(4, 1, VALUE_PASSMODE); // pass mode - CodeTree.Fill(3, 1, VALUE_HMODE); // H mode - CodeTree.Fill(1, 1, 0); // V(0) - CodeTree.Fill(3, 3, 1); // V_R(1) - CodeTree.Fill(6, 3, 2); // V_R(2) - CodeTree.Fill(7, 3, 3); // V_R(3) - CodeTree.Fill(3, 2, -1); // V_L(1) - CodeTree.Fill(6, 2, -2); // V_L(2) - CodeTree.Fill(7, 2, -3); // V_L(3) - } - } +namespace UglyToad.PdfPig.Filters.CcittFax +{ + using System; + using System.IO; + using IO; + + /// + /// CCITT Modified Huffman RLE, Group 3 (T4) and Group 4 (T6) fax compression. + /// + /// Ported from https://github.com/apache/pdfbox/blob/e644c29279e276bde14ce7a33bdeef0cb1001b3e/pdfbox/src/main/java/org/apache/pdfbox/filter/CCITTFaxDecoderStream.java + /// + /// + internal sealed class CcittFaxDecoderStream : StreamWrapper + { + // See TIFF 6.0 Specification, Section 10: "Modified Huffman Compression", page 43. 
+ + private readonly int columns; + private readonly byte[] decodedRow; + + private readonly bool optionByteAligned; + + private readonly CcittFaxCompressionType type; + + private int decodedLength; + private int decodedPos; + + private int[] changesReferenceRow; + private int[] changesCurrentRow; + private int changesReferenceRowCount; + private int changesCurrentRowCount; + + private int lastChangingElement; + + private int buffer = -1; + private int bufferPos = -1; + + /// + /// Creates a CCITTFaxDecoderStream. + /// This constructor may be used for CCITT streams embedded in PDF files, + /// which use EncodedByteAlign. + /// + public CcittFaxDecoderStream(Stream stream, int columns, CcittFaxCompressionType type, bool byteAligned) + : base(stream) + { + this.columns = columns; + this.type = type; + + // We know this is only used for b/w (1 bit) + decodedRow = new byte[(columns + 7) / 8]; + changesReferenceRow = new int[columns + 2]; + changesCurrentRow = new int[columns + 2]; + + optionByteAligned = byteAligned; + } + + private void Fetch() + { + if (decodedPos >= decodedLength) + { + decodedLength = 0; + + try + { + DecodeRow(); + } + catch (InvalidOperationException) + { + if (decodedLength != 0) + { + throw; + } + + // ..otherwise, just let client code try to read past the + // end of stream + decodedLength = -1; + } + + decodedPos = 0; + } + } + + private void Decode1D() + { + var index = 0; + var white = true; + changesCurrentRowCount = 0; + + do + { + var completeRun = white ? 
DecodeRun(WhiteRunTree) : DecodeRun(BlackRunTree); + index += completeRun; + changesCurrentRow[changesCurrentRowCount++] = index; + + // Flip color for next run + white = !white; + } while (index < columns); + } + + private void Decode2D() + { + changesReferenceRowCount = changesCurrentRowCount; + var tmp = changesCurrentRow; + changesCurrentRow = changesReferenceRow; + changesReferenceRow = tmp; + + var white = true; + var index = 0; + changesCurrentRowCount = 0; + + mode: while (index < columns) + { + var node = CodeTree.Root; + + while (true) + { + node = node.Walk(ReadBit()); + + if (node is null) + { + goto mode; + } + else if (node.IsLeaf) + { + switch (node.Value) + { + case VALUE_HMODE: + var runLength = DecodeRun(white ? WhiteRunTree : BlackRunTree); + index += runLength; + changesCurrentRow[changesCurrentRowCount++] = index; + + runLength = DecodeRun(white ? BlackRunTree : WhiteRunTree); + index += runLength; + changesCurrentRow[changesCurrentRowCount++] = index; + break; + + case VALUE_PASSMODE: + var pChangingElement = GetNextChangingElement(index, white) + 1; + + if (pChangingElement >= changesReferenceRowCount) + { + index = columns; + } + else + { + index = changesReferenceRow[pChangingElement]; + } + + break; + + default: + // Vertical mode (-3 to 3) + var vChangingElement = GetNextChangingElement(index, white); + + if (vChangingElement >= changesReferenceRowCount || vChangingElement == -1) + { + index = columns + node.Value; + } + else + { + index = changesReferenceRow[vChangingElement] + node.Value; + } + + changesCurrentRow[changesCurrentRowCount] = index; + changesCurrentRowCount++; + white = !white; + + break; + } + + goto mode; + } + } + } + } + + private int GetNextChangingElement(int a0, bool white) + { + var start = (int)(lastChangingElement & 0xFFFF_FFFE) + (white ? 
0 : 1); + if (start > 2) + { + start -= 2; + } + + if (a0 == 0) + { + return start; + } + + for (var i = start; i < changesReferenceRowCount; i += 2) + { + if (a0 < changesReferenceRow[i]) + { + lastChangingElement = i; + return i; + } + } + + return -1; + } + + private void DecodeRowType2() + { + if (optionByteAligned) + { + ResetBuffer(); + } + + Decode1D(); + } + + private void DecodeRowType4() + { + if (optionByteAligned) + { + ResetBuffer(); + } + + eof: while (true) + { + // read till next EOL code + var node = EolOnlyTree.Root; + + while (true) + { + node = node.Walk(ReadBit()); + + if (node is null) + { + goto eof; + } + + if (node.IsLeaf) + { + goto done; + } + } + } + + done: + if (type == CcittFaxCompressionType.Group3_1D || ReadBit()) + { + Decode1D(); + } + else + { + Decode2D(); + } + } + + private void DecodeRowType6() + { + if (optionByteAligned) + { + ResetBuffer(); + } + + Decode2D(); + } + + private void DecodeRow() + { + switch (type) + { + case CcittFaxCompressionType.ModifiedHuffman: + DecodeRowType2(); + break; + case CcittFaxCompressionType.Group3_1D: + case CcittFaxCompressionType.Group3_2D: + DecodeRowType4(); + break; + case CcittFaxCompressionType.Group4_2D: + DecodeRowType6(); + break; + default: + throw new InvalidOperationException(type + " is not a supported compression type."); + } + + var index = 0; + var white = true; + + lastChangingElement = 0; + for (var i = 0; i <= changesCurrentRowCount; i++) + { + var nextChange = columns; + + if (i != changesCurrentRowCount) + { + nextChange = changesCurrentRow[i]; + } + + if (nextChange > columns) + { + nextChange = columns; + } + + var byteIndex = index / 8; + + while (index % 8 != 0 && nextChange - index > 0) + { + decodedRow[byteIndex] |= (byte)(white ? 0 : 1 << 7 - index % 8); + index++; + } + + if (index % 8 == 0) + { + byteIndex = index / 8; + var value = (byte)(white ? 
0x00 : 0xff); + + while (nextChange - index > 7) + { + decodedRow[byteIndex] = value; + index += 8; + ++byteIndex; + } + } + + while (nextChange - index > 0) + { + if (index % 8 == 0) + { + decodedRow[byteIndex] = 0; + } + + decodedRow[byteIndex] |= (byte)(white ? 0 : 1 << 7 - index % 8); + index++; + } + + white = !white; + } + + if (index != columns) + { + throw new InvalidOperationException($"Sum of run-lengths does not equal scan line width: {index} > {columns}"); + } + + decodedLength = (index + 7) / 8; + } + + private int DecodeRun(Tree tree) + { + var total = 0; + + var node = tree.Root; + + while (true) + { + var bit = ReadBit(); + node = node.Walk(bit); + + if (node is null) + { + throw new InvalidOperationException("Unknown code in Huffman RLE stream"); + } + + if (node.IsLeaf) + { + total += node.Value; + if (node.Value >= 64) + { + node = tree.Root; + } + else if (node.Value >= 0) + { + return total; + } + else + { + return columns; + } + } + } + } + + private void ResetBuffer() + { + bufferPos = -1; + } + + private bool ReadBit() + { + if (bufferPos < 0 || bufferPos > 7) + { + buffer = Stream.ReadByte(); + + if (buffer == -1) + { + throw new InvalidOperationException("Unexpected end of Huffman RLE stream"); + } + + bufferPos = 0; + } + + var isSet = (buffer >> 7 - bufferPos & 1) == 1; + + bufferPos++; + + if (bufferPos > 7) + { + bufferPos = -1; + } + + return isSet; + } + + public override int ReadByte() + { + if (decodedLength < 0) + { + return 0x0; + } + + if (decodedPos >= decodedLength) + { + Fetch(); + + if (decodedLength < 0) + { + return 0x0; + } + } + + return decodedRow[decodedPos++] & 0xff; + } + + public override int Read(byte[] b, int off, int len) + { + if (decodedLength < 0) + { + b.AsSpan(off, len).Fill(0x0); + return len; + } + + if (decodedPos >= decodedLength) + { + Fetch(); + + if (decodedLength < 0) + { + b.AsSpan(off, len).Fill(0x0); + return len; + } + } + + var read = Math.Min(decodedLength - decodedPos, len); + 
Array.Copy(decodedRow, decodedPos, b, off, read); + decodedPos += read; + + return read; + } + + private sealed class Node + { + public Node? Left { get; set; } + public Node? Right { get; set; } + + public int Value { get; set; } + + public bool CanBeFill { get; set; } + public bool IsLeaf { get; set; } + + public void Set(bool next, Node node) + { + if (!next) + { + Left = node; + } + else + { + Right = node; + } + } + + public Node Walk(bool next) + { + return next ? Right! : Left!; + } + + public override string ToString() + { + return $"[{nameof(IsLeaf)}={IsLeaf}, {nameof(Value)}={Value}, {nameof(CanBeFill)}={CanBeFill}]"; + } + } + + private sealed class Tree + { + public Node Root { get; } = new Node(); + + public void Fill(int depth, int path, int value) + { + var current = Root; + + for (var i = 0; i < depth; i++) + { + var bitPos = depth - 1 - i; + var isSet = (path >> bitPos & 1) == 1; + var next = current.Walk(isSet); + + if (next is null) + { + next = new Node(); + + if (i == depth - 1) + { + next.Value = value; + next.IsLeaf = true; + } + + if (path == 0) + { + next.CanBeFill = true; + } + + current.Set(isSet, next); + } + else if (next.IsLeaf) + { + throw new InvalidOperationException("node is leaf, no other following"); + } + + current = next; + } + } + + public void Fill(int depth, int path, Node node) + { + var current = Root; + + for (var i = 0; i < depth; i++) + { + var bitPos = depth - 1 - i; + var isSet = (path >> bitPos & 1) == 1; + var next = current.Walk(isSet); + + if (next is null) + { + if (i == depth - 1) + { + next = node; + } + else + { + next = new Node(); + } + + if (path == 0) + { + next.CanBeFill = true; + } + + current.Set(isSet, next); + } + else if (next.IsLeaf) + { + throw new InvalidOperationException("node is leaf, no other following"); + } + + current = next; + } + } + } + + private static readonly short[][] BLACK_CODES = new short[][] { + new short[]{ // 2 bits + 0x2, 0x3, + }, + new short[]{ // 3 bits + 0x2, 0x3, + }, + 
new short[]{ // 4 bits + 0x2, 0x3, + }, + new short[]{ // 5 bits + 0x3, + }, + new short[]{ // 6 bits + 0x4, 0x5, + }, + new short[]{ // 7 bits + 0x4, 0x5, 0x7, + }, + new short[]{ // 8 bits + 0x4, 0x7, + }, + new short[]{ // 9 bits + 0x18, + }, + new short[]{ // 10 bits + 0x17, 0x18, 0x37, 0x8, 0xf, + }, + new short[]{ // 11 bits + 0x17, 0x18, 0x28, 0x37, 0x67, 0x68, 0x6c, 0x8, 0xc, 0xd, + }, + new short[]{ // 12 bits + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f, 0x24, 0x27, 0x28, 0x2b, 0x2c, 0x33, + 0x34, 0x35, 0x37, 0x38, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xd2, 0xd3, + 0xd4, 0xd5, 0xd6, 0xd7, 0xda, 0xdb, + }, + new short[]{ // 13 bits + 0x4a, 0x4b, 0x4c, 0x4d, 0x52, 0x53, 0x54, 0x55, 0x5a, 0x5b, 0x64, 0x65, 0x6c, 0x6d, 0x72, 0x73, + 0x74, 0x75, 0x76, 0x77, + } + }; + + private static readonly short[][] BLACK_RUN_LENGTHS = new short[][]{ + new short[]{ // 2 bits + 3, 2, + }, + new short[]{ // 3 bits + 1, 4, + }, + new short[]{ // 4 bits + 6, 5, + }, + new short[]{ // 5 bits + 7, + }, + new short[]{ // 6 bits + 9, 8, + }, + new short[]{ // 7 bits + 10, 11, 12, + }, + new short[]{ // 8 bits + 13, 14, + }, + new short[]{ // 9 bits + 15, + }, + new short[]{ // 10 bits + 16, 17, 0, 18, 64, + }, + new short[]{ // 11 bits + 24, 25, 23, 22, 19, 20, 21, 1792, 1856, 1920, + }, + new short[]{ // 12 bits + 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 52, 55, 56, 59, 60, 320, 384, 448, 53, + 54, 50, 51, 44, 45, 46, 47, 57, 58, 61, 256, 48, 49, 62, 63, 30, 31, 32, 33, 40, 41, 128, 192, 26, + 27, 28, 29, 34, 35, 36, 37, 38, 39, 42, 43, + }, + new short[]{ // 13 bits + 640, 704, 768, 832, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 512, 576, 896, 960, 1024, 1088, + 1152, 1216, + } + }; + + private static readonly short[][] WHITE_CODES = new short[][]{ + new short[]{ // 4 bits + 0x7, 0x8, 0xb, 0xc, 0xe, 0xf, + }, + new 
short[]{ // 5 bits + 0x12, 0x13, 0x14, 0x1b, 0x7, 0x8, + }, + new short[]{ // 6 bits + 0x17, 0x18, 0x2a, 0x2b, 0x3, 0x34, 0x35, 0x7, 0x8, + }, + new short[]{ // 7 bits + 0x13, 0x17, 0x18, 0x24, 0x27, 0x28, 0x2b, 0x3, 0x37, 0x4, 0x8, 0xc, + }, + new short[]{ // 8 bits + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1a, 0x1b, 0x2, 0x24, 0x25, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, + 0x3, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x4, 0x4a, 0x4b, 0x5, 0x52, 0x53, 0x54, 0x55, 0x58, 0x59, + 0x5a, 0x5b, 0x64, 0x65, 0x67, 0x68, 0xa, 0xb, + }, + new short[]{ // 9 bits + 0x98, 0x99, 0x9a, 0x9b, 0xcc, 0xcd, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, + }, + new short[]{ // 10 bits + }, + new short[]{ // 11 bits + 0x8, 0xc, 0xd, + }, + new short[]{ // 12 bits + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f, + } + }; + + private static readonly short[][] WHITE_RUN_LENGTHS = new short[][]{ + new short[]{ // 4 bits + 2, 3, 4, 5, 6, 7, + }, + new short[]{ // 5 bits + 128, 8, 9, 64, 10, 11, + }, + new short[]{ // 6 bits + 192, 1664, 16, 17, 13, 14, 15, 1, 12, + }, + new short[]{ // 7 bits + 26, 21, 28, 27, 18, 24, 25, 22, 256, 23, 20, 19, + }, + new short[]{ // 8 bits + 33, 34, 35, 36, 37, 38, 31, 32, 29, 53, 54, 39, 40, 41, 42, 43, 44, 30, 61, 62, 63, 0, 320, 384, 45, + 59, 60, 46, 49, 50, 51, 52, 55, 56, 57, 58, 448, 512, 640, 576, 47, 48, + }, + new short[]{ // 9 bits + 1472, 1536, 1600, 1728, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, + }, + new short[]{ // 10 bits + }, + new short[]{ // 11 bits + 1792, 1856, 1920, + }, + new short[]{ // 12 bits + 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, + } + }; + + private static readonly Node EOL; + private static readonly Node FILL; + private static readonly Tree BlackRunTree; + private static readonly Tree WhiteRunTree; + private static readonly Tree EolOnlyTree; + private static readonly Tree CodeTree; + + const int VALUE_EOL = -2000; + const int VALUE_FILL = -1000; + const int 
VALUE_PASSMODE = -3000; + const int VALUE_HMODE = -4000; + + static CcittFaxDecoderStream() + { + EOL = new Node + { + IsLeaf = true, + Value = VALUE_EOL + }; + FILL = new Node + { + Value = VALUE_FILL + }; + FILL.Left = FILL; + FILL.Right = EOL; + + EolOnlyTree = new Tree(); + EolOnlyTree.Fill(12, 0, FILL); + EolOnlyTree.Fill(12, 1, EOL); + + BlackRunTree = new Tree(); + for (var i = 0; i < BLACK_CODES.Length; i++) + { + for (var j = 0; j < BLACK_CODES[i].Length; j++) + { + BlackRunTree.Fill(i + 2, BLACK_CODES[i][j], BLACK_RUN_LENGTHS[i][j]); + } + } + BlackRunTree.Fill(12, 0, FILL); + BlackRunTree.Fill(12, 1, EOL); + + WhiteRunTree = new Tree(); + + for (var i = 0; i < WHITE_CODES.Length; i++) + { + for (var j = 0; j < WHITE_CODES[i].Length; j++) + { + WhiteRunTree.Fill(i + 4, WHITE_CODES[i][j], WHITE_RUN_LENGTHS[i][j]); + } + } + + WhiteRunTree.Fill(12, 0, FILL); + WhiteRunTree.Fill(12, 1, EOL); + + CodeTree = new Tree(); + CodeTree.Fill(4, 1, VALUE_PASSMODE); // pass mode + CodeTree.Fill(3, 1, VALUE_HMODE); // H mode + CodeTree.Fill(1, 1, 0); // V(0) + CodeTree.Fill(3, 3, 1); // V_R(1) + CodeTree.Fill(6, 3, 2); // V_R(2) + CodeTree.Fill(7, 3, 3); // V_R(3) + CodeTree.Fill(3, 2, -1); // V_L(1) + CodeTree.Fill(6, 2, -2); // V_L(2) + CodeTree.Fill(7, 2, -3); // V_L(3) + } + } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs index f0520ecd2..21d4d8565 100644 --- a/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs @@ -3,14 +3,16 @@ using System; using System.IO; using Tokens; + using CcittFax; using Util; /// /// Decodes image data that has been encoded using either Group 3 or Group 4. 
- /// + /// /// Ported from https://github.com/apache/pdfbox/blob/714156a15ea6fcfe44ac09345b01e192cbd74450/pdfbox/src/main/java/org/apache/pdfbox/filter/CCITTFaxFilter.java + /// /// - internal sealed class CcittFaxDecodeFilter : IFilter + public sealed class CcittFaxDecodeFilter : IFilter { /// public bool IsSupported { get; } = true; diff --git a/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs index 818604961..4dd158db3 100644 --- a/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs @@ -3,7 +3,11 @@ using System; using Tokens; - internal sealed class DctDecodeFilter : IFilter + /// + /// DCT (Discrete Cosine Transform) Filter indicates data is encoded in JPEG format. + /// This filter is not implemented and will not be used during parsing. + /// + public sealed class DctDecodeFilter : IFilter { /// public bool IsSupported { get; } = false; diff --git a/src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs b/src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs index f6049216a..0c69890e4 100644 --- a/src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs +++ b/src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs @@ -1,26 +1,24 @@ namespace UglyToad.PdfPig.Filters { - using System; using System.Collections.Generic; - using System.Linq; - using Core; using Tokens; - using UglyToad.PdfPig.Util; - /// /// /// The default implementation of the . /// - public class DefaultFilterProvider : IFilterProvider + public sealed class DefaultFilterProvider : BaseFilterProvider { - private readonly IReadOnlyDictionary filterInstances; - /// /// The single instance of this provider. 
/// public static readonly IFilterProvider Instance = new DefaultFilterProvider(); - private DefaultFilterProvider() + /// + private DefaultFilterProvider() : base(GetDictionary()) + { + } + + private static Dictionary GetDictionary() { var ascii85 = new Ascii85Filter(); var asciiHex = new AsciiHexDecodeFilter(); @@ -32,93 +30,25 @@ private DefaultFilterProvider() var runLength = new RunLengthFilter(); var lzw = new LzwFilter(); - filterInstances = new Dictionary + return new Dictionary { - {NameToken.Ascii85Decode.Data, ascii85}, - {NameToken.Ascii85DecodeAbbreviation.Data, ascii85}, - {NameToken.AsciiHexDecode.Data, asciiHex}, - {NameToken.AsciiHexDecodeAbbreviation.Data, asciiHex}, - {NameToken.CcittfaxDecode.Data, ccitt}, - {NameToken.CcittfaxDecodeAbbreviation.Data, ccitt}, - {NameToken.DctDecode.Data, dct}, - {NameToken.DctDecodeAbbreviation.Data, dct}, - {NameToken.FlateDecode.Data, flate}, - {NameToken.FlateDecodeAbbreviation.Data, flate}, - {NameToken.Jbig2Decode.Data, jbig2}, - {NameToken.JpxDecode.Data, jpx}, - {NameToken.RunLengthDecode.Data, runLength}, - {NameToken.RunLengthDecodeAbbreviation.Data, runLength}, - {NameToken.LzwDecode, lzw}, - {NameToken.LzwDecodeAbbreviation, lzw} + { NameToken.Ascii85Decode.Data, ascii85 }, + { NameToken.Ascii85DecodeAbbreviation.Data, ascii85 }, + { NameToken.AsciiHexDecode.Data, asciiHex }, + { NameToken.AsciiHexDecodeAbbreviation.Data, asciiHex }, + { NameToken.CcittfaxDecode.Data, ccitt }, + { NameToken.CcittfaxDecodeAbbreviation.Data, ccitt }, + { NameToken.DctDecode.Data, dct }, + { NameToken.DctDecodeAbbreviation.Data, dct }, + { NameToken.FlateDecode.Data, flate }, + { NameToken.FlateDecodeAbbreviation.Data, flate }, + { NameToken.Jbig2Decode.Data, jbig2 }, + { NameToken.JpxDecode.Data, jpx }, + { NameToken.RunLengthDecode.Data, runLength }, + { NameToken.RunLengthDecodeAbbreviation.Data, runLength }, + { NameToken.LzwDecode.Data, lzw }, + { NameToken.LzwDecodeAbbreviation.Data, lzw } }; } - - /// - public 
IReadOnlyList GetFilters(DictionaryToken dictionary) - { - if (dictionary is null) - { - throw new ArgumentNullException(nameof(dictionary)); - } - - var token = dictionary.GetObjectOrDefault(NameToken.Filter, NameToken.F); - if (token is null) - { - return Array.Empty(); - } - - switch (token) - { - case ArrayToken filters: - var result = new IFilter[filters.Data.Count]; - for (var i = 0; i < filters.Data.Count; i++) - { - var filterToken = filters.Data[i]; - var filterName = ((NameToken) filterToken).Data; - result[i] = GetFilterStrict(filterName); - } - - return result; - case NameToken name: - return new[] { GetFilterStrict(name.Data) }; - default: - throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}."); - } - } - - /// - public IReadOnlyList GetNamedFilters(IReadOnlyList names) - { - if (names is null) - { - throw new ArgumentNullException(nameof(names)); - } - - var result = new List(); - - foreach (var name in names) - { - result.Add(GetFilterStrict(name)); - } - - return result; - } - - private IFilter GetFilterStrict(string name) - { - if (!filterInstances.TryGetValue(name, out var factory)) - { - throw new NotSupportedException($"The filter with the name {name} is not supported yet. Please raise an issue."); - } - - return factory; - } - - /// - public IReadOnlyList GetAllFilters() - { - return filterInstances.Values.Distinct().ToList(); - } - } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Filters/FlateFilter.cs b/src/UglyToad.PdfPig/Filters/FlateFilter.cs index e609a348c..35099c926 100644 --- a/src/UglyToad.PdfPig/Filters/FlateFilter.cs +++ b/src/UglyToad.PdfPig/Filters/FlateFilter.cs @@ -7,7 +7,6 @@ using Tokens; using Util; - /// /// /// The Flate filter is based on the public-domain zlib/deflate compression method, a variable-length Lempel-Ziv /// adaptive compression method cascaded with adaptive Huffman coding. 
@@ -18,7 +17,7 @@ /// See section 3.3.3 of the spec (version 1.7) for details on the FlateDecode filter. /// The flate decode filter may have a predictor function to further compress the stream. /// - internal sealed class FlateFilter : IFilter + public sealed class FlateFilter : IFilter { // Defaults are from table 3.7 in the spec (version 1.7) private const int DefaultColors = 1; @@ -86,6 +85,7 @@ private static byte[] Decompress(byte[] input) } } + /// public byte[] Encode(Stream input, DictionaryToken streamDictionary, int index) { const int headerLength = 2; diff --git a/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs b/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs index 7850a74b7..f32400433 100644 --- a/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs @@ -3,7 +3,11 @@ using System; using Tokens; - internal sealed class Jbig2DecodeFilter : IFilter + /// + /// JBIG2 Filter for monochrome image data. + /// This filter is not implemented and will not be used during parsing. + /// + public sealed class Jbig2DecodeFilter : IFilter { /// public bool IsSupported { get; } = false; diff --git a/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs index 984bf6cd4..24d9dd81a 100644 --- a/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs @@ -3,7 +3,11 @@ using System; using Tokens; - internal sealed class JpxDecodeFilter : IFilter + /// + /// JPX Filter (JPEG2000) for image data. + /// This filter is not implemented and will not be used during parsing. 
+ /// + public sealed class JpxDecodeFilter : IFilter { /// public bool IsSupported { get; } = false; diff --git a/src/UglyToad.PdfPig/Filters/BitStream.cs b/src/UglyToad.PdfPig/Filters/Lzw/BitStream.cs similarity index 92% rename from src/UglyToad.PdfPig/Filters/BitStream.cs rename to src/UglyToad.PdfPig/Filters/Lzw/BitStream.cs index f729e5701..239c6bbca 100644 --- a/src/UglyToad.PdfPig/Filters/BitStream.cs +++ b/src/UglyToad.PdfPig/Filters/Lzw/BitStream.cs @@ -1,4 +1,4 @@ -namespace UglyToad.PdfPig.Filters +namespace UglyToad.PdfPig.Filters.Lzw { using System; @@ -53,9 +53,9 @@ public int Get(int numberOfBits) } // 'And' out the leading bits. - var firstBitOfDataWithinInt = (sizeof(int) * 8) - numberOfBits; + var firstBitOfDataWithinInt = sizeof(int) * 8 - numberOfBits; result &= (int)(0xffffffff >> firstBitOfDataWithinInt); - + currentWithinByteBitOffset = endWithinByteBitOffset; return result; diff --git a/src/UglyToad.PdfPig/Filters/LzwFilter.cs b/src/UglyToad.PdfPig/Filters/LzwFilter.cs index eaaae251b..bd086cd50 100644 --- a/src/UglyToad.PdfPig/Filters/LzwFilter.cs +++ b/src/UglyToad.PdfPig/Filters/LzwFilter.cs @@ -4,15 +4,15 @@ namespace UglyToad.PdfPig.Filters { using System; using System.Collections.Generic; + using Lzw; using Tokens; using Util; - /// /// /// The LZW (Lempel-Ziv-Welch) filter is a variable-length, adaptive compression method /// that has been adopted as one of the standard compression methods in the Tag Image File Format (TIFF) standard. 
/// - internal sealed class LzwFilter : IFilter + public sealed class LzwFilter : IFilter { private const int DefaultColors = 1; private const int DefaultBitsPerComponent = 8; diff --git a/src/UglyToad.PdfPig/Filters/RunLengthFilter.cs b/src/UglyToad.PdfPig/Filters/RunLengthFilter.cs index 6e8845d3f..1a4d102f0 100644 --- a/src/UglyToad.PdfPig/Filters/RunLengthFilter.cs +++ b/src/UglyToad.PdfPig/Filters/RunLengthFilter.cs @@ -2,14 +2,13 @@ { using System; using Tokens; - using UglyToad.PdfPig.Core; + using Core; - /// /// - /// The Run Length filterencodes data in a simple byte-oriented format based on run length. + /// The Run Length filter encodes data in a simple byte-oriented format based on run length. /// The encoded data is a sequence of runs, where each run consists of a length byte followed by 1 to 128 bytes of data. /// - internal sealed class RunLengthFilter : IFilter + public sealed class RunLengthFilter : IFilter { private const byte EndOfDataLength = 128; diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index 7e5d6ca1d..8cf852927 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -106,7 +106,7 @@ private static PdfDocument OpenDocument( ISeekableTokenScanner scanner, ParsingOptions parsingOptions) { - var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance); + var filterProvider = new FilterProviderWithLookup(parsingOptions.FilterProvider ?? DefaultFilterProvider.Instance); CrossReferenceTable? 
crossReferenceTable = null; diff --git a/src/UglyToad.PdfPig/ParsingOptions.cs b/src/UglyToad.PdfPig/ParsingOptions.cs index d509bb00c..c4191c411 100644 --- a/src/UglyToad.PdfPig/ParsingOptions.cs +++ b/src/UglyToad.PdfPig/ParsingOptions.cs @@ -1,5 +1,6 @@ namespace UglyToad.PdfPig { + using Filters; using System.Collections.Generic; using Logging; @@ -50,5 +51,10 @@ public sealed class ParsingOptions /// forms and images when missing. /// public bool SkipMissingFonts { get; set; } = false; + + /// + /// Filter provider to use while parsing the document. The <see cref="DefaultFilterProvider"/> will be used if set to null. + /// + public IFilterProvider? FilterProvider { get; set; } = null; } } \ No newline at end of file