diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs index 6af787324..d2538079f 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs @@ -50,7 +50,15 @@ public static IEnumerable> NearestNeighbours(IReadOnlyList kdTree = new KdTree(elements, candidatesPoint); ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; @@ -118,7 +126,15 @@ public static IEnumerable> NearestNeighbours(IReadOnlyList kdTree = new KdTree(elements, candidatesPoint); ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; @@ -186,7 +202,15 @@ public static IEnumerable> NearestNeighbours(IReadOnlyList /// Contains helpful tools for distance measures. @@ -169,12 +168,11 @@ public static int FindIndexNearest(T element, IReadOnlyList candidates, distance = double.MaxValue; int closestPointIndex = -1; - var candidatesPoints = candidates.Select(candidatePoint).ToList(); var pivot = pivotPoint(element); for (var i = 0; i < candidates.Count; i++) { - double currentDistance = distanceMeasure(pivot, candidatesPoints[i]); + double currentDistance = distanceMeasure(pivot, candidatePoint(candidates[i])); if (currentDistance < distance && !candidates[i].Equals(element)) { distance = currentDistance; @@ -211,12 +209,11 @@ public static int FindIndexNearest(T element, IReadOnlyList candidates, distance = double.MaxValue; int closestLineIndex = -1; - var candidatesLines = candidates.Select(candidateLine).ToList(); var pivot = pivotLine(element); for (var i = 0; i < candidates.Count; i++) { - double currentDistance = distanceMeasure(pivot, candidatesLines[i]); + double currentDistance = distanceMeasure(pivot, candidateLine(candidates[i])); if (currentDistance < distance && !candidates[i].Equals(element)) { distance = currentDistance; diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DuplicateOverlappingTextProcessor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DuplicateOverlappingTextProcessor.cs index ed7dc87b1..f26b48b98 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DuplicateOverlappingTextProcessor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DuplicateOverlappingTextProcessor.cs @@ -3,7 +3,6 @@ using System.Collections.Generic; using System.Linq; using UglyToad.PdfPig.Content; - using UglyToad.PdfPig.PdfFonts; /// /// Checks if each letter is a duplicate and overlaps any other letter and remove the duplicate, and flag the remaining as bold. @@ -24,18 +23,17 @@ public static IReadOnlyList Get(IEnumerable letters) return letters?.ToList(); } - var queue = new Queue(letters); - var cleanLetters = new List() { queue.Dequeue() }; // dequeue the first letter + // Use a dictionary keyed by (Value, FontName) to look up candidate duplicates in O(1) + var duplicateIndex = new Dictionary<(string, string), List>(); + var cleanLetters = new List(); - while (queue.Count > 0) + foreach (var letter in letters) { - var letter = queue.Dequeue(); bool addLetter = true; int duplicatesOverlappingIndex = -1; - var duplicates = cleanLetters.Where(l => l.Value.Equals(letter.Value) && l.FontName.Equals(letter.FontName)); // do other checks? - - if (duplicates.Any()) + var key = (letter.Value, letter.FontName); + if (duplicateIndex.TryGetValue(key, out var candidateIndices)) { double tolerance = letter.GlyphRectangle.Width / (letter.Value.Length == 0 ? 1 : letter.Value.Length) / 3.0; double minX = letter.GlyphRectangle.BottomLeft.X - tolerance; @@ -43,22 +41,33 @@ public static IReadOnlyList Get(IEnumerable letters) double minY = letter.GlyphRectangle.BottomLeft.Y - tolerance; double maxY = letter.GlyphRectangle.BottomLeft.Y + tolerance; - var duplicatesOverlapping = duplicates.FirstOrDefault(l => minX <= l.GlyphRectangle.BottomLeft.X && - maxX >= l.GlyphRectangle.BottomLeft.X && - minY <= l.GlyphRectangle.BottomLeft.Y && - maxY >= l.GlyphRectangle.BottomLeft.Y); - - if (duplicatesOverlapping != default) + for (int ci = 0; ci < candidateIndices.Count; ci++) { - // duplicate overlapping letter was found, keeping the existing one and not adding this one. - addLetter = false; - duplicatesOverlappingIndex = cleanLetters.IndexOf(duplicatesOverlapping); + int idx = candidateIndices[ci]; + var l = cleanLetters[idx]; + if (minX <= l.GlyphRectangle.BottomLeft.X && + maxX >= l.GlyphRectangle.BottomLeft.X && + minY <= l.GlyphRectangle.BottomLeft.Y && + maxY >= l.GlyphRectangle.BottomLeft.Y) + { + addLetter = false; + duplicatesOverlappingIndex = idx; + break; + } } } if (addLetter) { + int newIndex = cleanLetters.Count; cleanLetters.Add(letter); + + if (!duplicateIndex.TryGetValue(key, out var list)) + { + list = new List(); + duplicateIndex[key] = list; + } + list.Add(newIndex); } else if (duplicatesOverlappingIndex != -1) { diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs index b6fbf07c2..2af59319e 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs @@ -54,9 +54,6 @@ public PdfPoint FindNearestNeighbour(PdfPoint pivot, Func public class KdTree { - private readonly KdTreeComparerY kdTreeComparerY = new KdTreeComparerY(); - private readonly KdTreeComparerX kdTreeComparerX = new KdTreeComparerX(); - /// /// The root of the tree. /// @@ -111,11 +108,11 @@ private KdTreeNode BuildTree(Span> P, int depth = 0) if (depth % 2 == 0) { - P.Sort(kdTreeComparerX); + P.Sort((p0, p1) => p0.Value.X.CompareTo(p1.Value.X)); } else { - P.Sort(kdTreeComparerY); + P.Sort((p0, p1) => p0.Value.Y.CompareTo(p1.Value.Y)); } if (P.Length == 2) @@ -131,6 +128,26 @@ private KdTreeNode BuildTree(Span> P, int depth = 0) return new KdTreeNode(vLeft, vRight, P[median], depth); } #else + private sealed class KdTreeComparerY : IComparer> + { + public static readonly KdTreeComparerY Shared = new KdTreeComparerY(); + + public int Compare(KdTreeElement p0, KdTreeElement p1) + { + return p0.Value.Y.CompareTo(p1.Value.Y); + } + } + + private sealed class KdTreeComparerX : IComparer> + { + public static readonly KdTreeComparerX Shared = new KdTreeComparerX(); + + public int Compare(KdTreeElement p0, KdTreeElement p1) + { + return p0.Value.X.CompareTo(p1.Value.X); + } + } + private KdTreeNode BuildTree(ArraySegment> P, int depth = 0) { if (P.Count == 0) @@ -145,13 +162,13 @@ private KdTreeNode BuildTree(ArraySegment> P, int depth = 0) if (depth % 2 == 0) { - P.Sort(kdTreeComparerX); + P.Sort(KdTreeComparerX.Shared); } else { - P.Sort(kdTreeComparerY); + P.Sort(KdTreeComparerY.Shared); } - + if (P.Count == 2) { return new KdTreeNode(new KdTreeLeaf(P.GetAt(0), depth + 1), null, P.GetAt(1), depth); @@ -179,13 +196,14 @@ private KdTreeNode BuildTree(ArraySegment> P, int depth = 0) /// The nearest neighbour's element. public T FindNearestNeighbour(T pivot, Func pivotPointFunc, Func distanceMeasure, out int index, out double distance) { - var result = FindNearestNeighbour(Root, pivot, pivotPointFunc, distanceMeasure); + var pivotPoint = pivotPointFunc(pivot); + var result = FindNearestNeighbour(Root, pivot, pivotPoint, distanceMeasure); index = result.Item1 != null ? result.Item1.Index : -1; distance = result.Item2 ?? double.NaN; return result.Item1 != null ? result.Item1.Element : default; } - private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, T pivot, Func pivotPointFunc, Func distance) + private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, T pivot, PdfPoint pivotPoint, Func distance) { if (node == null) { @@ -197,23 +215,22 @@ private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, { return (null, null); } - return (node, distance(node.Value, pivotPointFunc(pivot))); + return (node, distance(node.Value, pivotPoint)); } else { - var point = pivotPointFunc(pivot); var currentNearestNode = node; - var currentDistance = distance(node.Value, point); + var currentDistance = distance(node.Value, pivotPoint); KdTreeNode newNode = null; double? newDist = null; - var pointValue = node.IsAxisCutX ? point.X : point.Y; + var pointValue = node.IsAxisCutX ? pivotPoint.X : pivotPoint.Y; if (pointValue < node.L) { // start left - (newNode, newDist) = FindNearestNeighbour(node.LeftChild, pivot, pivotPointFunc, distance); + (newNode, newDist) = FindNearestNeighbour(node.LeftChild, pivot, pivotPoint, distance); if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot)) { @@ -223,13 +240,13 @@ private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, if (node.RightChild != null && pointValue + currentDistance >= node.L) { - (newNode, newDist) = FindNearestNeighbour(node.RightChild, pivot, pivotPointFunc, distance); + (newNode, newDist) = FindNearestNeighbour(node.RightChild, pivot, pivotPoint, distance); } } else { // start right - (newNode, newDist) = FindNearestNeighbour(node.RightChild, pivot, pivotPointFunc, distance); + (newNode, newDist) = FindNearestNeighbour(node.RightChild, pivot, pivotPoint, distance); if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot)) { @@ -239,7 +256,7 @@ private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, if (node.LeftChild != null && pointValue - currentDistance <= node.L) { - (newNode, newDist) = FindNearestNeighbour(node.LeftChild, pivot, pivotPointFunc, distance); + (newNode, newDist) = FindNearestNeighbour(node.LeftChild, pivot, pivotPoint, distance); } } @@ -258,7 +275,7 @@ private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, /// /// Get the k nearest neighbours to the pivot element. /// Might return more than k neighbours if points are equidistant. - /// Use if only looking for the (single) closest point. + /// Use if only looking for the (single) closest point. /// /// The element for which to find the k nearest neighbours. /// The number of neighbours to return. Might return more than k neighbours if points are equidistant. @@ -267,13 +284,24 @@ private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, /// Returns a list of tuples of the k nearest neighbours. Tuples are (element, index, distance). public IReadOnlyList<(T, int, double)> FindNearestNeighbours(T pivot, int k, Func pivotPointFunc, Func distanceMeasure) { + var pivotPoint = pivotPointFunc(pivot); var kdTreeNodes = new KNearestNeighboursQueue(k); - FindNearestNeighbours(Root, pivot, k, pivotPointFunc, distanceMeasure, kdTreeNodes); - return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToArray(); + FindNearestNeighbours(Root, pivot, k, pivotPoint, distanceMeasure, kdTreeNodes); + + var results = new List<(T, int, double)>(); + for (int i = 0; i < kdTreeNodes.Count; i++) + { + double dist = kdTreeNodes.Keys[i]; + foreach (var e in kdTreeNodes.Values[i]) + { + results.Add((e.Element, e.Index, dist)); + } + } + return results; } private static (KdTreeNode, double) FindNearestNeighbours(KdTreeNode node, T pivot, int k, - Func pivotPointFunc, Func distance, KNearestNeighboursQueue queue) + PdfPoint pivotPoint, Func distance, KNearestNeighboursQueue queue) { if (node == null) { @@ -286,7 +314,7 @@ private static (KdTreeNode, double) FindNearestNeighbours(KdTreeNode node, return (null, double.NaN); } - var currentDistance = distance(node.Value, pivotPointFunc(pivot)); + var currentDistance = distance(node.Value, pivotPoint); var currentNearestNode = node; if (!queue.IsFull || currentDistance <= queue.LastDistance) @@ -300,9 +328,8 @@ private static (KdTreeNode, double) FindNearestNeighbours(KdTreeNode node, } else { - var point = pivotPointFunc(pivot); var currentNearestNode = node; - var currentDistance = distance(node.Value, point); + var currentDistance = distance(node.Value, pivotPoint); if ((!queue.IsFull || currentDistance <= queue.LastDistance) && !node.Element.Equals(pivot)) { queue.Add(currentDistance, currentNearestNode); @@ -313,12 +340,12 @@ private static (KdTreeNode, double) FindNearestNeighbours(KdTreeNode node, KdTreeNode newNode = null; double newDist = double.NaN; - var pointValue = node.IsAxisCutX ? point.X : point.Y; + var pointValue = node.IsAxisCutX ? pivotPoint.X : pivotPoint.Y; if (pointValue < node.L) { // start left - (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPointFunc, distance, queue); + (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPoint, distance, queue); if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot)) { @@ -329,13 +356,13 @@ private static (KdTreeNode, double) FindNearestNeighbours(KdTreeNode node, if (node.RightChild != null && pointValue + currentDistance >= node.L) { - (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPointFunc, distance, queue); + (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPoint, distance, queue); } } else { // start right - (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPointFunc, distance, queue); + (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPoint, distance, queue); if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot)) { @@ -346,7 +373,7 @@ private static (KdTreeNode, double) FindNearestNeighbours(KdTreeNode node, if (node.LeftChild != null && pointValue - currentDistance <= node.L) { - (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPointFunc, distance, queue); + (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPoint, distance, queue); } } @@ -395,9 +422,14 @@ public void Add(double key, KdTreeNode value) if (this[key].Add(value)) { - var last = this.Last(); - LastElement = last.Value.Last(); - LastDistance = last.Key; + LastDistance = Keys[Count - 1]; + var lastSet = Values[Count - 1]; + KdTreeNode lastElement = null; + foreach (var e in lastSet) + { + lastElement = e; + } + LastElement = lastElement; } } } @@ -418,23 +450,7 @@ internal KdTreeElement(int index, PdfPoint point, R value) public R Element { get; } } - - private sealed class KdTreeComparerY : IComparer> - { - public int Compare(KdTreeElement p0, KdTreeElement p1) - { - return p0.Value.Y.CompareTo(p1.Value.Y); - } - } - - private sealed class KdTreeComparerX : IComparer> - { - public int Compare(KdTreeElement p0, KdTreeElement p1) - { - return p0.Value.X.CompareTo(p1.Value.X); - } - } - + /// /// K-D tree leaf. /// diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs index 56bbef6eb..e9e1444ec 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs @@ -3,7 +3,6 @@ using Content; using Core; using System; - using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; @@ -154,57 +153,72 @@ public static bool GetSpacingEstimation(IReadOnlyList words, { ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; - var withinLineDistList = new ConcurrentBag(); - var betweenLineDistList = new ConcurrentBag(); + var withinLineDistList = new List(); + var betweenLineDistList = new List(); // 1. Estimate within line and between line spacing KdTree kdTreeBottomLeft = new KdTree(words, w => w.BoundingBox.BottomLeft); - Parallel.For(0, words.Count, parallelOptions, i => - { - var word = words[i]; - - // Within-line distance - // 1.1.1 Find the 2 closest neighbours words to the candidate, using euclidean distance. - foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, Distances.Euclidean)) + Parallel.For(0, words.Count, parallelOptions, + () => (wl: new List(), bl: new List()), + (i, _, local) => { - // 1.1.2 Check if the neighbour word is within the angle of the candidate - if (wlBounds.Contains(AngleWL(word, n.Item1))) - { - withinLineDistList.Add(Distances.Euclidean(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft)); - } - } + var word = words[i]; - // Between-line distance - // 1.2.1 Find the 2 closest neighbours words to the candidate, using euclidean distance. - foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.TopLeft, Distances.Euclidean)) - { - // 1.2.2 Check if the candidate words is within the angle - var angle = AngleBL(word, n.Item1); - if (blBounds.Contains(angle)) + // Within-line distance + // 1.1.1 Find the 2 closest neighbours words to the candidate, using euclidean distance. + foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, Distances.Euclidean)) { - // 1.2.3 Compute the vertical (between-line) distance between the candidate - // and the neighbour and add it to the between-line distances list - double hypotenuse = Distances.Euclidean(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid); - - // Angle is kept within [-90, 90] - if (angle > 90) + // 1.1.2 Check if the neighbour word is within the angle of the candidate + if (wlBounds.Contains(AngleWL(word, n.Item1))) { - angle -= 180; + local.wl.Add(Distances.Euclidean(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft)); } + } - var dist = Math.Abs(hypotenuse * Math.Cos((90 - angle) * Math.PI / 180)) - - word.BoundingBox.Height / 2.0 - n.Item1.BoundingBox.Height / 2.0; - - // The perpendicular distance can be negative because of the subtractions. - // Could occur when words are overlapping, we ignore that. - if (dist >= 0) + // Between-line distance + // 1.2.1 Find the 2 closest neighbours words to the candidate, using euclidean distance. + foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.TopLeft, Distances.Euclidean)) + { + // 1.2.2 Check if the candidate words is within the angle + var angle = AngleBL(word, n.Item1); + if (blBounds.Contains(angle)) { - betweenLineDistList.Add(dist); + // 1.2.3 Compute the vertical (between-line) distance between the candidate + // and the neighbour and add it to the between-line distances list + double hypotenuse = Distances.Euclidean(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid); + + // Angle is kept within [-90, 90] + if (angle > 90) + { + angle -= 180; + } + + var dist = Math.Abs(hypotenuse * Math.Cos((90 - angle) * Math.PI / 180)) + - word.BoundingBox.Height / 2.0 - n.Item1.BoundingBox.Height / 2.0; + + // The perpendicular distance can be negative because of the subtractions. + // Could occur when words are overlapping, we ignore that. + if (dist >= 0) + { + local.bl.Add(dist); + } } } - } - }); + + return local; + }, + local => + { + lock (withinLineDistList) + { + withinLineDistList.AddRange(local.wl); + } + lock (betweenLineDistList) + { + betweenLineDistList.AddRange(local.bl); + } + }); // Compute average peak value of distribution double? withinLinePeak = GetPeakAverageDistance(withinLineDistList, wlBinSize); @@ -221,9 +235,9 @@ public static bool GetSpacingEstimation(IReadOnlyList words, /// /// The set of distances to average. /// - private static double? GetPeakAverageDistance(IEnumerable distances, int binLength = 1) + private static double? GetPeakAverageDistance(List distances, int binLength = 1) { - if (!distances.Any()) + if (distances.Count == 0) { return null; } @@ -233,7 +247,16 @@ public static bool GetSpacingEstimation(IReadOnlyList words, throw new ArgumentException("DocstrumBoundingBoxes: the bin length must be positive when commputing peak average distance.", nameof(binLength)); } - double maxDbl = Math.Ceiling(distances.Max()); + double maxDbl = distances[0]; + for (int i = 1; i < distances.Count; i++) + { + if (distances[i] > maxDbl) + { + maxDbl = distances[i]; + } + } + + maxDbl = Math.Ceiling(maxDbl); if (maxDbl > int.MaxValue) { throw new OverflowException($"Error while casting maximum distance of {maxDbl} to integer."); @@ -249,30 +272,49 @@ public static bool GetSpacingEstimation(IReadOnlyList words, binLength = binLength > max ? max : binLength; } - var bins = Enumerable.Range(0, (int)Math.Ceiling(max / (double)binLength) + 1) - .Select(x => x * binLength) - .ToDictionary(x => x, _ => new List()); + int binCount = (int)Math.Ceiling(max / (double)binLength) + 1; + var bins = new List[binCount]; + for (int i = 0; i < binCount; i++) + { + bins[i] = new List(); + } - foreach (var distance in distances) + for (int i = 0; i < distances.Count; i++) { + var distance = distances[i]; int bin = (int)Math.Floor(distance / binLength); if (bin < 0) { throw new ArgumentOutOfRangeException(nameof(bin), "DocstrumBoundingBoxes: Negative distance found while commputing peak average distance."); } - bins[bins.Keys.ElementAt(bin)].Add(distance); + if (bin >= binCount) + { + bin = binCount - 1; + } + bins[bin].Add(distance); } - var best = default(List); - foreach (var bin in bins) + List best = null; + for (int i = 0; i < binCount; i++) { - if (best == null || bin.Value.Count > best.Count) + var bin = bins[i]; + if (best == null || bin.Count > best.Count) { - best = bin.Value; + best = bin; } } - return best?.Average(); + if (best == null || best.Count == 0) + { + return null; + } + + double sum = 0; + for (int i = 0; i < best.Count; i++) + { + sum += best[i]; + } + return sum / best.Count; } #endregion @@ -417,28 +459,7 @@ private static double PerpendicularOverlappingDistance(PdfLine line1, PdfLine li return double.PositiveInfinity; } } - - private sealed class PdfPointXYComparer : IComparer - { - public static readonly PdfPointXYComparer Instance = new(); - - public int Compare(PdfPoint p1, PdfPoint p2) - { - int comp = p1.X.CompareTo(p2.X); - return comp == 0 ? p1.Y.CompareTo(p2.Y) : comp; - } - } - - private sealed class PdfPointYComparer : IComparer - { - public static readonly PdfPointYComparer Instance = new(); - - public int Compare(PdfPoint p1, PdfPoint p2) - { - return p1.Y.CompareTo(p2.Y); - } - } - + /// /// Get the structural blocking parameters. /// @@ -486,22 +507,30 @@ public static bool GetStructuralBlockingParameters(PdfLine i, PdfLine j, double if (dXj != 0) { - ps.Sort(PdfPointXYComparer.Instance); + ps.Sort((p1, p2) => + { + int comp = p1.X.CompareTo(p2.X); + return comp == 0 ? p1.Y.CompareTo(p2.Y) : comp; + }); } else if (dYj != 0) { - ps.Sort(PdfPointYComparer.Instance); + ps.Sort((p1, p2) => p1.Y.CompareTo(p2.Y)); } #else PdfPoint[] ps = [j.Point1, j.Point2, Aj.Value, Bj.Value]; if (dXj != 0) { - Array.Sort(ps, PdfPointXYComparer.Instance); + Array.Sort(ps, (p1, p2) => + { + int comp = p1.X.CompareTo(p2.X); + return comp == 0 ? p1.Y.CompareTo(p2.Y) : comp; + }); } else if (dYj != 0) { - Array.Sort(ps, PdfPointYComparer.Instance); + Array.Sort(ps, (p1, p2) => p1.Y.CompareTo(p2.Y)); } #endif diff --git a/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs b/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs index 8d2b988ef..a0c567607 100644 --- a/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs +++ b/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs @@ -222,7 +222,31 @@ public static PdfRectangle MinimumAreaRectangle(PdfPoint[] points) throw new ArgumentException("MinimumAreaRectangle(): points cannot be null and must contain at least one point.", nameof(points)); } - return ParametricPerpendicularProjection(GrahamScan(points.Distinct()).ToArray()); + var distinctSet = new HashSet(points); + var distinctPoints = new PdfPoint[distinctSet.Count]; + distinctSet.CopyTo(distinctPoints); + + var hull = GrahamScan(distinctPoints); + + PdfPoint[] hullArray; + if (hull is PdfPoint[] arr) + { + hullArray = arr; + } + else if (hull is List list) + { +#if NET6_0_OR_GREATER + return ParametricPerpendicularProjection(System.Runtime.InteropServices.CollectionsMarshal.AsSpan(list)); +#else + hullArray = list.ToArray(); +#endif + } + else + { + hullArray = hull.ToArray(); + } + + return ParametricPerpendicularProjection(hullArray); } /// @@ -238,9 +262,16 @@ public static PdfRectangle OrientedBoundingBox(IReadOnlyList points) } // Fitting a line through the points - // to find the orientation (slope) - double x0 = points.Average(p => p.X); - double y0 = points.Average(p => p.Y); + double sumX = 0; + double sumY = 0; + for (int i = 0; i < points.Count; i++) + { + sumX += points[i].X; + sumY += points[i].Y; + } + double x0 = sumX / points.Count; + double y0 = sumY / points.Count; + double sumProduct = 0; double sumDiffSquaredX = 0; @@ -265,11 +296,17 @@ public static PdfRectangle OrientedBoundingBox(IReadOnlyList points) sin, cos, 0, 0, 0, 1); - var transformedPoints = points.Select(p => inverseRotation.Transform(p)).ToArray(); - var aabb = new PdfRectangle(transformedPoints.Min(p => p.X), - transformedPoints.Min(p => p.Y), - transformedPoints.Max(p => p.X), - transformedPoints.Max(p => p.Y)); + var first = inverseRotation.Transform(points[0]); + double minX = first.X, minY = first.Y, maxX = first.X, maxY = first.Y; + for (int i = 1; i < points.Count; i++) + { + var tp = inverseRotation.Transform(points[i]); + if (tp.X < minX) minX = tp.X; + if (tp.Y < minY) minY = tp.Y; + if (tp.X > maxX) maxX = tp.X; + if (tp.Y > maxY) maxY = tp.Y; + } + var aabb = new PdfRectangle(minX, minY, maxX, maxY); // Rotate back the AABB to obtain to oriented bounding box (OBB) var rotateBack = new TransformationMatrix( @@ -286,18 +323,7 @@ public static IReadOnlyCollection GrahamScan(IEnumerable poi { return GrahamScan(points.ToArray()); } - - private sealed class PdfPointXYComparer : IComparer - { - public static readonly PdfPointXYComparer Instance = new(); - - public int Compare(PdfPoint p1, PdfPoint p2) - { - int comp = p1.X.CompareTo(p2.X); - return comp == 0 ? p1.Y.CompareTo(p2.Y) : comp; - } - } - + /// /// Algorithm to find the convex hull of the set of points with time complexity O(n log n). /// @@ -320,33 +346,40 @@ static double polarAngle(in PdfPoint point1, in PdfPoint point2) return Math.Atan2(point2.Y - point1.Y, point2.X - point1.X) % Math.PI; } - Array.Sort(points, PdfPointXYComparer.Instance); + Array.Sort(points, (p1, p2) => + { + int comp = p1.X.CompareTo(p2.X); + return comp == 0 ? p1.Y.CompareTo(p2.Y) : comp; + }); var P0 = points[0]; var groups = points.Skip(1).GroupBy(p => polarAngle(P0, p)).OrderBy(g => g.Key).ToArray(); - var sortedPoints = ArrayPool.Shared.Rent(groups.Length); + PdfPoint[]? buffer = null; + var sortedPoints = groups.Length <= 64 + ? stackalloc PdfPoint[groups.Length] + : buffer = ArrayPool.Shared.Rent(groups.Length); try { for (int i = 0; i < groups.Length; i++) { var group = groups[i]; - if (group.Count() == 1) - { - sortedPoints[i] = group.First(); - } - else + + PdfPoint farthest = default; + double maxDistSq = -1; + foreach (var p in group) { - // if more than one point has the same angle, - // remove all but the one that is farthest from P0 - sortedPoints[i] = group.OrderByDescending(p => + double dx = p.X - P0.X; + double dy = p.Y - P0.Y; + double distSq = dx * dx + dy * dy; + if (distSq > maxDistSq) { - double dx = p.X - P0.X; - double dy = p.Y - P0.Y; - return dx * dx + dy * dy; - }).First(); + maxDistSq = distSq; + farthest = p; + } } + sortedPoints[i] = farthest; } if (groups.Length < 2) @@ -354,27 +387,32 @@ static double polarAngle(in PdfPoint point1, in PdfPoint point2) return [P0, sortedPoints[0]]; } - var stack = new Stack(); - stack.Push(P0); - stack.Push(sortedPoints[0]); - stack.Push(sortedPoints[1]); + var hull = new List(groups.Length + 1); + hull.Add(P0); + hull.Add(sortedPoints[0]); + hull.Add(sortedPoints[1]); - for (int i = 2; i < groups.Length; i++) + for (int i = 2; i < groups.Length; ++i) { var point = sortedPoints[i]; - while (stack.Count > 1 && !ccw(stack.ElementAt(1), stack.Peek(), point)) + while (hull.Count > 1 && !ccw(hull[hull.Count - 2], hull[hull.Count - 1], point)) { - stack.Pop(); + hull.RemoveAt(hull.Count - 1); } - stack.Push(point); + hull.Add(point); } - return stack; + hull.Reverse(); + + return hull; } finally { - ArrayPool.Shared.Return(sortedPoints); + if (buffer is not null) + { + ArrayPool.Shared.Return(buffer); + } } } diff --git a/tools/UglyToad.PdfPig.Benchmarks/LayoutAnalysisBenchmarks.cs b/tools/UglyToad.PdfPig.Benchmarks/LayoutAnalysisBenchmarks.cs new file mode 100644 index 000000000..df4eeef48 --- /dev/null +++ b/tools/UglyToad.PdfPig.Benchmarks/LayoutAnalysisBenchmarks.cs @@ -0,0 +1,42 @@ +using BenchmarkDotNet.Attributes; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.DocumentLayoutAnalysis; +using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; +using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; + +namespace UglyToad.PdfPig.Benchmarks; + +[Config(typeof(NuGetPackageConfig))] +[MemoryDiagnoser(displayGenColumns: false)] +public class LayoutAnalysisBenchmarks +{ + private readonly Letter[] _letters; + private readonly Word[] _words; + + public LayoutAnalysisBenchmarks() + { + using (var doc = PdfDocument.Open("fseprd1102849.pdf")) + { + _letters = doc.GetPage(1).Letters.ToArray(); + _words = NearestNeighbourWordExtractor.Instance.GetWords(_letters).ToArray(); + } + } + + [Benchmark] + public IReadOnlyList GetWords_NearestNeighbourWord() + { + return NearestNeighbourWordExtractor.Instance.GetWords(_letters).ToArray(); + } + + [Benchmark] + public IReadOnlyList GetBlocks_Docstrum() + { + return DocstrumBoundingBoxes.Instance.GetBlocks(_words); + } + + [Benchmark] + public IReadOnlyList DuplicateOverlappingText() + { + return DuplicateOverlappingTextProcessor.Get(_letters).ToArray(); + } +} \ No newline at end of file diff --git a/tools/UglyToad.PdfPig.Benchmarks/NuGetPackageConfig.cs b/tools/UglyToad.PdfPig.Benchmarks/NuGetPackageConfig.cs new file mode 100644 index 000000000..d36bd6c97 --- /dev/null +++ b/tools/UglyToad.PdfPig.Benchmarks/NuGetPackageConfig.cs @@ -0,0 +1,25 @@ +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Environments; +using BenchmarkDotNet.Jobs; + +namespace UglyToad.PdfPig.Benchmarks; + +internal class NuGetPackageConfig : ManualConfig +{ + public NuGetPackageConfig() + { + var baseJob = Job.Default; + + var localJob = baseJob + .WithMsBuildArguments("/p:PdfPigVersion=Local") + .WithId("Local"); + + var latestJob = baseJob + .WithMsBuildArguments("/p:PdfPigVersion=Latest") + .WithId("Latest") + .AsBaseline(); + + AddJob(localJob.WithRuntime(CoreRuntime.Core80)); + AddJob(latestJob.WithRuntime(CoreRuntime.Core80)); + } +} \ No newline at end of file diff --git a/tools/UglyToad.PdfPig.Benchmarks/Program.cs b/tools/UglyToad.PdfPig.Benchmarks/Program.cs new file mode 100644 index 000000000..1995e7cdb --- /dev/null +++ b/tools/UglyToad.PdfPig.Benchmarks/Program.cs @@ -0,0 +1,13 @@ +using BenchmarkDotNet.Running; + +namespace UglyToad.PdfPig.Benchmarks +{ + internal class Program + { + static void Main(string[] args) + { + var summary = BenchmarkRunner.Run(); + Console.ReadKey(); + } + } +} diff --git a/tools/UglyToad.PdfPig.Benchmarks/UglyToad.PdfPig.Benchmarks.csproj b/tools/UglyToad.PdfPig.Benchmarks/UglyToad.PdfPig.Benchmarks.csproj new file mode 100644 index 000000000..7c77863b5 --- /dev/null +++ b/tools/UglyToad.PdfPig.Benchmarks/UglyToad.PdfPig.Benchmarks.csproj @@ -0,0 +1,31 @@ + + + + Exe + net8.0 + enable + enable + + Local + + + + + + + + + + + + + + + + + + Always + + + + \ No newline at end of file diff --git a/tools/UglyToad.PdfPig.Benchmarks/UglyToad.PdfPig.Benchmarks.slnx b/tools/UglyToad.PdfPig.Benchmarks/UglyToad.PdfPig.Benchmarks.slnx new file mode 100644 index 000000000..d357ab7a2 --- /dev/null +++ b/tools/UglyToad.PdfPig.Benchmarks/UglyToad.PdfPig.Benchmarks.slnx @@ -0,0 +1,6 @@ + + + + + + diff --git a/tools/UglyToad.PdfPig.Benchmarks/fseprd1102849.pdf b/tools/UglyToad.PdfPig.Benchmarks/fseprd1102849.pdf new file mode 100644 index 000000000..a08d7ecc2 Binary files /dev/null and b/tools/UglyToad.PdfPig.Benchmarks/fseprd1102849.pdf differ