Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,15 @@ public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T
* (i,j,k) will form a group and (m,n) will form another group.
*************************************************************************************/

int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
int[] indexes = new int[elements.Count];
#if NET6_0_OR_GREATER
Array.Fill(indexes, -1);
#else
for (int k = 0; k < indexes.Length; k++)
{
indexes[k] = -1;
}
#endif
KdTree<T> kdTree = new KdTree<T>(elements, candidatesPoint);

ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
Expand Down Expand Up @@ -118,7 +126,15 @@ public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T
* (i,j,k) will form a group and (m,n) will form another group.
*************************************************************************************/

int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
int[] indexes = new int[elements.Count];
#if NET6_0_OR_GREATER
Array.Fill(indexes, -1);
#else
for (int l = 0; l < indexes.Length; l++)
{
indexes[l] = -1;
}
#endif
KdTree<T> kdTree = new KdTree<T>(elements, candidatesPoint);

ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
Expand Down Expand Up @@ -186,7 +202,15 @@ public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T
* (i,j,k) will form a group and (m,n) will form another group.
*************************************************************************************/

int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
int[] indexes = new int[elements.Count];
#if NET6_0_OR_GREATER
Array.Fill(indexes, -1);
#else
for (int k = 0; k < indexes.Length; k++)
{
indexes[k] = -1;
}
#endif

ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };

Expand Down
7 changes: 2 additions & 5 deletions src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
using Core;
using System;
using System.Collections.Generic;
using System.Linq;

/// <summary>
/// Contains helpful tools for distance measures.
Expand Down Expand Up @@ -169,12 +168,11 @@ public static int FindIndexNearest<T>(T element, IReadOnlyList<T> candidates,

distance = double.MaxValue;
int closestPointIndex = -1;
var candidatesPoints = candidates.Select(candidatePoint).ToList();
var pivot = pivotPoint(element);

for (var i = 0; i < candidates.Count; i++)
{
double currentDistance = distanceMeasure(pivot, candidatesPoints[i]);
double currentDistance = distanceMeasure(pivot, candidatePoint(candidates[i]));
if (currentDistance < distance && !candidates[i].Equals(element))
{
distance = currentDistance;
Expand Down Expand Up @@ -211,12 +209,11 @@ public static int FindIndexNearest<T>(T element, IReadOnlyList<T> candidates,

distance = double.MaxValue;
int closestLineIndex = -1;
var candidatesLines = candidates.Select(candidateLine).ToList();
var pivot = pivotLine(element);

for (var i = 0; i < candidates.Count; i++)
{
double currentDistance = distanceMeasure(pivot, candidatesLines[i]);
double currentDistance = distanceMeasure(pivot, candidateLine(candidates[i]));
if (currentDistance < distance && !candidates[i].Equals(element))
{
distance = currentDistance;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.PdfFonts;

/// <summary>
/// Checks whether each letter is a duplicate that overlaps another letter, removes the duplicate, and flags the remaining letter as bold.
Expand All @@ -24,41 +23,51 @@ public static IReadOnlyList<Letter> Get(IEnumerable<Letter> letters)
return letters?.ToList();
}

var queue = new Queue<Letter>(letters);
var cleanLetters = new List<Letter>() { queue.Dequeue() }; // dequeue the first letter
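// Use a dictionary keyed by (Value, FontName) so the candidate duplicates for a letter are fetched with a single hash lookup; only that (usually short) candidate list is then scanned for an overlap.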
var duplicateIndex = new Dictionary<(string, string), List<int>>();
var cleanLetters = new List<Letter>();

while (queue.Count > 0)
foreach (var letter in letters)
{
var letter = queue.Dequeue();
bool addLetter = true;
int duplicatesOverlappingIndex = -1;

var duplicates = cleanLetters.Where(l => l.Value.Equals(letter.Value) && l.FontName.Equals(letter.FontName)); // do other checks?

if (duplicates.Any())
var key = (letter.Value, letter.FontName);
if (duplicateIndex.TryGetValue(key, out var candidateIndices))
{
double tolerance = letter.GlyphRectangle.Width / (letter.Value.Length == 0 ? 1 : letter.Value.Length) / 3.0;
double minX = letter.GlyphRectangle.BottomLeft.X - tolerance;
double maxX = letter.GlyphRectangle.BottomLeft.X + tolerance;
double minY = letter.GlyphRectangle.BottomLeft.Y - tolerance;
double maxY = letter.GlyphRectangle.BottomLeft.Y + tolerance;

var duplicatesOverlapping = duplicates.FirstOrDefault(l => minX <= l.GlyphRectangle.BottomLeft.X &&
maxX >= l.GlyphRectangle.BottomLeft.X &&
minY <= l.GlyphRectangle.BottomLeft.Y &&
maxY >= l.GlyphRectangle.BottomLeft.Y);

if (duplicatesOverlapping != default)
for (int ci = 0; ci < candidateIndices.Count; ci++)
{
// duplicate overlapping letter was found, keeping the existing one and not adding this one.
addLetter = false;
duplicatesOverlappingIndex = cleanLetters.IndexOf(duplicatesOverlapping);
int idx = candidateIndices[ci];
var l = cleanLetters[idx];
if (minX <= l.GlyphRectangle.BottomLeft.X &&
maxX >= l.GlyphRectangle.BottomLeft.X &&
minY <= l.GlyphRectangle.BottomLeft.Y &&
maxY >= l.GlyphRectangle.BottomLeft.Y)
{
addLetter = false;
duplicatesOverlappingIndex = idx;
break;
}
}
}

if (addLetter)
{
int newIndex = cleanLetters.Count;
cleanLetters.Add(letter);

if (!duplicateIndex.TryGetValue(key, out var list))
{
list = new List<int>();
duplicateIndex[key] = list;
}
list.Add(newIndex);
}
else if (duplicatesOverlappingIndex != -1)
{
Expand Down
Loading
Loading