Skip to content

Commit f30fb2b

Browse files
committed
process shape partly out of page; ignore replacement character \ufffd; fix empty font name issue; #256
1 parent e317334 commit f30fb2b

File tree

3 files changed

+50
-48
lines changed

3 files changed

+50
-48
lines changed

pdf2docx/font/Fonts.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ def extract(cls, fitz_doc):
7070
fonts = []
7171
for xref in xrefs:
7272
basename, ext, _, buffer = fitz_doc.extract_font(xref)
73+
if not basename: continue
74+
7375
basename = decode(basename)
7476
name = cls._normalized_font_name(basename)
7577

pdf2docx/shape/Shapes.py

Lines changed: 40 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
# -*- coding: utf-8 -*-
2-
3-
'''A group of ``Shape`` instances.
4-
'''
1+
'''A group of ``Shape`` instances.'''
52

63
from .Shape import Shape, Stroke, Fill, Hyperlink
74
from ..common.share import RectType
@@ -26,30 +23,24 @@ def restore(self, raws:list):
2623
shape = Fill(raw)
2724
# add to list
2825
self.append(shape)
29-
3026
return self
3127

3228

33-
def _update_bbox(self, shape:Shape):
29+
def _update_bbox(self, e:Shape):
3430
''' override. Do nothing.'''
35-
pass
3631

3732

3833
@property
3934
def strokes(self):
40-
''' Stroke Shapes, including table border, text underline and strike-through.
41-
Cache it once calculated since it doesn't change generally.
42-
'''
35+
''' Stroke Shapes, including table border, text underline and strike-through.'''
4336
instances = list(filter(
4437
lambda shape: isinstance(shape, Stroke), self._instances))
4538
return Shapes(instances)
4639

4740

4841
@property
4942
def fillings(self):
50-
''' Fill Shapes, including cell shading and highlight.
51-
Cache it once calculated since it doesn't change generally.
52-
'''
43+
''' Fill Shapes, including cell shading and highlight.'''
5344
# white bg-color is by default, so ignore those fillings
5445
instances = list(filter(
5546
lambda shape: isinstance(shape, Fill) and \
@@ -72,21 +63,24 @@ def table_strokes(self):
7263
lambda shape: shape.has_potential_type(RectType.BORDER), self._instances))
7364
return ElementCollection(instances)
7465

75-
66+
7667
@property
7768
def table_fillings(self):
7869
'''Potential table shadings.'''
7970
instances = list(filter(
8071
lambda shape: shape.has_potential_type(RectType.SHADING), self._instances))
8172
return ElementCollection(instances)
82-
73+
74+
8375
@property
8476
def text_style_shapes(self):
85-
'''Potential text style based shapes, e.g. underline, strike-through, highlight and hyperlink.'''
86-
f = lambda shape: shape.has_potential_type(RectType.HIGHLIGHT) or \
87-
shape.has_potential_type(RectType.UNDERLINE) or \
88-
shape.has_potential_type(RectType.STRIKE) or \
89-
shape.has_potential_type(RectType.HYPERLINK)
77+
'''Potential text style based shapes,
78+
e.g. underline, strike-through, highlight and hyperlink.'''
79+
def f(shape):
80+
return shape.has_potential_type(RectType.HIGHLIGHT) or \
81+
shape.has_potential_type(RectType.UNDERLINE) or \
82+
shape.has_potential_type(RectType.STRIKE) or \
83+
shape.has_potential_type(RectType.HYPERLINK)
9084
instances = set(filter(f, self._instances))
9185
return ElementCollection(instances)
9286

@@ -101,19 +95,24 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float):
10195
10296
Args:
10397
max_border_width (float): The max border width.
104-
shape_min_dimension (float): Ignore shape if both width and height is lower than this value.
98+
shape_min_dimension (float): Ignore shape if both width and height
99+
are lower than this value.
105100
"""
106101
if not self._instances: return
107102

108-
# remove small shapes or shapes out of page
103+
# remove small shapes or shapes out of page; and
104+
# update bbox in case part of the shape is out of page
109105
page_bbox = self.parent.bbox
110-
f = lambda shape: shape.bbox.intersects(page_bbox) and \
111-
max(shape.bbox.width, shape.bbox.height)>=shape_min_dimension
112-
cleaned_shapes = list(filter(f, self._instances)) # type: list[Shape]
106+
cleaned_shapes = [] # type: list[Shape]
107+
for s in self:
108+
if max(s.bbox.width, s.bbox.height)<shape_min_dimension: continue # small shapes
109+
bbox_in_page = s.bbox.intersect(page_bbox)
110+
if bbox_in_page.is_empty: continue # shapes out of page
111+
cleaned_shapes.append(s.update_bbox(bbox_in_page)) # ignore out of page part
113112

114113
# merge normal shapes if same filling color
115114
merged_shapes = self._merge_shapes(cleaned_shapes)
116-
115+
117116
# convert Fill instance to Stroke if looks like stroke
118117
shapes = []
119118
for shape in merged_shapes:
@@ -126,7 +125,7 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float):
126125

127126
# detect semantic type
128127
self._parse_semantic_type()
129-
128+
130129

131130
def assign_to_tables(self, tables:list):
132131
"""Add Shape to associated cells of given tables.
@@ -136,7 +135,7 @@ def assign_to_tables(self, tables:list):
136135
"""
137136
if not tables: return
138137

139-
# assign shapes to table region
138+
# assign shapes to table region
140139
shapes_in_tables = [[] for _ in tables] # type: list[list[Shape]]
141140
shapes = [] # type: list[Shape]
142141
for shape in self._instances:
@@ -154,7 +153,7 @@ def assign_to_tables(self, tables:list):
154153
# not possible in current table, then check next table
155154
elif not table.bbox.intersects(shape.bbox):
156155
continue
157-
156+
158157
# Now, this shape belongs to previous layout
159158
else:
160159
shapes.append(shape)
@@ -169,10 +168,11 @@ def assign_to_tables(self, tables:list):
169168

170169

171170
def plot(self, page):
172-
'''Plot shapes for debug purpose. Different colors are used to display the shapes in detected
173-
semantic types, e.g. yellow for text based shape (stroke, underline and highlight). Due to
174-
overlaps between Stroke and Fill related groups, some shapes are plot twice.
175-
171+
'''Plot shapes for debug purpose.
172+
Different colors are used to display the shapes in detected semantic types, e.g.
173+
yellow for text based shape (stroke, underline and highlight). Due to overlaps
174+
between Stroke and Fill related groups, some shapes are plotted twice.
175+
176176
Args:
177177
page (fitz.Page): pdf page.
178178
'''
@@ -201,10 +201,10 @@ def _merge_shapes(shapes):
201201
# shapes excluding hyperlink first
202202
normal_shapes = list(filter(
203203
lambda shape: not shape.is_determined, shapes))
204-
204+
205205
# group by color and connectivity (with margin considered)
206-
f = lambda a, b: \
207-
a.color==b.color and a.bbox.intersects(b.get_expand_bbox(constants.TINY_DIST))
206+
def f(a, b):
207+
return a.color==b.color and a.bbox.intersects(b.get_expand_bbox(constants.TINY_DIST))
208208
groups = Collection(normal_shapes).group(f)
209209

210210
merged_shapes = []
@@ -215,22 +215,21 @@ def _merge_shapes(shapes):
215215
merged_shapes.append(group[0].update_bbox(group.bbox))
216216
else:
217217
merged_shapes.extend(group)
218-
218+
219219
# add hyperlinks back
220220
hyperlinks = filter(lambda shape: shape.equal_to_type(RectType.HYPERLINK), shapes)
221221
merged_shapes.extend(hyperlinks)
222-
223222
return merged_shapes
224223

225224

226225
def _parse_semantic_type(self):
227-
''' Detect shape type based on the position to text blocks.
226+
''' Detect shape type based on the position to text blocks.
228227
229228
.. note::
230-
Stroke shapes are grouped on connectivity to each other, but in some cases,
229+
Stroke shapes are grouped on connectivity to each other, but in some cases,
231230
the gap between borders and underlines/strikes are very close, which leads
232231
to an incorrect table structure. So, it's required to distinguish them in
233-
advance, though we needn't to ensure 100% accuracy. They are finally determined
232+
advance, though we needn't ensure 100% accuracy. They are finally determined
234233
when parsing table structure and text format.
235234
'''
236235
# blocks in page (the original blocks without any further processing)
@@ -240,4 +239,3 @@ def _parse_semantic_type(self):
240239
# check positions between shapes and text blocks
241240
for shape in self._instances:
242241
shape.parse_semantic_type(blocks)
243-

pdf2docx/text/TextSpan.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ def __init__(self, raw:dict=None):
4747

4848
# filter empty chars
4949
chars = [Char(c) for c in raw.get('chars', [])] # type: list[Char]
50-
self.chars = [char for char in chars if char.c!='']
50+
# ignore replacement character, see issue#256
51+
self.chars = [char for char in chars if char.c not in ('', '\ufffd')]
5152
self._text = raw.get('text', '') # not an original key from PyMuPDF
5253

5354
# font metrics
@@ -85,7 +86,7 @@ def text(self):
8586
def text(self, value):
8687
'''Set span text directly in case no chars are stored, e.g. restored from json.'''
8788
self._text = value
88-
89+
8990
def cal_bbox(self):
9091
'''Calculate bbox based on contained instances.'''
9192
bbox = fitz.Rect()
@@ -306,7 +307,8 @@ def _parse_text_format(self, rect:Shape, horizontal:bool=True):
306307
# highlight: both the rect height and overlap must be large enough
307308
if h_rect >= 0.5*h_span:
308309
# In general, highlight color isn't white
309-
if rect.color != rgb_value((1,1,1)) and self.get_main_bbox(rect, constants.FACTOR_MAJOR):
310+
if rect.color != rgb_value((1,1,1)) and \
311+
self.get_main_bbox(rect, constants.FACTOR_MAJOR):
310312
rect.type = RectType.HIGHLIGHT
311313

312314
# near to bottom of span? yes, underline
@@ -400,7 +402,7 @@ def _set_text_format(self, docx_run):
400402
# font name
401403
font_name = self.font
402404
docx_run.font.name = font_name
403-
docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # set font for chinese characters
405+
docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # for CJK characters
404406
docx_run.font.color.rgb = RGBColor(*rgb_component(self.color))
405407

406408
# font size
@@ -419,8 +421,8 @@ def _set_text_format(self, docx_run):
419421
for style in self.style:
420422

421423
t = style['type']
422-
# Built-in method is provided to set highlight in python-docx, but supports only limited colors;
423-
# so, set character shading instead if out of highlight color scope
424+
# Built-in method is provided to set highlight in python-docx, but supports only
425+
# limited colors; so, set character shading instead if out of highlight color scope.
424426
if t==RectType.HIGHLIGHT.value:
425427
docx.set_char_shading(docx_run, style['color'])
426428

0 commit comments

Comments
 (0)