1- # -*- coding: utf-8 -*-
2-
3- '''A group of ``Shape`` instances.
4- '''
1+ '''A group of ``Shape`` instances.'''
52
63from .Shape import Shape , Stroke , Fill , Hyperlink
74from ..common .share import RectType
@@ -26,30 +23,24 @@ def restore(self, raws:list):
2623 shape = Fill (raw )
2724 # add to list
2825 self .append (shape )
29-
3026 return self
3127
3228
33- def _update_bbox (self , shape :Shape ):
29+ def _update_bbox (self , e :Shape ):
3430 ''' override. Do nothing.'''
35- pass
3631
3732
3833 @property
3934 def strokes (self ):
40- ''' Stroke Shapes, including table border, text underline and strike-through.
41- Cache it once calculated since it doesn't change generally.
42- '''
35+ ''' Stroke Shapes, including table border, text underline and strike-through.'''
4336 instances = list (filter (
4437 lambda shape : isinstance (shape , Stroke ), self ._instances ))
4538 return Shapes (instances )
4639
4740
4841 @property
4942 def fillings (self ):
50- ''' Fill Shapes, including cell shading and highlight.
51- Cache it once calculated since it doesn't change generally.
52- '''
43+ ''' Fill Shapes, including cell shading and highlight.'''
5344 # white bg-color is by default, so ignore those fillings
5445 instances = list (filter (
5546 lambda shape : isinstance (shape , Fill ) and \
@@ -72,21 +63,24 @@ def table_strokes(self):
7263 lambda shape : shape .has_potential_type (RectType .BORDER ), self ._instances ))
7364 return ElementCollection (instances )
7465
75-
66+
7667 @property
7768 def table_fillings (self ):
7869 '''Potential table shadings.'''
7970 instances = list (filter (
8071 lambda shape : shape .has_potential_type (RectType .SHADING ), self ._instances ))
8172 return ElementCollection (instances )
82-
73+
74+
8375 @property
8476 def text_style_shapes (self ):
85- '''Potential text style based shapes, e.g. underline, strike-through, highlight and hyperlink.'''
86- f = lambda shape : shape .has_potential_type (RectType .HIGHLIGHT ) or \
87- shape .has_potential_type (RectType .UNDERLINE ) or \
88- shape .has_potential_type (RectType .STRIKE ) or \
89- shape .has_potential_type (RectType .HYPERLINK )
77+ '''Potential text style based shapes,
78+ e.g. underline, strike-through, highlight and hyperlink.'''
79+ def f (shape ):
80+ return shape .has_potential_type (RectType .HIGHLIGHT ) or \
81+ shape .has_potential_type (RectType .UNDERLINE ) or \
82+ shape .has_potential_type (RectType .STRIKE ) or \
83+ shape .has_potential_type (RectType .HYPERLINK )
9084 instances = set (filter (f , self ._instances ))
9185 return ElementCollection (instances )
9286
@@ -101,19 +95,24 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float):
10195
10296 Args:
10397 max_border_width (float): The max border width.
104- shape_min_dimension (float): Ignore shape if both width and height is lower than this value.
98+ shape_min_dimension (float): Ignore shape if both width and height
99+ are lower than this value.
105100 """
106101 if not self ._instances : return
107102
108- # remove small shapes or shapes out of page
103+ # remove small shapes or shapes out of page; and
104+ # update bbox in case part of the shape is out of page
109105 page_bbox = self .parent .bbox
110- f = lambda shape : shape .bbox .intersects (page_bbox ) and \
111- max (shape .bbox .width , shape .bbox .height )>= shape_min_dimension
112- cleaned_shapes = list (filter (f , self ._instances )) # type: list[Shape]
106+ cleaned_shapes = [] # type: list[Shape]
107+ for s in self :
108+ if max (s .bbox .width , s .bbox .height )< shape_min_dimension : continue # small shapes
109+ bbox_in_page = s .bbox .intersect (page_bbox )
110+ if bbox_in_page .is_empty : continue # shapes out of page
111+ cleaned_shapes .append (s .update_bbox (bbox_in_page )) # ignore out of page part
113112
114113 # merge normal shapes if same filling color
115114 merged_shapes = self ._merge_shapes (cleaned_shapes )
116-
115+
117116 # convert Fill instance to Stroke if looks like stroke
118117 shapes = []
119118 for shape in merged_shapes :
@@ -126,7 +125,7 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float):
126125
127126 # detect semantic type
128127 self ._parse_semantic_type ()
129-
128+
130129
131130 def assign_to_tables (self , tables :list ):
132131 """Add Shape to associated cells of given tables.
@@ -136,7 +135,7 @@ def assign_to_tables(self, tables:list):
136135 """
137136 if not tables : return
138137
139- # assign shapes to table region
138+ # assign shapes to table region
140139 shapes_in_tables = [[] for _ in tables ] # type: list[list[Shape]]
141140 shapes = [] # type: list[Shape]
142141 for shape in self ._instances :
@@ -154,7 +153,7 @@ def assign_to_tables(self, tables:list):
154153 # not possible in current table, then check next table
155154 elif not table .bbox .intersects (shape .bbox ):
156155 continue
157-
156+
158157 # Now, this shape belongs to previous layout
159158 else :
160159 shapes .append (shape )
@@ -169,10 +168,11 @@ def assign_to_tables(self, tables:list):
169168
170169
171170 def plot (self , page ):
172- '''Plot shapes for debug purpose. Different colors are used to display the shapes in detected
173- semantic types, e.g. yellow for text based shape (stroke, underline and highlight). Due to
174- overlaps between Stroke and Fill related groups, some shapes are plot twice.
175-
171+ '''Plot shapes for debug purpose.
172+ Different colors are used to display the shapes in detected semantic types, e.g.
173+ yellow for text based shape (stroke, underline and highlight). Due to overlaps
174+ between Stroke and Fill related groups, some shapes are plotted twice.
175+
176176 Args:
177177 page (fitz.Page): pdf page.
178178 '''
@@ -201,10 +201,10 @@ def _merge_shapes(shapes):
201201 # shapes excluding hyperlink first
202202 normal_shapes = list (filter (
203203 lambda shape : not shape .is_determined , shapes ))
204-
204+
205205 # group by color and connectivity (with margin considered)
206- f = lambda a , b : \
207- a .color == b .color and a .bbox .intersects (b .get_expand_bbox (constants .TINY_DIST ))
206+ def f ( a , b ):
207+ return a .color == b .color and a .bbox .intersects (b .get_expand_bbox (constants .TINY_DIST ))
208208 groups = Collection (normal_shapes ).group (f )
209209
210210 merged_shapes = []
@@ -215,22 +215,21 @@ def _merge_shapes(shapes):
215215 merged_shapes .append (group [0 ].update_bbox (group .bbox ))
216216 else :
217217 merged_shapes .extend (group )
218-
218+
219219 # add hyperlinks back
220220 hyperlinks = filter (lambda shape : shape .equal_to_type (RectType .HYPERLINK ), shapes )
221221 merged_shapes .extend (hyperlinks )
222-
223222 return merged_shapes
224223
225224
226225 def _parse_semantic_type (self ):
227- ''' Detect shape type based on the position to text blocks.
226+ ''' Detect shape type based on its position relative to text blocks.
228227
229228 .. note::
230- Stroke shapes are grouped on connectivity to each other, but in some cases,
229+ Stroke shapes are grouped on connectivity to each other, but in some cases,
231230 the gap between borders and underlines/strikes is very small, which leads
232231 to an incorrect table structure. So, it's required to distinguish them in
233- advance, though we needn't to ensure 100% accuracy. They are finally determined
232+ advance, though we needn't ensure 100% accuracy. They are finally determined
234233 when parsing table structure and text format.
235234 '''
236235 # blocks in page (the original blocks without any further processing)
@@ -240,4 +239,3 @@ def _parse_semantic_type(self):
240239 # check positions between shapes and text blocks
241240 for shape in self ._instances :
242241 shape .parse_semantic_type (blocks )
243-
0 commit comments