clip shapes that are partly out of page to the page bbox; ignore replacement character \ufffd; skip fonts with an empty name; #256

dothinking · dothinking · commit f30fb2bbbd90 · 2024-01-23T01:15:43.000+08:00
diff --git a/pdf2docx/font/Fonts.py b/pdf2docx/font/Fonts.py
@@ -70,6 +70,8 @@ def extract(cls, fitz_doc):
         fonts = []
         for xref in xrefs:
             basename, ext, _, buffer = fitz_doc.extract_font(xref)
+            if not basename: continue
+
             basename = decode(basename)
             name = cls._normalized_font_name(basename)
 
diff --git a/pdf2docx/shape/Shapes.py b/pdf2docx/shape/Shapes.py
@@ -1,7 +1,4 @@
-# -*- coding: utf-8 -*-
-
-'''A group of ``Shape`` instances.
-'''
+'''A group of ``Shape`` instances.'''
 
 from .Shape import Shape, Stroke, Fill, Hyperlink
 from ..common.share import RectType
@@ -26,30 +23,24 @@ def restore(self, raws:list):
                 shape = Fill(raw)
             # add to list
             self.append(shape)
-        
         return self
 
 
-    def _update_bbox(self, shape:Shape):
+    def _update_bbox(self, e:Shape):
         ''' override. Do nothing.'''
-        pass
 
 
     @property
     def strokes(self):
-        ''' Stroke Shapes, including table border, text underline and strike-through. 
-            Cache it once calculated since it doesn't change generally.
-        '''
+        ''' Stroke Shapes, including table border, text underline and strike-through.'''
         instances = list(filter(
             lambda shape: isinstance(shape, Stroke), self._instances))
         return Shapes(instances)
 
 
     @property
     def fillings(self):
-        ''' Fill Shapes, including cell shading and highlight. 
-            Cache it once calculated since it doesn't change generally.
-        '''
+        ''' Fill Shapes, including cell shading and highlight.'''
         # white bg-color is by default, so ignore those fillings
         instances = list(filter(
             lambda shape: isinstance(shape, Fill) and \
@@ -72,21 +63,24 @@ def table_strokes(self):
             lambda shape: shape.has_potential_type(RectType.BORDER), self._instances))
         return ElementCollection(instances)
 
-    
+
     @property
     def table_fillings(self):
         '''Potential table shadings.'''
         instances = list(filter(
             lambda shape: shape.has_potential_type(RectType.SHADING), self._instances))
         return ElementCollection(instances)
-    
+
+
     @property
     def text_style_shapes(self):
-        '''Potential text style based shapes, e.g. underline, strike-through, highlight and hyperlink.'''
-        f = lambda shape: shape.has_potential_type(RectType.HIGHLIGHT) or \
-                            shape.has_potential_type(RectType.UNDERLINE) or \
-                            shape.has_potential_type(RectType.STRIKE) or \
-                            shape.has_potential_type(RectType.HYPERLINK)
+        '''Potential text style based shapes,
+        e.g. underline, strike-through, highlight and hyperlink.'''
+        def f(shape):
+            return shape.has_potential_type(RectType.HIGHLIGHT) or \
+                    shape.has_potential_type(RectType.UNDERLINE) or \
+                    shape.has_potential_type(RectType.STRIKE) or \
+                    shape.has_potential_type(RectType.HYPERLINK)
         instances = set(filter(f, self._instances))
         return ElementCollection(instances)
 
@@ -101,19 +95,24 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float):
 
         Args:
             max_border_width (float): The max border width.
-            shape_min_dimension (float): Ignore shape if both width and height is lower than this value.
+            shape_min_dimension (float): Ignore shape if both width and height
+                is lower than this value.
         """
         if not self._instances: return
 
-        # remove small shapes or shapes out of page
+        # remove small shapes and shapes entirely out of page;
+        # clip the bbox of shapes that are only partly inside the page
         page_bbox = self.parent.bbox
-        f = lambda shape: shape.bbox.intersects(page_bbox) and \
-                        max(shape.bbox.width, shape.bbox.height)>=shape_min_dimension
-        cleaned_shapes = list(filter(f, self._instances)) # type: list[Shape]
+        cleaned_shapes = [] # type: list[Shape]
+        for s in self:
+            if max(s.bbox.width, s.bbox.height)<shape_min_dimension: continue # small shapes
+            bbox_in_page = s.bbox.intersect(page_bbox)
+            if bbox_in_page.is_empty: continue # shapes out of page
+            cleaned_shapes.append(s.update_bbox(bbox_in_page)) # ignore out of page part
 
         # merge normal shapes if same filling color
         merged_shapes = self._merge_shapes(cleaned_shapes)
-                
+
         # convert Fill instance to Stroke if looks like stroke
         shapes = []
         for shape in merged_shapes:
@@ -126,7 +125,7 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float):
 
         # detect semantic type
         self._parse_semantic_type()
-    
+
 
     def assign_to_tables(self, tables:list):
         """Add Shape to associated cells of given tables.
@@ -136,7 +135,7 @@ def assign_to_tables(self, tables:list):
         """
         if not tables: return
 
-        # assign shapes to table region        
+        # assign shapes to table region
         shapes_in_tables = [[] for _ in tables] # type: list[list[Shape]]
         shapes = []   # type: list[Shape]
         for shape in self._instances:
@@ -154,7 +153,7 @@ def assign_to_tables(self, tables:list):
                 # not possible in current table, then check next table
                 elif not table.bbox.intersects(shape.bbox):
                     continue
-            
+
             # Now, this shape belongs to previous layout
             else:
                 shapes.append(shape)
@@ -169,10 +168,11 @@ def assign_to_tables(self, tables:list):
 
 
     def plot(self, page):
-        '''Plot shapes for debug purpose. Different colors are used to display the shapes in detected 
-        semantic types, e.g. yellow for text based shape (stroke, underline and highlight). Due to 
-        overlaps between Stroke and Fill related groups, some shapes are plot twice.
-        
+        '''Plot shapes for debug purpose.
+        Different colors are used to display the shapes in detected semantic types, e.g.
+        yellow for text based shape (stroke, underline and highlight). Due to overlaps
+        between Stroke and Fill related groups, some shapes are plot twice.
+
         Args:
             page (fitz.Page): pdf page.
         '''
@@ -201,10 +201,10 @@ def _merge_shapes(shapes):
         # shapes excluding hyperlink first
         normal_shapes = list(filter(
             lambda shape: not shape.is_determined, shapes))
-        
+
         # group by color and connectivity (with margin considered)
-        f = lambda a, b: \
-            a.color==b.color and a.bbox.intersects(b.get_expand_bbox(constants.TINY_DIST))
+        def f(a, b):
+            return a.color==b.color and a.bbox.intersects(b.get_expand_bbox(constants.TINY_DIST))
         groups = Collection(normal_shapes).group(f)
 
         merged_shapes = []
@@ -215,22 +215,21 @@ def _merge_shapes(shapes):
                 merged_shapes.append(group[0].update_bbox(group.bbox))
             else:
                 merged_shapes.extend(group)
-        
+
         # add hyperlinks back
         hyperlinks = filter(lambda shape: shape.equal_to_type(RectType.HYPERLINK), shapes)
         merged_shapes.extend(hyperlinks)
-        
         return merged_shapes
 
 
     def _parse_semantic_type(self):
-        ''' Detect shape type based on the position to text blocks. 
+        ''' Detect shape type based on the position to text blocks.
 
         .. note::
-            Stroke shapes are grouped on connectivity to each other, but in some cases, 
+            Stroke shapes are grouped on connectivity to each other, but in some cases,
             the gap between borders and underlines/strikes are very close, which leads
             to an incorrect table structure. So, it's required to distinguish them in
-            advance, though we needn't to ensure 100% accuracy. They are finally determined 
+            advance, though we needn't ensure 100% accuracy. They are finally determined
             when parsing table structure and text format.
         '''
         # blocks in page (the original blocks without any further processing)
@@ -240,4 +239,3 @@ def _parse_semantic_type(self):
         # check positions between shapes and text blocks
         for shape in self._instances:
             shape.parse_semantic_type(blocks)
-
diff --git a/pdf2docx/text/TextSpan.py b/pdf2docx/text/TextSpan.py
@@ -47,7 +47,8 @@ def __init__(self, raw:dict=None):
 
         # filter empty chars
         chars = [Char(c) for c in raw.get('chars', [])] # type: list[Char]
-        self.chars = [char for char in chars if char.c!='']
+        # ignore replacement character, see issue#256
+        self.chars = [char for char in chars if char.c not in ('', '\ufffd')]
         self._text = raw.get('text', '') # not an original key from PyMuPDF
 
         # font metrics
@@ -85,7 +86,7 @@ def text(self):
     def text(self, value):
         '''Set span text directly in case no chars are stores, e.g. restored from json.'''
         self._text = value
-    
+
     def cal_bbox(self):
         '''Calculate bbox based on contained instances.'''
         bbox = fitz.Rect()
@@ -306,7 +307,8 @@ def _parse_text_format(self, rect:Shape, horizontal:bool=True):
         # highlight: both the rect height and overlap must be large enough
         if h_rect >= 0.5*h_span:
             # In general, highlight color isn't white
-            if rect.color != rgb_value((1,1,1)) and self.get_main_bbox(rect, constants.FACTOR_MAJOR):
+            if rect.color != rgb_value((1,1,1)) and \
+                self.get_main_bbox(rect, constants.FACTOR_MAJOR):
                 rect.type = RectType.HIGHLIGHT
 
         # near to bottom of span? yes, underline
@@ -400,7 +402,7 @@ def _set_text_format(self, docx_run):
         # font name
         font_name = self.font
         docx_run.font.name = font_name
-        docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # set font for chinese characters
+        docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # for CJK characters
         docx_run.font.color.rgb = RGBColor(*rgb_component(self.color))
 
         # font size
@@ -419,8 +421,8 @@ def _set_text_format(self, docx_run):
         for style in self.style:
 
             t = style['type']
-            # Built-in method is provided to set highlight in python-docx, but supports only limited colors;
-            # so, set character shading instead if out of highlight color scope
+            # Built-in method is provided to set highlight in python-docx, but supports only
+            # limited colors; so, set character shading instead if out of highlight color scope.
             if t==RectType.HIGHLIGHT.value:
                 docx.set_char_shading(docx_run, style['color'])