Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file.
- Support encrypted dataset training (<https://github.com/openvinotoolkit/training_extensions/pull/2209>)
- Add custom max iou assigner to prevent CPU OOM when large annotations are used (<https://github.com/openvinotoolkit/training_extensions/pull/2228>)
- Auto train type detection for Semi-SL, Self-SL and Incremental: "--train-type" now is optional (https://github.com/openvinotoolkit/training_extensions/pull/2195)
- Add per-class XAI saliency maps for Mask R-CNN model (https://github.com/openvinotoolkit/training_extensions/pull/2227)
- Add new object detector Deformable DETR (<https://github.com/openvinotoolkit/training_extensions/pull/2249>)
- Add new object detector DINO (<https://github.com/openvinotoolkit/training_extensions/pull/2266>)
- Add new visual prompting task (https://github.com/openvinotoolkit/training_extensions/pull/2203)
Expand All @@ -20,6 +21,7 @@ All notable changes to this project will be documented in this file.
- Set persistent_workers and pin_memory as True in detection task (<https://github.com/openvinotoolkit/training_extensions/pull/2224>)
- New algorithm for Semi-SL semantic segmentation based on metric learning via class prototypes (https://github.com/openvinotoolkit/training_extensions/pull/2156)
- Self-SL for classification now can receive just a folder with any images to start contrastive pretraining (https://github.com/openvinotoolkit/training_extensions/pull/2219)
- Improve XAI saliency map generation for tiling detection and tiling instance segmentation (https://github.com/openvinotoolkit/training_extensions/pull/2240)

### Bug fixes

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,15 @@ class BaseRecordingForwardHook(ABC):
module (torch.nn.Module): The PyTorch module to be registered in forward pass
fpn_idx (int, optional): The layer index to be processed if the model is a FPN.
Defaults to 0 which uses the largest feature map from FPN.
normalize (bool): Whether to normalize the resulting saliency maps.
"""

def __init__(self, module: torch.nn.Module, fpn_idx: int = -1) -> None:
def __init__(self, module: torch.nn.Module, fpn_idx: int = -1, normalize: bool = True) -> None:
self._module = module
self._handle = None
self._records: List[torch.Tensor] = []
self._fpn_idx = fpn_idx
self._norm_saliency_maps = normalize

@property
def records(self):
Expand Down Expand Up @@ -97,12 +99,23 @@ def __exit__(self, exc_type, exc_value, traceback):
"""Exit."""
self._handle.remove()

def _normalize_map(self, saliency_maps: torch.Tensor) -> torch.Tensor:
"""Normalize saliency maps."""
max_values, _ = torch.max(saliency_maps, -1)
min_values, _ = torch.min(saliency_maps, -1)
if len(saliency_maps.shape) == 2:
saliency_maps = 255 * (saliency_maps - min_values[:, None]) / (max_values - min_values + 1e-12)[:, None]
else:
saliency_maps = (
255 * (saliency_maps - min_values[:, :, None]) / (max_values - min_values + 1e-12)[:, :, None]
)
return saliency_maps.to(torch.uint8)


class EigenCamHook(BaseRecordingForwardHook):
"""EigenCamHook."""

@staticmethod
def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int = -1) -> torch.Tensor:
def func(self, feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int = -1) -> torch.Tensor:
"""Generate the saliency map."""
if isinstance(feature_map, (list, tuple)):
feature_map = feature_map[fpn_idx]
Expand All @@ -112,20 +125,19 @@ def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int
reshaped_fmap = x.reshape((batch_size, channel, h * w)).transpose(1, 2)
reshaped_fmap = reshaped_fmap - reshaped_fmap.mean(1)[:, None, :]
_, _, vh = torch.linalg.svd(reshaped_fmap, full_matrices=True) # pylint: disable=invalid-name
saliency_map = (reshaped_fmap @ vh[:, 0][:, :, None]).squeeze(-1)
max_values, _ = torch.max(saliency_map, -1)
min_values, _ = torch.min(saliency_map, -1)
saliency_map = 255 * (saliency_map - min_values[:, None]) / ((max_values - min_values + 1e-12)[:, None])

if self._norm_saliency_maps:
saliency_map = (reshaped_fmap @ vh[:, 0][:, :, None]).squeeze(-1)
self._normalize_map(saliency_map)

saliency_map = saliency_map.reshape((batch_size, h, w))
saliency_map = saliency_map.to(torch.uint8)
return saliency_map


class ActivationMapHook(BaseRecordingForwardHook):
"""ActivationMapHook."""

@staticmethod
def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int = -1) -> torch.Tensor:
def func(self, feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int = -1) -> torch.Tensor:
"""Generate the saliency map by average feature maps then normalizing to (0, 255)."""
if isinstance(feature_map, (list, tuple)):
assert fpn_idx < len(
Expand All @@ -135,12 +147,12 @@ def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int

batch_size, _, h, w = feature_map.size()
activation_map = torch.mean(feature_map, dim=1)
activation_map = activation_map.reshape((batch_size, h * w))
max_values, _ = torch.max(activation_map, -1)
min_values, _ = torch.min(activation_map, -1)
activation_map = 255 * (activation_map - min_values[:, None]) / (max_values - min_values + 1e-12)[:, None]

if self._norm_saliency_maps:
activation_map = activation_map.reshape((batch_size, h * w))
activation_map = self._normalize_map(activation_map)

activation_map = activation_map.reshape((batch_size, h, w))
activation_map = activation_map.to(torch.uint8)
return activation_map


Expand Down Expand Up @@ -193,12 +205,11 @@ def func(self, feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx
mosaic_prediction = self._predict_from_feature_map(mosaic_feature_map)
saliency_maps[f] = mosaic_prediction.transpose(0, 1).reshape((self._num_classes, h, w))

saliency_maps = saliency_maps.reshape((batch_size, self._num_classes, h * w))
max_values, _ = torch.max(saliency_maps, -1)
min_values, _ = torch.min(saliency_maps, -1)
saliency_maps = 255 * (saliency_maps - min_values[:, :, None]) / (max_values - min_values + 1e-12)[:, :, None]
if self._norm_saliency_maps:
saliency_maps = saliency_maps.reshape((batch_size, self._num_classes, h * w))
saliency_maps = self._normalize_map(saliency_maps)

saliency_maps = saliency_maps.reshape((batch_size, self._num_classes, h, w))
saliency_maps = saliency_maps.to(torch.uint8)
return saliency_maps

def _predict_from_feature_map(self, x: torch.Tensor) -> torch.Tensor:
Expand Down
41 changes: 41 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,44 @@ def merge(self, results) -> Union[List[Tuple[np.ndarray, list]], List[np.ndarray
merged_results (list[list | tuple]): Merged results of the dataset.
"""
return self.tile_dataset.merge(results)

def merge_vectors(self, feature_vectors: List[np.ndarray], dump_vectors: bool) -> Union[np.ndarray, List[None]]:
    """Merge tile-level feature vectors into one image-level feature vector per image.

    Args:
        feature_vectors (list[np.ndarray]): tile-level feature vectors.
        dump_vectors (bool): whether feature vectors should be dumped.

    Returns:
        merged_vectors (np.ndarray | List[None]): Merged vector for each image,
            or one None placeholder per image when dumping is disabled.
    """
    if not dump_vectors:
        # Vectors were not requested: emit a placeholder for every image.
        return [None] * self.num_samples
    # Delegate the actual merging to the underlying tile dataset.
    return self.tile_dataset.merge_vectors(feature_vectors)

def merge_maps(self, saliency_maps: List, dump_maps: bool) -> List:
    """Merge tile-level saliency maps to image-level saliency map.

    Args:
        saliency_maps (list[list | np.ndarray]): tile-level saliency maps.
            Entries (or their nested per-class entries) may be None when
            no map was produced.
        dump_maps (bool): whether to dump saliency maps.

    Returns:
        merged_maps (List[list | np.ndarray | None]): Merged saliency map for each image.
    """

    if dump_maps:
        # Elementwise None-check over the (possibly nested) structure:
        # only merge if at least one tile actually produced a map.
        if not (np.array(saliency_maps) == None).all():  # noqa
            return self.tile_dataset.merge_maps(saliency_maps)
        else:
            # return None for each class for each image
            return saliency_maps[: self.num_samples]
    else:
        # Maps were not requested: emit a placeholder for every image.
        return [None] * self.num_samples

def __del__(self):
"""Delete the temporary directory when the object is deleted."""
if getattr(self, "tmp_dir", False):
self.tmp_dir.cleanup()
90 changes: 90 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/datasets/tiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
from time import time
from typing import Callable, Dict, List, Tuple, Union

import cv2
import numpy as np
from mmcv.ops import nms
from mmdet.core import BitmapMasks, bbox2result
from tqdm import tqdm

from otx.api.utils.dataset_utils import non_linear_normalization


def timeit(func) -> Callable:
"""Decorator to measure time of function execution.
Expand Down Expand Up @@ -471,3 +474,90 @@ def get_ann_info(self, idx):
if "gt_labels" in self.tiles[idx]:
ann["labels"] = self.tiles[idx]["gt_labels"]
return ann

def merge_vectors(self, feature_vectors: List[np.ndarray]) -> np.ndarray:
    """Merge tile-level feature vectors to image-level feature vector.

    The incoming list holds an equal number of consecutive vectors per
    image; each image's group of vectors is averaged into one vector.

    Args:
        feature_vectors (List[np.ndarray]): tile-level feature vectors.

    Returns:
        merged_vectors (List[np.ndarray]): Merged vectors for each image.
    """

    chunk_size = len(feature_vectors) // self.num_images
    # Group consecutive vectors that belong to the same image.
    grouped = []
    for start in range(0, len(feature_vectors), chunk_size):
        grouped.append(feature_vectors[start : start + chunk_size])
    # Average each image's group of tile vectors along the chunk axis.
    return np.average(grouped, axis=1)

def merge_maps(self, saliency_maps: Union[List[List[np.ndarray]], List[np.ndarray]]) -> List:
    """Merge tile-level saliency maps to image-level saliency map.

    The first ``self.num_images`` entries of ``saliency_maps`` are maps
    computed on whole (downscaled) images; the remaining entries are maps
    computed on tiles, matched positionally against ``self.tiles``.

    Args:
        saliency_maps (List[List[np.array] | np.ndarray]): tile-level saliency maps.
            Each map is a list of maps for each detected class or None if class wasn't detected.

    Returns:
        merged_maps (List[list | np.ndarray | None]): Merged saliency maps for each image.
    """
    merged_maps = []
    ratios = {}
    # NOTE(review): assumes saliency_maps[0][0] is a valid ndarray (first
    # image, first class not None) — confirm the caller guarantees this.
    num_classes = len(saliency_maps[0])
    feat_h, feat_w = saliency_maps[0][0].shape
    # dtype of one row of the first map (rows share the map's dtype).
    dtype = saliency_maps[0][0][0].dtype

    # Allocate a zeroed per-class canvas for every original image, sized by
    # the feature-map-to-tile scale ratio applied to the image dimensions.
    for orig_image in self.cached_results:
        img_idx = orig_image["index"]
        ratios[img_idx] = np.array([feat_h, feat_w]) / self.tile_size
        image_h, image_w = orig_image["height"], orig_image["width"]

        image_map_h = int(image_h * ratios[img_idx][0])
        image_map_w = int(image_w * ratios[img_idx][1])
        merged_maps.append([np.zeros((image_map_h, image_map_w)) for _ in range(num_classes)])

    # Paste every tile's per-class maps onto the matching image canvas.
    for map, tile in zip(saliency_maps[self.num_images :], self.tiles[self.num_images :]):
        for class_idx in range(num_classes):
            if map[class_idx] is None:
                continue
            cls_map = map[class_idx]
            img_idx = tile["dataset_idx"]
            x_1, y_1, x_2, y_2 = tile["tile_box"]
            # Convert the tile box from image pixels to feature-map coordinates.
            y_1, x_1 = ((y_1, x_1) * ratios[img_idx]).astype(np.uint16)
            y_2, x_2 = ((y_2, x_2) * ratios[img_idx]).astype(np.uint16)

            map_h, map_w = cls_map.shape
            # resize feature map if it got from the tile which width and height is less the tile_size
            if (map_h > y_2 - y_1 > 0) and (map_w > x_2 - x_1 > 0):
                cls_map = cv2.resize(cls_map, (x_2 - x_1, y_2 - y_1))
                # cut the rest of the feature map that went out of the image borders
                map_h, map_w = y_2 - y_1, x_2 - x_1

            for hi, wi in [(h_, w_) for h_ in range(map_h) for w_ in range(map_w)]:
                map_pixel = cls_map[hi, wi]
                # on tile overlap add 0.5 value of each tile (i.e. average the overlapping tiles)
                if merged_maps[img_idx][class_idx][y_1 + hi, x_1 + wi] != 0:
                    merged_maps[img_idx][class_idx][y_1 + hi, x_1 + wi] = 0.5 * (
                        map_pixel + merged_maps[img_idx][class_idx][y_1 + hi, x_1 + wi]
                    )
                else:
                    merged_maps[img_idx][class_idx][y_1 + hi, x_1 + wi] = map_pixel

    # Blend in the whole-image saliency maps and normalize the final result.
    norm_maps = []
    for merged_map, image_sal_map in zip(merged_maps, saliency_maps[: self.num_images]):
        for class_idx in range(num_classes):
            # don't have detections for this class on merged map
            if (merged_map[class_idx] == 0).all():
                merged_map[class_idx] = None
            else:
                image_map_cls = image_sal_map[class_idx]
                # resize the feature map for whole image to add it to merged saliency maps
                if image_map_cls is not None:
                    map_h, map_w = merged_map[class_idx].shape
                    image_map_cls = cv2.resize(image_map_cls, (map_w, map_h))
                    merged_map[class_idx] += (0.5 * image_map_cls).astype(dtype)
                merged_map[class_idx] = non_linear_normalization(merged_map[class_idx])
        norm_maps.append(merged_map)

    return norm_maps
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,16 @@
class DetClassProbabilityMapHook(BaseRecordingForwardHook):
"""Saliency map hook for object detection models."""

def __init__(self, module: torch.nn.Module) -> None:
super().__init__(module)
def __init__(self, module: torch.nn.Module, normalize: bool = True, use_cls_softmax: bool = True) -> None:
super().__init__(module, normalize=normalize)
self._neck = module.neck if module.with_neck else None
self._bbox_head = module.bbox_head
self._num_cls_out_channels = module.bbox_head.cls_out_channels # SSD-like heads also have background class
if hasattr(module.bbox_head, "anchor_generator"):
self._num_anchors = module.bbox_head.anchor_generator.num_base_anchors
else:
self._num_anchors = [1] * 10
self.use_cls_softmax = use_cls_softmax

def func(
self,
Expand All @@ -58,7 +59,11 @@ def func(
cls_scores = feature_map
else:
cls_scores = self._get_cls_scores_from_feature_map(feature_map)
cls_scores = [torch.softmax(t, dim=1) for t in cls_scores]

# Don't use softmax for tiles in tiling detection, if the tile doesn't contain objects,
# it would highlight one of the class maps as a background class
if self.use_cls_softmax:
cls_scores = [torch.softmax(t, dim=1) for t in cls_scores]

batch_size, _, height, width = cls_scores[-1].size()
saliency_maps = torch.empty(batch_size, self._num_cls_out_channels, height, width)
Expand All @@ -77,12 +82,12 @@ def func(
)
saliency_maps[batch_idx] = torch.cat(cls_scores_anchorless_resized, dim=0).mean(dim=0)

saliency_maps = saliency_maps.reshape((batch_size, self._num_cls_out_channels, -1))
max_values, _ = torch.max(saliency_maps, -1)
min_values, _ = torch.min(saliency_maps, -1)
saliency_maps = 255 * (saliency_maps - min_values[:, :, None]) / (max_values - min_values + 1e-12)[:, :, None]
if self._norm_saliency_maps:
saliency_maps = saliency_maps.reshape((batch_size, self._num_cls_out_channels, -1))
saliency_maps = self._normalize_map(saliency_maps)

saliency_maps = saliency_maps.reshape((batch_size, self._num_cls_out_channels, height, width))
saliency_maps = saliency_maps.to(torch.uint8)

return saliency_maps

def _get_cls_scores_from_feature_map(self, x: torch.Tensor) -> List:
Expand Down Expand Up @@ -211,11 +216,7 @@ def _get_saliency_maps_from_mask_predictions(
test_cfg = self._module.roi_head.test_cfg.copy()
test_cfg["mask_thr_binary"] = -1

saliency_maps = [] # type: List[List[Optional[np.ndarray]]]
for i in range(batch_size):
saliency_maps.append([])
for j in range(self._module.roi_head.mask_head.num_classes):
saliency_maps[i].append(None)
saliency_maps = [[None for _ in range(self._module.roi_head.mask_head.num_classes)] for _ in range(batch_size)]

for i in range(batch_size):
if det_bboxes[i].shape[0] == 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,13 @@ def custom_atss__simple_test(ctx, self, img, img_metas, **kwargs):
if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feat)
cls_scores = outs[0]
saliency_map = DetClassProbabilityMapHook(self).func(feature_map=cls_scores, cls_scores_provided=True)
postprocess_kwargs = {
"normalize": ctx.cfg["normalize_saliency_maps"],
"use_cls_softmax": ctx.cfg["softmax_saliency_maps"],
}
saliency_map = DetClassProbabilityMapHook(self, **postprocess_kwargs).func(
feature_map=cls_scores, cls_scores_provided=True
)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def custom_deformable_detr__simple_test(ctx, self, img, img_metas, **kwargs):
if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feat)
cls_scores = outs[0]
saliency_map = ActivationMapHook.func(cls_scores)
saliency_map = ActivationMapHook(self).func(cls_scores)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def custom_dino__simple_test(ctx, self, img, img_metas, **kwargs):

if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feats)
saliency_map = ActivationMapHook.func(cls_scores)
saliency_map = ActivationMapHook(self).func(cls_scores)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,13 @@ def custom_single_stage_detector__simple_test(ctx, self, img, img_metas, **kwarg
if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feat)
cls_scores = outs[0]
saliency_map = DetClassProbabilityMapHook(self).func(cls_scores, cls_scores_provided=True)
postprocess_kwargs = {
"normalize": ctx.cfg["normalize_saliency_maps"],
"use_cls_softmax": ctx.cfg["softmax_saliency_maps"],
}
saliency_map = DetClassProbabilityMapHook(self, **postprocess_kwargs).func(
cls_scores, cls_scores_provided=True
)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,13 @@ def custom_yolox__simple_test(ctx, self, img, img_metas, **kwargs):
if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feat)
cls_scores = outs[0]
saliency_map = DetClassProbabilityMapHook(self).func(cls_scores, cls_scores_provided=True)
postprocess_kwargs = {
"use_cls_softmax": ctx.cfg["softmax_saliency_maps"],
"normalize": ctx.cfg["normalize_saliency_maps"],
}
saliency_map = DetClassProbabilityMapHook(self, **postprocess_kwargs).func(
cls_scores, cls_scores_provided=True
)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Expand Down
Loading