Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file.
- Support encrypted dataset training (<https://github.com/openvinotoolkit/training_extensions/pull/2209>)
- Add custom max iou assigner to prevent CPU OOM when large annotations are used (<https://github.com/openvinotoolkit/training_extensions/pull/2228>)
- Auto train type detection for Semi-SL, Self-SL and Incremental: "--train-type" now is optional (https://github.com/openvinotoolkit/training_extensions/pull/2195)
- Add per-class XAI saliency maps for Mask R-CNN model (https://github.com/openvinotoolkit/training_extensions/pull/2227)
- Add new object detector Deformable DETR (<https://github.com/openvinotoolkit/training_extensions/pull/2249>)
- Add new object detector DINO (<https://github.com/openvinotoolkit/training_extensions/pull/2266>)
- Add new visual prompting task (https://github.com/openvinotoolkit/training_extensions/pull/2203)
Expand All @@ -20,6 +21,7 @@ All notable changes to this project will be documented in this file.
- Set persistent_workers and pin_memory as True in detection task (<https://github.com/openvinotoolkit/training_extensions/pull/2224>)
- New algorithm for Semi-SL semantic segmentation based on metric learning via class prototypes (https://github.com/openvinotoolkit/training_extensions/pull/2156)
- Self-SL for classification now can receive just a folder with any images to start contrastive pretraining (https://github.com/openvinotoolkit/training_extensions/pull/2219)
- Improve XAI saliency map generation for tiling detection and tiling instance segmentation (https://github.com/openvinotoolkit/training_extensions/pull/2240)

### Bug fixes

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,15 @@ class BaseRecordingForwardHook(ABC):
module (torch.nn.Module): The PyTorch module to be registered in forward pass
fpn_idx (int, optional): The layer index to be processed if the model is a FPN.
Defaults to 0 which uses the largest feature map from FPN.
normalize (bool): Whether to normalize the resulting saliency maps.
"""

def __init__(self, module: torch.nn.Module, fpn_idx: int = -1) -> None:
def __init__(self, module: torch.nn.Module, fpn_idx: int = -1, normalize: bool = True) -> None:
self._module = module
self._handle = None
self._records: List[torch.Tensor] = []
self._fpn_idx = fpn_idx
self._norm_saliency_maps = normalize

@property
def records(self):
Expand Down Expand Up @@ -97,12 +99,23 @@ def __exit__(self, exc_type, exc_value, traceback):
"""Exit."""
self._handle.remove()

def _normalize_map(self, saliency_maps: torch.Tensor) -> torch.Tensor:
"""Normalize saliency maps."""
max_values, _ = torch.max(saliency_maps, -1)
min_values, _ = torch.min(saliency_maps, -1)
if len(saliency_maps.shape) == 2:
saliency_maps = 255 * (saliency_maps - min_values[:, None]) / (max_values - min_values + 1e-12)[:, None]
else:
saliency_maps = (
255 * (saliency_maps - min_values[:, :, None]) / (max_values - min_values + 1e-12)[:, :, None]
)
return saliency_maps.to(torch.uint8)


class EigenCamHook(BaseRecordingForwardHook):
"""EigenCamHook."""

@staticmethod
def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int = -1) -> torch.Tensor:
def func(self, feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int = -1) -> torch.Tensor:
"""Generate the saliency map."""
if isinstance(feature_map, (list, tuple)):
feature_map = feature_map[fpn_idx]
Expand All @@ -112,20 +125,19 @@ def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int
reshaped_fmap = x.reshape((batch_size, channel, h * w)).transpose(1, 2)
reshaped_fmap = reshaped_fmap - reshaped_fmap.mean(1)[:, None, :]
_, _, vh = torch.linalg.svd(reshaped_fmap, full_matrices=True) # pylint: disable=invalid-name
saliency_map = (reshaped_fmap @ vh[:, 0][:, :, None]).squeeze(-1)
max_values, _ = torch.max(saliency_map, -1)
min_values, _ = torch.min(saliency_map, -1)
saliency_map = 255 * (saliency_map - min_values[:, None]) / ((max_values - min_values + 1e-12)[:, None])

if self._norm_saliency_maps:
saliency_map = (reshaped_fmap @ vh[:, 0][:, :, None]).squeeze(-1)
self._normalize_map(saliency_map)

saliency_map = saliency_map.reshape((batch_size, h, w))
saliency_map = saliency_map.to(torch.uint8)
return saliency_map


class ActivationMapHook(BaseRecordingForwardHook):
"""ActivationMapHook."""

@staticmethod
def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int = -1) -> torch.Tensor:
def func(self, feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int = -1) -> torch.Tensor:
"""Generate the saliency map by average feature maps then normalizing to (0, 255)."""
if isinstance(feature_map, (list, tuple)):
assert fpn_idx < len(
Expand All @@ -135,12 +147,12 @@ def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int

batch_size, _, h, w = feature_map.size()
activation_map = torch.mean(feature_map, dim=1)
activation_map = activation_map.reshape((batch_size, h * w))
max_values, _ = torch.max(activation_map, -1)
min_values, _ = torch.min(activation_map, -1)
activation_map = 255 * (activation_map - min_values[:, None]) / (max_values - min_values + 1e-12)[:, None]

if self._norm_saliency_maps:
activation_map = activation_map.reshape((batch_size, h * w))
activation_map = self._normalize_map(activation_map)

activation_map = activation_map.reshape((batch_size, h, w))
activation_map = activation_map.to(torch.uint8)
return activation_map


Expand Down Expand Up @@ -193,12 +205,11 @@ def func(self, feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx
mosaic_prediction = self._predict_from_feature_map(mosaic_feature_map)
saliency_maps[f] = mosaic_prediction.transpose(0, 1).reshape((self._num_classes, h, w))

saliency_maps = saliency_maps.reshape((batch_size, self._num_classes, h * w))
max_values, _ = torch.max(saliency_maps, -1)
min_values, _ = torch.min(saliency_maps, -1)
saliency_maps = 255 * (saliency_maps - min_values[:, :, None]) / (max_values - min_values + 1e-12)[:, :, None]
if self._norm_saliency_maps:
saliency_maps = saliency_maps.reshape((batch_size, self._num_classes, h * w))
saliency_maps = self._normalize_map(saliency_maps)

saliency_maps = saliency_maps.reshape((batch_size, self._num_classes, h, w))
saliency_maps = saliency_maps.to(torch.uint8)
return saliency_maps

def _predict_from_feature_map(self, x: torch.Tensor) -> torch.Tensor:
Expand Down
41 changes: 41 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,44 @@ def merge(self, results) -> Union[List[Tuple[np.ndarray, list]], List[np.ndarray
merged_results (list[list | tuple]): Merged results of the dataset.
"""
return self.tile_dataset.merge(results)

def merge_vectors(self, feature_vectors: List[np.ndarray], dump_vectors: bool) -> Union[np.ndarray, List[None]]:
    """Merge tile-level feature vectors into one image-level feature vector per image.

    Args:
        feature_vectors (list[np.ndarray]): tile-level feature vectors.
        dump_vectors (bool): whether feature vectors should be dumped.

    Returns:
        merged_vectors (np.ndarray | List[None]): Merged vector for each image,
            or one None placeholder per image when dumping is disabled.
    """
    if not dump_vectors:
        # Vectors were not requested: emit a placeholder for every image.
        return [None] * self.num_samples
    # Delegate the actual merging to the underlying tile dataset.
    return self.tile_dataset.merge_vectors(feature_vectors)

def merge_maps(self, saliency_maps: List, dump_maps: bool) -> List:
    """Merge tile-level saliency maps to image-level saliency map.

    Args:
        saliency_maps (list[list | np.ndarray]): tile-level saliency maps.
            Entries (or their nested per-class entries) may be None when
            no map was produced.
        dump_maps (bool): whether to dump saliency maps.

    Returns:
        merged_maps (List[list | np.ndarray | None]): Merged saliency map for each image.
    """

    if dump_maps:
        # Elementwise None-check over the (possibly nested) structure:
        # only merge if at least one tile actually produced a map.
        if not (np.array(saliency_maps) == None).all():  # noqa
            return self.tile_dataset.merge_maps(saliency_maps)
        else:
            # return None for each class for each image
            return saliency_maps[: self.num_samples]
    else:
        # Maps were not requested: emit a placeholder for every image.
        return [None] * self.num_samples

def __del__(self):
"""Delete the temporary directory when the object is deleted."""
if getattr(self, "tmp_dir", False):
self.tmp_dir.cleanup()
90 changes: 90 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/datasets/tiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
from time import time
from typing import Callable, Dict, List, Tuple, Union

import cv2
import numpy as np
from mmcv.ops import nms
from mmdet.core import BitmapMasks, bbox2result
from tqdm import tqdm

from otx.api.utils.dataset_utils import non_linear_normalization


def timeit(func) -> Callable:
"""Decorator to measure time of function execution.
Expand Down Expand Up @@ -471,3 +474,90 @@ def get_ann_info(self, idx):
if "gt_labels" in self.tiles[idx]:
ann["labels"] = self.tiles[idx]["gt_labels"]
return ann

def merge_vectors(self, feature_vectors: List[np.ndarray]) -> np.ndarray:
    """Merge tile-level feature vectors to image-level feature vector.

    The incoming list holds an equal number of consecutive vectors per
    image; each image's group of vectors is averaged into one vector.

    Args:
        feature_vectors (List[np.ndarray]): tile-level feature vectors.

    Returns:
        merged_vectors (List[np.ndarray]): Merged vectors for each image.
    """

    chunk_size = len(feature_vectors) // self.num_images
    # Group consecutive vectors that belong to the same image.
    grouped = []
    for start in range(0, len(feature_vectors), chunk_size):
        grouped.append(feature_vectors[start : start + chunk_size])
    # Average each image's group of tile vectors along the chunk axis.
    return np.average(grouped, axis=1)

def merge_maps(self, saliency_maps: Union[List[List[np.ndarray]], List[np.ndarray]]) -> List:
    """Merge tile-level saliency maps to image-level saliency map.

    The first ``self.num_images`` entries of ``saliency_maps`` are maps
    computed on whole (downscaled) images; the remaining entries are maps
    computed on tiles, matched positionally against ``self.tiles``.

    Args:
        saliency_maps (List[List[np.array] | np.ndarray]): tile-level saliency maps.
            Each map is a list of maps for each detected class or None if class wasn't detected.

    Returns:
        merged_maps (List[list | np.ndarray | None]): Merged saliency maps for each image.
    """
    merged_maps = []
    ratios = {}
    # NOTE(review): assumes saliency_maps[0][0] is a valid ndarray (first
    # image, first class not None) — confirm the caller guarantees this.
    num_classes = len(saliency_maps[0])
    feat_h, feat_w = saliency_maps[0][0].shape
    # dtype of one row of the first map (rows share the map's dtype).
    dtype = saliency_maps[0][0][0].dtype

    # Allocate a zeroed per-class canvas for every original image, sized by
    # the feature-map-to-tile scale ratio applied to the image dimensions.
    for orig_image in self.cached_results:
        img_idx = orig_image["index"]
        ratios[img_idx] = np.array([feat_h, feat_w]) / self.tile_size
        image_h, image_w = orig_image["height"], orig_image["width"]

        image_map_h = int(image_h * ratios[img_idx][0])
        image_map_w = int(image_w * ratios[img_idx][1])
        merged_maps.append([np.zeros((image_map_h, image_map_w)) for _ in range(num_classes)])

    # Paste every tile's per-class maps onto the matching image canvas.
    for map, tile in zip(saliency_maps[self.num_images :], self.tiles[self.num_images :]):
        for class_idx in range(num_classes):
            if map[class_idx] is None:
                continue
            cls_map = map[class_idx]
            img_idx = tile["dataset_idx"]
            x_1, y_1, x_2, y_2 = tile["tile_box"]
            # Convert the tile box from image pixels to feature-map coordinates.
            y_1, x_1 = ((y_1, x_1) * ratios[img_idx]).astype(np.uint16)
            y_2, x_2 = ((y_2, x_2) * ratios[img_idx]).astype(np.uint16)

            map_h, map_w = cls_map.shape
            # resize feature map if it got from the tile which width and height is less the tile_size
            if (map_h > y_2 - y_1 > 0) and (map_w > x_2 - x_1 > 0):
                cls_map = cv2.resize(cls_map, (x_2 - x_1, y_2 - y_1))
                # cut the rest of the feature map that went out of the image borders
                map_h, map_w = y_2 - y_1, x_2 - x_1

            for hi, wi in [(h_, w_) for h_ in range(map_h) for w_ in range(map_w)]:
                map_pixel = cls_map[hi, wi]
                # on tile overlap add 0.5 value of each tile (i.e. average the overlapping tiles)
                if merged_maps[img_idx][class_idx][y_1 + hi, x_1 + wi] != 0:
                    merged_maps[img_idx][class_idx][y_1 + hi, x_1 + wi] = 0.5 * (
                        map_pixel + merged_maps[img_idx][class_idx][y_1 + hi, x_1 + wi]
                    )
                else:
                    merged_maps[img_idx][class_idx][y_1 + hi, x_1 + wi] = map_pixel

    # Blend in the whole-image saliency maps and normalize the final result.
    norm_maps = []
    for merged_map, image_sal_map in zip(merged_maps, saliency_maps[: self.num_images]):
        for class_idx in range(num_classes):
            # don't have detections for this class on merged map
            if (merged_map[class_idx] == 0).all():
                merged_map[class_idx] = None
            else:
                image_map_cls = image_sal_map[class_idx]
                # resize the feature map for whole image to add it to merged saliency maps
                if image_map_cls is not None:
                    map_h, map_w = merged_map[class_idx].shape
                    image_map_cls = cv2.resize(image_map_cls, (map_w, map_h))
                    merged_map[class_idx] += (0.5 * image_map_cls).astype(dtype)
                merged_map[class_idx] = non_linear_normalization(merged_map[class_idx])
        norm_maps.append(merged_map)

    return norm_maps
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,16 @@
class DetClassProbabilityMapHook(BaseRecordingForwardHook):
"""Saliency map hook for object detection models."""

def __init__(self, module: torch.nn.Module) -> None:
super().__init__(module)
def __init__(self, module: torch.nn.Module, normalize: bool = True, use_cls_softmax: bool = True) -> None:
super().__init__(module, normalize=normalize)
self._neck = module.neck if module.with_neck else None
self._bbox_head = module.bbox_head
self._num_cls_out_channels = module.bbox_head.cls_out_channels # SSD-like heads also have background class
if hasattr(module.bbox_head, "anchor_generator"):
self._num_anchors = module.bbox_head.anchor_generator.num_base_anchors
else:
self._num_anchors = [1] * 10
self.use_cls_softmax = use_cls_softmax

def func(
self,
Expand All @@ -58,7 +59,11 @@ def func(
cls_scores = feature_map
else:
cls_scores = self._get_cls_scores_from_feature_map(feature_map)
cls_scores = [torch.softmax(t, dim=1) for t in cls_scores]

# Don't use softmax for tiles in tiling detection, if the tile doesn't contain objects,
# it would highlight one of the class maps as a background class
if self.use_cls_softmax:
cls_scores = [torch.softmax(t, dim=1) for t in cls_scores]

batch_size, _, height, width = cls_scores[-1].size()
saliency_maps = torch.empty(batch_size, self._num_cls_out_channels, height, width)
Expand All @@ -77,12 +82,12 @@ def func(
)
saliency_maps[batch_idx] = torch.cat(cls_scores_anchorless_resized, dim=0).mean(dim=0)

saliency_maps = saliency_maps.reshape((batch_size, self._num_cls_out_channels, -1))
max_values, _ = torch.max(saliency_maps, -1)
min_values, _ = torch.min(saliency_maps, -1)
saliency_maps = 255 * (saliency_maps - min_values[:, :, None]) / (max_values - min_values + 1e-12)[:, :, None]
if self._norm_saliency_maps:
saliency_maps = saliency_maps.reshape((batch_size, self._num_cls_out_channels, -1))
saliency_maps = self._normalize_map(saliency_maps)

saliency_maps = saliency_maps.reshape((batch_size, self._num_cls_out_channels, height, width))
saliency_maps = saliency_maps.to(torch.uint8)

return saliency_maps

def _get_cls_scores_from_feature_map(self, x: torch.Tensor) -> List:
Expand Down Expand Up @@ -211,11 +216,7 @@ def _get_saliency_maps_from_mask_predictions(
test_cfg = self._module.roi_head.test_cfg.copy()
test_cfg["mask_thr_binary"] = -1

saliency_maps = [] # type: List[List[Optional[np.ndarray]]]
for i in range(batch_size):
saliency_maps.append([])
for j in range(self._module.roi_head.mask_head.num_classes):
saliency_maps[i].append(None)
saliency_maps = [[None for _ in range(self._module.roi_head.mask_head.num_classes)] for _ in range(batch_size)]

for i in range(batch_size):
if det_bboxes[i].shape[0] == 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,13 @@ def custom_atss__simple_test(ctx, self, img, img_metas, **kwargs):
if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feat)
cls_scores = outs[0]
saliency_map = DetClassProbabilityMapHook(self).func(feature_map=cls_scores, cls_scores_provided=True)
postprocess_kwargs = {
"normalize": ctx.cfg["normalize_saliency_maps"],
"use_cls_softmax": ctx.cfg["softmax_saliency_maps"],
}
saliency_map = DetClassProbabilityMapHook(self, **postprocess_kwargs).func(
feature_map=cls_scores, cls_scores_provided=True
)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def custom_deformable_detr__simple_test(ctx, self, img, img_metas, **kwargs):
if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feat)
cls_scores = outs[0]
saliency_map = ActivationMapHook.func(cls_scores)
saliency_map = ActivationMapHook(self).func(cls_scores)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def custom_dino__simple_test(ctx, self, img, img_metas, **kwargs):

if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feats)
saliency_map = ActivationMapHook.func(cls_scores)
saliency_map = ActivationMapHook(self).func(cls_scores)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,13 @@ def custom_single_stage_detector__simple_test(ctx, self, img, img_metas, **kwarg
if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feat)
cls_scores = outs[0]
saliency_map = DetClassProbabilityMapHook(self).func(cls_scores, cls_scores_provided=True)
postprocess_kwargs = {
"normalize": ctx.cfg["normalize_saliency_maps"],
"use_cls_softmax": ctx.cfg["softmax_saliency_maps"],
}
saliency_map = DetClassProbabilityMapHook(self, **postprocess_kwargs).func(
cls_scores, cls_scores_provided=True
)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,13 @@ def custom_yolox__simple_test(ctx, self, img, img_metas, **kwargs):
if ctx.cfg["dump_features"]:
feature_vector = FeatureVectorHook.func(feat)
cls_scores = outs[0]
saliency_map = DetClassProbabilityMapHook(self).func(cls_scores, cls_scores_provided=True)
postprocess_kwargs = {
"use_cls_softmax": ctx.cfg["softmax_saliency_maps"],
"normalize": ctx.cfg["normalize_saliency_maps"],
}
saliency_map = DetClassProbabilityMapHook(self, **postprocess_kwargs).func(
cls_scores, cls_scores_provided=True
)
return (*bbox_results, feature_vector, saliency_map)

return bbox_results
Expand Down
Loading