Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1221>)
- Fix Kinetics data format to have media data
(<https://github.com/openvinotoolkit/datumaro/pull/1223>)
- Handle undefined labels in the annotation statistics
(<https://github.com/openvinotoolkit/datumaro/pull/1232>)

## 16/11/2023 - Release 1.5.1
### Enhancements
Expand Down
24 changes: 20 additions & 4 deletions src/datumaro/components/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import hashlib
import logging as log
import warnings
from collections import defaultdict
from copy import deepcopy
from typing import Callable, Dict, Optional, Set, Tuple

Expand Down Expand Up @@ -225,10 +226,20 @@ def _extractor_stats(subset_name):


def compute_ann_statistics(dataset: IDataset):
labels = dataset.categories().get(AnnotationType.label, LabelCategories())
warnings.warn(
"We are planning to change the type of stats['annotations']['labels']['distribution'] "
"and stats['annotations']['segments']['pixel distribution'] from `list` to `(named) tuple`. "
"If you are checking the types in your code, please revisit it after upgrading datumaro>=2.0.0.",
FutureWarning,
)
labels: LabelCategories = dataset.categories().get(AnnotationType.label, LabelCategories())

def get_label(ann):
return labels.items[ann.label].name if ann.label is not None else None
try:
return labels.items[ann.label].name if ann.label is not None else None
except IndexError:
log.warning(f"annotation({ann}) has undefined label({ann.label})")
return ann.label

stats = {
"images count": 0,
Expand All @@ -253,21 +264,26 @@ def get_label(ann):
}
label_stat = {
"count": 0,
"distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
"distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
"attributes": {},
}

stats["annotations"]["labels"] = label_stat
segm_stat = {
"avg. area": 0,
"area distribution": [], # a histogram with 10 bins
# (min, min+10%), ..., (min+90%, max) -> (count, total%)
"pixel distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
"pixel distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
}
stats["annotations"]["segments"] = segm_stat
segm_areas = []
pixel_dist = segm_stat["pixel distribution"]
total_pixels = 0

for l in labels.items:
label_stat["distribution"][l.name] = [0, 0]
pixel_dist[l.name] = [0, 0]

for item in dataset:
if len(item.annotations) == 0:
stats["unannotated images"].append(item.id)
Expand Down
3 changes: 3 additions & 0 deletions tests/requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ class Requirements:
DATUM_BUG_618 = "ResizeTransform returns broken image pixels"
DATUM_BUG_721 = "Explain command cannot find the model"
DATUM_BUG_873 = "Error using datum stats"
DATUM_BUG_1204 = (
"Statistics raise an error when there is a label annotation not in the category"
)


class SkipMessages:
Expand Down
302 changes: 301 additions & 1 deletion tests/unit/operations/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,16 @@
import numpy as np
import pytest

from datumaro.components.annotation import Bbox, Caption, Ellipse, Label, Mask, Points
from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.errors import DatumaroError
from datumaro.components.media import Image, PointCloud
from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics
from datumaro.components.operations import (
IMAGE_STATS_SCHEMA,
compute_ann_statistics,
compute_image_statistics,
)

from tests.requirements import Requirements, mark_requirement

Expand Down Expand Up @@ -109,3 +114,298 @@ def test_invalid_media_type(
with pytest.warns(UserWarning, match="only Image media_type is allowed"):
actual = compute_image_statistics(fxt_point_cloud_dataset)
assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"]


class AnnStatisticsTest:
    """Checks for the dictionary returned by ``compute_ann_statistics``."""

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_stats(self):
        """Compare the statistics of a small mixed-annotation dataset with a hand-built result."""
        first_item = DatasetItem(
            id=1,
            media=Image.from_numpy(data=np.ones((5, 5, 3))),
            annotations=[
                Caption("hello"),
                Caption("world"),
                Label(2, attributes={"x": 1, "y": "2"}),
                Bbox(1, 2, 2, 2, label=2, attributes={"score": 0.5}),
                Bbox(5, 6, 2, 2, attributes={"x": 1, "y": "3", "occluded": True}),
                Points([1, 2, 2, 0, 1, 1], label=0),
                Mask(
                    label=3,
                    image=np.array(
                        [
                            [0, 0, 1, 1, 1],
                            [0, 0, 1, 1, 1],
                            [0, 0, 1, 1, 1],
                            [0, 0, 0, 0, 0],
                            [0, 0, 0, 0, 0],
                        ]
                    ),
                ),
            ],
        )
        second_item = DatasetItem(
            id=2,
            media=Image.from_numpy(data=np.ones((2, 4, 3))),
            annotations=[
                Label(2, attributes={"x": 2, "y": "2"}),
                Bbox(1, 2, 2, 2, label=3, attributes={"score": 0.5}),
                Bbox(5, 6, 2, 2, attributes={"x": 2, "y": "3", "occluded": False}),
                Ellipse(5, 6, 2, 2, attributes={"x": 2, "y": "3", "occluded": False}),
            ],
        )
        dataset = Dataset.from_iterable(
            [
                first_item,
                second_item,
                DatasetItem(id=3),
                DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))),
            ],
            categories=[f"label_{i}" for i in range(4)],
        )

        # Expected number of annotations per annotation type.
        type_counts = {
            "label": 2,
            "polygon": 0,
            "polyline": 0,
            "bbox": 4,
            "mask": 1,
            "points": 1,
            "caption": 2,
            "cuboid_3d": 0,
            "super_resolution_annotation": 0,
            "depth_annotation": 0,
            "ellipse": 1,
            "hash_key": 0,
            "feature_vector": 0,
            "tabular": 0,
            "unknown": 0,
        }

        # Ten equal-width bins over [4.0, 9.0]; only the first and the last are populated.
        bin_counts = [2, 0, 0, 0, 0, 0, 0, 0, 0, 1]
        area_histogram = [
            {
                "min": 4.0 + 0.5 * idx,
                "max": 4.5 + 0.5 * idx,
                "count": hits,
                "percent": hits / 3,
            }
            for idx, hits in enumerate(bin_counts)
        ]

        label_section = {
            "count": 6,
            "distribution": {
                "label_0": [1, 1 / 6],
                "label_1": [0, 0.0],
                "label_2": [3, 3 / 6],
                "label_3": [2, 2 / 6],
            },
            "attributes": {
                "x": {
                    "count": 2,  # annotations with no label are skipped
                    "values count": 2,
                    "values present": ["1", "2"],
                    "distribution": {
                        "1": [1, 1 / 2],
                        "2": [1, 1 / 2],
                    },
                },
                "y": {
                    "count": 2,  # annotations with no label are skipped
                    "values count": 1,
                    "values present": ["2"],
                    "distribution": {
                        "2": [2, 2 / 2],
                    },
                },
                # must not include "special" attributes like "occluded"
            },
        }
        segment_section = {
            "avg. area": (4 * 2 + 9 * 1) / 3,
            "area distribution": area_histogram,
            "pixel distribution": {
                "label_0": [0, 0.0],
                "label_1": [0, 0.0],
                "label_2": [4, 4 / 17],
                "label_3": [13, 13 / 17],
            },
        }
        expected = {
            "images count": 4,
            "annotations count": 11,
            "unannotated images count": 2,
            "unannotated images": ["3", "2.2"],
            "annotations by type": {
                type_name: {"count": count} for type_name, count in type_counts.items()
            },
            "annotations": {
                "labels": label_section,
                "segments": segment_section,
            },
        }

        actual = compute_ann_statistics(dataset)

        assert actual == expected

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_stats_with_empty_dataset(self):
        """Items without annotations must all be listed as unannotated images."""
        label_names = [f"label_{i}" for i in range(4)]
        dataset = Dataset.from_iterable(
            [DatasetItem(id=1), DatasetItem(id=3)],
            categories=label_names,
        )

        expected = self._get_stats_template(label_names)
        expected["images count"] = 2
        expected["unannotated images count"] = 2
        expected["unannotated images"] = ["1", "3"]

        actual = compute_ann_statistics(dataset)

        assert actual == expected

    @mark_requirement(Requirements.DATUM_BUG_1204)
    def test_stats_with_invalid_label(self):
        """A label index missing from the categories is counted under its raw index."""
        label_names = [f"label_{i}" for i in range(3)]
        # Indices 0..3 are annotated, but only 0..2 are declared as categories.
        items = [DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)]
        dataset = Dataset.from_iterable(iterable=items, categories=label_names)

        expected = self._get_stats_template(label_names)
        expected["images count"] = 4
        expected["annotations count"] = 4
        expected["annotations by type"]["label"]["count"] = 4

        label_section = expected["annotations"]["labels"]
        label_section["count"] = 4
        label_section["distribution"] = {
            **{name: [1, 0.25] for name in label_names},
            3: [1, 0.25],  # label which does not exist in categories.
        }

        actual = compute_ann_statistics(dataset)

        assert actual == expected

    @staticmethod
    def _get_stats_template(label_names: list):
        """Build the all-zero statistics dictionary for the given label names.

        Every call returns freshly created containers, so a test may mutate
        the result without affecting other tests.
        """
        annotation_type_names = (
            "label",
            "polygon",
            "polyline",
            "bbox",
            "mask",
            "points",
            "caption",
            "cuboid_3d",
            "super_resolution_annotation",
            "depth_annotation",
            "ellipse",
            "hash_key",
            "feature_vector",
            "tabular",
            "unknown",
        )
        label_section = {
            "count": 0,
            "distribution": {name: [0, 0] for name in label_names},
            "attributes": {},
        }
        segment_section = {
            "avg. area": 0.0,
            "area distribution": [],
            "pixel distribution": {name: [0, 0] for name in label_names},
        }
        return {
            "images count": 0,
            "annotations count": 0,
            "unannotated images count": 0,
            "unannotated images": [],
            "annotations by type": {
                type_name: {"count": 0} for type_name in annotation_type_names
            },
            "annotations": {
                "labels": label_section,
                "segments": segment_section,
            },
        }
Loading