Skip to content

Commit fb017ad

Browse files
MetricsLambda and ClassificationReport ability to use metrics_result_mode. (#3531)
Fixes #3513 The main reason the issue #3513 was created, particularly for me, was the ClassificationReport behavior. So I'm creating this PR to add this functionality, following up on the #3514 PR. --------- Co-authored-by: vfdev <vfdev.5@gmail.com>
1 parent ff116eb commit fb017ad

File tree

4 files changed

+103
-6
lines changed

4 files changed

+103
-6
lines changed

ignite/metrics/classification_report.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import json
2-
from typing import Callable, Collection, Dict, List, Optional, Union
2+
from typing import Callable, Collection, Dict, List, Literal, Optional, Union
33

44
import torch
55

@@ -18,6 +18,7 @@ def ClassificationReport(
1818
device: Union[str, torch.device] = torch.device("cpu"),
1919
is_multilabel: bool = False,
2020
labels: Optional[List[str]] = None,
21+
metrics_result_mode: Literal["flatten", "named", "both"] = "both",
2122
) -> MetricsLambda:
2223
r"""Build a text report showing the main classification metrics. The report resembles in functionality to
2324
`scikit-learn classification_report
@@ -34,6 +35,11 @@ def ClassificationReport(
3435
is_multilabel: If True, the tensors are assumed to be multilabel.
3536
device: optional device specification for internal storage.
3637
labels: Optional list of label indices to include in the report
38+
metrics_result_mode: specifies how to put the computed metrics results into
39+
``engine.state.metrics`` dictionary. Valid values are: "flatten", "named", "both".
40+
- "flatten": if the computed result is a mapping, its keys/values are put directly into the engine state metrics dictionary
41+
- "named": if the computed result is a mapping, the whole mapping is put into the engine state metrics dictionary under the metric name
42+
- "both": combination of "flatten" and "named".
3743
3844
Examples:
3945
@@ -107,6 +113,8 @@ def ClassificationReport(
107113
{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0}
108114
{'precision': 0.2333..., 'recall': 0.6666..., 'f1-score': 0.3333...}
109115
116+
.. versionchanged:: 0.5.4
117+
added ``metrics_result_mode`` argument.
110118
"""
111119

112120
# setup all the underlying metrics
@@ -144,4 +152,13 @@ def _wrapper(
144152
def _get_label_for_class(idx: int) -> str:
145153
return labels[idx] if labels else str(idx)
146154

147-
return MetricsLambda(_wrapper, recall, precision, fbeta, averaged_recall, averaged_precision, averaged_fbeta)
155+
return MetricsLambda(
156+
_wrapper,
157+
recall,
158+
precision,
159+
fbeta,
160+
averaged_recall,
161+
averaged_precision,
162+
averaged_fbeta,
163+
metrics_result_mode=metrics_result_mode,
164+
)

ignite/metrics/metrics_lambda.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import itertools
2-
from typing import Any, Callable, Optional, Union
2+
from typing import Any, Callable, Optional, Union, Literal
33

44
import torch
55

@@ -24,6 +24,14 @@ class MetricsLambda(Metric):
2424
f: the function that defines the computation
2525
args: Sequence of other metrics or something
2626
else that will be fed to ``f`` as arguments.
27+
skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be
28+
true for multi-output models, for example, if ``y_pred`` contains multi-output as ``(y_pred_a, y_pred_b)``.
29+
Alternatively, ``output_transform`` can be used to handle this.
30+
metrics_result_mode: specifies how to put the computed metrics results into
31+
``engine.state.metrics`` dictionary. Valid values are: "flatten", "named", "both".
32+
- "flatten": if the computed result is a mapping, its keys/values are put directly into the engine state metrics dictionary
33+
- "named": if the computed result is a mapping, the whole mapping is put into the engine state metrics dictionary under the metric name
34+
- "both": combination of "flatten" and "named".
2735
kwargs: Sequence of other metrics or something
2836
else that will be fed to ``f`` as keyword arguments.
2937
@@ -88,17 +96,27 @@ def Fbeta(r, p, beta):
8896
assert not aP.is_attached(engine)
8997
# fully attached
9098
assert not precision.is_attached(engine)
99+
100+
.. versionchanged:: 0.5.4
101+
added ``skip_unrolling`` and ``metrics_result_mode`` arguments.
91102
"""
92103

93104
_state_dict_all_req_keys = ("_updated", "args", "kwargs")
94105

95-
def __init__(self, f: Callable, *args: Any, **kwargs: Any) -> None:
106+
def __init__(
107+
self,
108+
f: Callable,
109+
*args: Any,
110+
skip_unrolling: bool = False,
111+
metrics_result_mode: Literal["flatten", "named", "both"] = "both",
112+
**kwargs: Any,
113+
) -> None:
96114
self.function = f
97115
self.args = list(args) # we need args to be a list instead of a tuple for state_dict/load_state_dict feature
98116
self.kwargs = kwargs
99117
self.engine: Optional[Engine] = None
100118
self._updated = False
101-
super().__init__(device="cpu")
119+
super().__init__(device="cpu", metrics_result_mode=metrics_result_mode, skip_unrolling=skip_unrolling)
102120

103121
@reinit__is_reduced
104122
def reset(self) -> None:

tests/ignite/metrics/test_classification_report.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import ignite.distributed as idist
99
from ignite.engine import Engine
10+
from ignite.metrics import MetricsLambda
1011
from ignite.metrics.classification_report import ClassificationReport
1112

1213

@@ -157,6 +158,23 @@ def _test_integration_multiclass(device, output_dict):
157158
_test_multiclass(metric_device, n_classes, output_dict, labels=labels[:n_classes], distributed=True)
158159

159160

161+
@pytest.mark.parametrize(
162+
"metrics_result_mode",
163+
[
164+
"flatten",
165+
"named",
166+
"both",
167+
],
168+
)
169+
def test_metrics_result_mode(metrics_result_mode):
170+
metric = ClassificationReport(output_dict=True, metrics_result_mode=metrics_result_mode)
171+
172+
assert isinstance(metric, MetricsLambda), "ClassificationReport should be an instance of MetricsLambda"
173+
assert (
174+
metric._metrics_result_mode == metrics_result_mode
175+
), f"Expected metrics_result_mode to be {metrics_result_mode}"
176+
177+
160178
def _test_integration_multilabel(device, output_dict):
161179
rank = idist.get_rank()
162180

@@ -197,7 +215,6 @@ def test_compute_multilabel(n_times, available_device):
197215
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
198216
@pytest.mark.skipif(Version(torch.__version__) < Version("1.7.0"), reason="Skip if < 1.7.0")
199217
def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
200-
201218
pytest.skip("Temporarily skip failing test. See https://github.com/pytorch/ignite/pull/3301")
202219
# When run with 2 devices:
203220
# tests/ignite/metrics/test_classification_report.py::test_distrib_nccl_gpu Fatal Python error: Aborted

tests/ignite/metrics/test_metrics_lambda.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,51 @@ def test_load_state_dict():
299299
assert e2 == e
300300

301301

302+
@pytest.mark.parametrize(
303+
"metrics_result_mode",
304+
[
305+
"flatten",
306+
"named",
307+
"both",
308+
],
309+
)
310+
def test_metrics_lambda_result_mode_behavior(metrics_result_mode):
311+
# dummy for now
312+
def dummy_compute_fn(*args, **kwargs):
313+
return {
314+
"precision": 0.5,
315+
"recall": 0.5,
316+
"f1-score": 0.5,
317+
}
318+
319+
class DummyMetric(Metric):
320+
def __init__(self, output_transform=lambda x: x):
321+
super().__init__(output_transform=output_transform, metrics_result_mode=metrics_result_mode)
322+
323+
def reset(self): ...
324+
325+
def update(self, output): ...
326+
327+
def compute(self):
328+
return dummy_compute_fn()
329+
330+
metric_a = MetricsLambda(dummy_compute_fn, metrics_result_mode=metrics_result_mode)
331+
metric_b = DummyMetric()
332+
333+
engine_a = Engine(lambda e, b: b)
334+
metric_a.attach(engine_a, "dummy_metric")
335+
336+
engine_b = Engine(lambda e, b: b)
337+
metric_b.attach(engine_b, "dummy_metric")
338+
339+
state_a = engine_a.run([0], max_epochs=1)
340+
state_b = engine_b.run([0], max_epochs=1)
341+
342+
assert state_a.metrics.keys() == state_b.metrics.keys()
343+
344+
assert state_a.metrics == state_b.metrics
345+
346+
302347
def test_state_metrics():
303348
y_pred = torch.randint(0, 2, size=(15, 10, 4)).float()
304349
y = torch.randint(0, 2, size=(15, 10, 4)).long()

0 commit comments

Comments
 (0)