open-edge-platform · AlbertvanHouten · Apr 2, 2026 · Copilot · Apr 2, 2026
@@ -2,10 +2,12 @@
 #
 # SPDX-License-Identifier: MIT
 
+import logging
+
 import polars as pl
 
 from datumaro.experimental.categories import LabelCategories
-from datumaro.experimental.converters.base import ConversionError, Converter, list_eval_ref
+from datumaro.experimental.converters.base import Converter, list_eval_ref
 from datumaro.experimental.converters.registry import converter
 from datumaro.experimental.fields.annotations import (
     BBoxField,
@@ -18,6 +20,8 @@
 from datumaro.experimental.fields.images import ImageField
 from datumaro.experimental.schema import AttributeSpec
 
+log = logging.getLogger(__name__)
+
 
 @converter
 class LabelIndexConverter(Converter):
@@ -336,8 +340,8 @@ class LabelShapeConverter(Converter):
       - ``is_list=True, multi_label=False`` ↔ ``is_list=False, multi_label=True``:
         both map to ``List(dtype)``; the conversion is a no-op on the data.
 
-    Unsupported (lossy) conversions:
-      - ``List(dtype) → dtype`` (multi_label or is_list reduction): rejected to prevent silent data loss
+    Lossy conversions (keep the first element and log a warning):
+      - ``List(dtype) → dtype`` (multi_label or is_list reduction): keeps the first element only
     """
 
     input_label: AttributeSpec[LabelField]
@@ -359,6 +363,63 @@ def filter_output_spec(self) -> bool:
 
         return input_field.multi_label != output_field.multi_label or input_field.is_list != output_field.is_list
 
+    def _convert_multi_label(
+        self, df: pl.DataFrame, input_col: str, output_col: str, input_is_list: bool
+    ) -> pl.DataFrame:
+        """Collapse the inner multi-label dimension by keeping only the first label.
+
+        This is lossy, so a warning naming ``input_col`` is logged on every call.
+
+        Args:
+            df: Frame holding ``input_col``.
+            input_col: Column to read the multi-label values from.
+            output_col: Name given to the reduced column.
+            input_is_list: ``True`` when each sample is a list of label lists
+                (``List(List(dtype)) → List(dtype)``); ``False`` when each sample
+                is a single label list (``List(dtype) → dtype``).
+
+        Returns:
+            The result of ``df.with_columns`` with the reduced column aliased to ``output_col``.
+        """
+        log.warning(
+            "Converting multi-label to single-label for field '%s': keeping the first label only. Data may be lost.",
+            input_col,
+        )
+        source = pl.col(input_col)
+        if input_is_list:
+            # Reduce every inner list to its first element, keeping the outer list.
+            first_label = source.list.eval(pl.element().list.first())
+        else:
+            # Reduce the sample's only list to its first element.
+            first_label = source.list.first()
+        return df.with_columns(first_label.alias(output_col))
+
+    def _expand_multi_label(
+        self, df: pl.DataFrame, input_col: str, output_col: str, input_is_list: bool
+    ) -> pl.DataFrame:
+        """Add the inner multi-label dimension by wrapping each label in a list.
+
+        Args:
+            df: Frame holding ``input_col``.
+            input_col: Column to read the single-label values from.
+            output_col: Name given to the expanded column.
+            input_is_list: ``True`` when each sample is a list of labels
+                (``List(dtype) → List(List(dtype))``); ``False`` when each sample
+                is one scalar label (``dtype → List(dtype)``).
+
+        Returns:
+            The result of ``df.with_columns`` with the wrapped column aliased to ``output_col``.
+        """
+        source = pl.col(input_col)
+        if input_is_list:
+            # Wrap every element of the per-sample list in its own one-element list.
+            wrapped = source.list.eval(pl.concat_list(pl.element()))
+        else:
+            # Wrap the scalar, but keep null samples null rather than wrapping them.
+            null_list = pl.lit(None, dtype=pl.List(self.input_label.field.dtype))
+            wrapped = pl.when(source.is_not_null()).then(pl.concat_list(source)).otherwise(null_list)
+        return df.with_columns(wrapped.alias(output_col))
+
+    def _convert_is_list(
+        self, df: pl.DataFrame, src_col: str, output_col: str, input_is_list: bool, output_multi: bool
+    ) -> pl.DataFrame:
+        """Handle is_list conversion (outer dimension).
+
+        Args:
+            df: Frame holding ``src_col``.
+            src_col: Column to read from.
+            output_col: Name given to the converted column.
+            input_is_list: ``True`` reduces ``List(X) → X`` (lossy: the first
+                element is kept and a warning is logged); ``False`` wraps
+                ``X → List(X)``.
+            output_multi: Whether the values being wrapped are themselves lists
+                (multi-label). Only consulted on the wrapping path.
+
+        Returns:
+            The result of ``df.with_columns`` with the converted column aliased to ``output_col``.
+        """
+        if input_is_list:
+            # Pass exactly one argument for the single ``%s`` placeholder; extra
+            # positional arguments make the logging call fail while formatting.
+            log.warning(
+                "Converting list to non-list for field '%s': keeping the first element only. Data may be lost.",
+                src_col,
+            )
+            # List(X) → X: take the first element
+            return df.with_columns(pl.col(src_col).list.first().alias(output_col))
+        # X → List(X): wrap each sample's value in a 1-element list, preserving nulls
+        if not output_multi:
+            # Scalar → List(scalar): use native concat_list (fast path)
+            return df.with_columns(
+                pl.when(pl.col(src_col).is_not_null())
+                .then(pl.concat_list(pl.col(src_col)))
+                .otherwise(pl.lit(None, dtype=self.output_label.field._pl_type))
+                .alias(output_col)
+            )
+        # List(X) → List(List(X)): wrapped per row in Python via map_elements,
+        # with the result dtype taken from the output field.
+        target_pl_type = self.output_label.field._pl_type
+        return df.with_columns(
+            pl.col(src_col)
+            .map_elements(lambda x: [x] if x is not None else None, return_dtype=target_pl_type)
+            .alias(output_col)
+        )
+
     def convert(self, df: pl.DataFrame) -> pl.DataFrame:
         """Convert label data between different multi_label/is_list configurations."""
         input_col = self.input_label.name
@@ -382,55 +443,18 @@ def convert(self, df: pl.DataFrame) -> pl.DataFrame:
 
         # Step 1: handle multi_label conversion (inner dimension)
         if input_multi and not output_multi:
-            raise ConversionError(
-                f"Cannot convert multi-label to single-label for field '{input_col}': "
-                "this would discard all labels except the first. "
-                "Please apply an explicit aggregation transform before converting."
-            )
-        if not input_multi and output_multi:
-            if input_is_list:
-                # List(dtype) → List(List(dtype)): wrap each element in the list
-                df = df.with_columns(pl.col(input_col).list.eval(pl.concat_list(pl.element())).alias(output_col))
-            else:
-                # dtype → List(dtype): wrap the scalar value in a list, preserving nulls
-                df = df.with_columns(
-                    pl.when(pl.col(input_col).is_not_null())
-                    .then(pl.concat_list(pl.col(input_col)))
-                    .otherwise(pl.lit(None, dtype=pl.List(self.input_label.field.dtype)))
-                    .alias(output_col)
-                )
+            df = self._convert_multi_label(df, input_col, output_col, input_is_list)
+        elif not input_multi and output_multi:
+            df = self._expand_multi_label(df, input_col, output_col, input_is_list)
 
         # After step 1 the multi_label dimension matches the output.
         # The effective is_list state is still ``input_is_list``.
 
         # Step 2: handle is_list conversion (outer dimension)
-        # Determine the column to read from (may have been updated in step 1)
-        step2_col = output_col if input_multi != output_multi else input_col
-
-        if input_is_list and not output_is_list:
-            raise ConversionError(
-                f"Cannot convert list to non-list for field '{input_col}': "
-                "this would discard all elements except the first. "
-                "Please apply an explicit aggregation transform before converting."
-            )
-        if not input_is_list and output_is_list:
-            # X → List(X): wrap each sample's value in a 1-element list, preserving nulls
-            if not output_multi:
-                # Scalar → List(scalar): use native concat_list (fast path)
-                df = df.with_columns(
-                    pl.when(pl.col(step2_col).is_not_null())
-                    .then(pl.concat_list(pl.col(step2_col)))
-                    .otherwise(pl.lit(None, dtype=self.output_label.field._pl_type))
-                    .alias(output_col)
-                )
-            else:
-                # List(X) → List(List(X))
-                target_pl_type = self.output_label.field._pl_type
-                df = df.with_columns(
-                    pl.col(step2_col)
-                    .map_elements(lambda x: [x] if x is not None else None, return_dtype=target_pl_type)
-                    .alias(output_col)
-                )
+        if input_is_list != output_is_list:
+            # Determine the column to read from (may have been updated in step 1)
+            step2_col = output_col if input_multi != output_multi else input_col
+            df = self._convert_is_list(df, step2_col, output_col, input_is_list, output_multi)
 
         return df
 

@@ -7,13 +7,17 @@
 simple value field types defined in :mod:`datumaro.experimental.fields.types`.
 """
 
+import logging
+
 import polars as pl
 
-from datumaro.experimental.converters.base import ConversionError, Converter
+from datumaro.experimental.converters.base import Converter
 from datumaro.experimental.converters.registry import converter
 from datumaro.experimental.fields.types import BoolField, NumericField, StringField
 from datumaro.experimental.schema import AttributeSpec
 
+log = logging.getLogger(__name__)
+
 # ---------------------------------------------------------------------------
 # NumericField converters
 # ---------------------------------------------------------------------------
@@ -28,9 +32,7 @@ class NumericFieldShapeConverter(Converter):
 
     Supported conversions:
       - ``dtype → List(dtype)``: wrap scalar in a one-element list
-
-    Unsupported (lossy) conversions:
-      - ``List(dtype) → dtype``: rejected to prevent silent data loss
+      - ``List(dtype) → dtype``: keep the first element only (lossy, logs a warning)
     """
 
     input_numeric: AttributeSpec[NumericField]
@@ -63,11 +65,11 @@ def convert(self, df: pl.DataFrame) -> pl.DataFrame:
         output_is_list = self.output_numeric.field.is_list
 
         if input_is_list and not output_is_list:
-            raise ConversionError(
-                f"Cannot convert list to scalar for numeric field '{input_col}': "
-                "this would discard all elements except the first. "
-                "Please apply an explicit aggregation transform before converting."
+            log.warning(
+                "Converting list to scalar for numeric field '%s': keeping the first element only. Data may be lost.",
+                input_col,
             )
+            df = df.with_columns(pl.col(input_col).list.first().alias(output_col))
         if not input_is_list and output_is_list:
             # dtype → List(dtype): wrap the scalar value in a list, preserving nulls
             df = df.with_columns(
@@ -130,9 +132,7 @@ class BoolFieldShapeConverter(Converter):
 
     Supported conversions:
       - ``Boolean → List(Boolean)``: wrap scalar in a one-element list
-
-    Unsupported (lossy) conversions:
-      - ``List(Boolean) → Boolean``: rejected to prevent silent data loss
+      - ``List(Boolean) → Boolean``: keep the first element only (lossy, logs a warning)
     """
 
     input_bool: AttributeSpec[BoolField]
@@ -164,11 +164,11 @@ def convert(self, df: pl.DataFrame) -> pl.DataFrame:
         output_is_list = self.output_bool.field.is_list
 
         if input_is_list and not output_is_list:
-            raise ConversionError(
-                f"Cannot convert list to scalar for boolean field '{input_col}': "
-                "this would discard all elements except the first. "
-                "Please apply an explicit aggregation transform before converting."
+            log.warning(
+                "Converting list to scalar for boolean field '%s': keeping the first element only. Data may be lost.",
+                input_col,
             )
+            df = df.with_columns(pl.col(input_col).list.first().alias(output_col))
         if not input_is_list and output_is_list:
             # Boolean → List(Boolean): wrap the scalar value in a list, preserving nulls
             df = df.with_columns(
@@ -195,9 +195,7 @@ class StringFieldShapeConverter(Converter):
 
     Supported conversions:
       - ``String → List(String)``: wrap scalar in a one-element list
-
-    Unsupported (lossy) conversions:
-      - ``List(String) → String``: rejected to prevent silent data loss
+      - ``List(String) → String``: keep the first element only (lossy, logs a warning)
     """
 
     input_string: AttributeSpec[StringField]
@@ -229,11 +227,11 @@ def convert(self, df: pl.DataFrame) -> pl.DataFrame:
         output_is_list = self.output_string.field.is_list
 
         if input_is_list and not output_is_list:
-            raise ConversionError(
-                f"Cannot convert list to scalar for string field '{input_col}': "
-                "this would discard all elements except the first. "
-                "Please apply an explicit aggregation transform before converting."
+            log.warning(
+                "Converting list to scalar for string field '%s': keeping the first element only. Data may be lost.",
+                input_col,
             )
+            df = df.with_columns(pl.col(input_col).list.first().alias(output_col))
         if not input_is_list and output_is_list:
             # String → List(String): wrap the scalar value in a list, preserving nulls
             df = df.with_columns(

@@ -823,27 +823,28 @@ def test_label_shape_converter_conversions(
 
 # fmt: off
 @pytest.mark.parametrize(
-    "input_multi, input_is_list, output_multi, output_is_list, data, schema_type, error_match",
+    "input_multi, input_is_list, output_multi, output_is_list, data, schema_type, expected_value, expected_dtype",
     [
-        pytest.param(True,  False, False, False, {"label": [[5, 10, 15]]},            pl.List(pl.UInt32()),          "Cannot convert multi-label to single-label", id="multi_to_single_scalar"),  # noqa: E501
-        pytest.param(True,  True,  False, True,  {"label": [[[5, 10], [15, 20]]]},    pl.List(pl.List(pl.UInt32())), "Cannot convert multi-label to single-label", id="multi_to_single_list"),  # noqa: E501
-        pytest.param(False, True,  False, False, {"label": [[10, 20, 30]]},           pl.List(pl.UInt32()),          "Cannot convert list to non-list",             id="list_to_scalar"),  # noqa: E501
-        pytest.param(True,  True,  True,  False, {"label": [[[1, 2], [3, 4], [5]]]},  pl.List(pl.List(pl.UInt32())), "Cannot convert list to non-list",             id="multi_label_list_to_scalar"),  # noqa: E501
-        pytest.param(True,  True,  False, False, {"label": [[[5, 10], [15, 20]]]},    pl.List(pl.List(pl.UInt32())), "Cannot convert multi-label to single-label", id="both_list_list_to_scalar"),  # noqa: E501
+        pytest.param(True,  False, False, False, {"label": [[5, 10, 15]]},            pl.List(pl.UInt32()),          5,          pl.UInt32(),          id="multi_to_single_scalar"),  # noqa: E501
+        pytest.param(True,  True,  False, True,  {"label": [[[5, 10], [15, 20]]]},    pl.List(pl.List(pl.UInt32())), [5, 15],    pl.List(pl.UInt32()), id="multi_to_single_list"),  # noqa: E501
+        pytest.param(False, True,  False, False, {"label": [[10, 20, 30]]},           pl.List(pl.UInt32()),          10,         pl.UInt32(),          id="list_to_scalar"),  # noqa: E501
+        pytest.param(True,  True,  True,  False, {"label": [[[1, 2], [3, 4], [5]]]},  pl.List(pl.List(pl.UInt32())), [1, 2],     pl.List(pl.UInt32()), id="multi_label_list_to_scalar"),  # noqa: E501
+        pytest.param(True,  True,  False, False, {"label": [[[5, 10], [15, 20]]]},    pl.List(pl.List(pl.UInt32())), 5,          pl.UInt32(),          id="both_list_list_to_scalar"),  # noqa: E501
     ],
 )  # fmt: on  noqa: RUF028
-def test_label_shape_converter_rejects_lossy_conversions(
+def test_label_shape_converter_lossy_conversions(
     input_multi,
     input_is_list,
     output_multi,
     output_is_list,
     data,
     schema_type,
-    error_match,
+    expected_value,
+    expected_dtype,
+    caplog,
 ):
-    """convert() raises ConversionError for lossy multi_label/is_list reductions."""
+    """convert() keeps the first element and logs a warning for lossy multi_label/is_list reductions."""
     from datumaro.experimental.converters import LabelShapeConverter
-    from datumaro.experimental.converters.base import ConversionError
 
     converter_instance = LabelShapeConverter()
 
@@ -857,8 +858,18 @@ def test_label_shape_converter_rejects_lossy_conversions(
     assert converter_instance.filter_output_spec() is True
 
     df = pl.DataFrame(data, schema=pl.Schema({col_name: schema_type}))
-    with pytest.raises(ConversionError, match=error_match):
-        converter_instance.convert(df)
+    with caplog.at_level("WARNING"):
+        result_df = converter_instance.convert(df)
+
+    assert result_df[col_name].dtype == expected_dtype
+    result_val = result_df[col_name][0]
+    if hasattr(result_val, "to_list"):
+        assert result_val.to_list() == expected_value
+    else:
+        assert result_val == expected_value
+
+    # Verify a warning was logged
+    assert any("keeping the first" in msg for msg in caplog.messages)
 
 
 def test_label_shape_converter_filter_returns_false():