Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 71 additions & 47 deletions src/datumaro/experimental/converters/annotation_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
#
# SPDX-License-Identifier: MIT

import logging

import polars as pl

from datumaro.experimental.categories import LabelCategories
from datumaro.experimental.converters.base import ConversionError, Converter, list_eval_ref
from datumaro.experimental.converters.base import Converter, list_eval_ref
from datumaro.experimental.converters.registry import converter
from datumaro.experimental.fields.annotations import (
BBoxField,
Expand All @@ -18,6 +20,8 @@
from datumaro.experimental.fields.images import ImageField
from datumaro.experimental.schema import AttributeSpec

log = logging.getLogger(__name__)


@converter
class LabelIndexConverter(Converter):
Expand Down Expand Up @@ -336,8 +340,8 @@ class LabelShapeConverter(Converter):
- ``is_list=True, multi_label=False`` ↔ ``is_list=False, multi_label=True``:
both map to ``List(dtype)``; the conversion is a no-op on the data.

Unsupported (lossy) conversions:
- ``List(dtype) → dtype`` (multi_label or is_list reduction): rejected to prevent silent data loss
Lossy conversions (keep first element, logs a warning):
- ``List(dtype) → dtype`` (multi_label or is_list reduction): keeps the first element only
"""

input_label: AttributeSpec[LabelField]
Expand All @@ -359,6 +363,63 @@ def filter_output_spec(self) -> bool:

return input_field.multi_label != output_field.multi_label or input_field.is_list != output_field.is_list

def _convert_multi_label(
    self, df: pl.DataFrame, input_col: str, output_col: str, input_is_list: bool
) -> pl.DataFrame:
    """Collapse the multi-label (inner) dimension by keeping only the first label.

    The reduction is lossy, so a warning naming ``input_col`` is logged on every call.

    Args:
        df: Frame holding the column to reduce.
        input_col: Column the multi-label values are read from.
        output_col: Name under which the reduced column is stored.
        input_is_list: Whether ``input_col`` also carries an outer list
            dimension on top of the multi-label one.

    Returns:
        The frame returned by ``df.with_columns`` with the reduced values
        aliased to ``output_col``.
    """
    log.warning(
        "Converting multi-label to single-label for field '%s': keeping the first label only. Data may be lost.",
        input_col,
    )
    source = pl.col(input_col)
    if input_is_list:
        # List(List(dtype)) → List(dtype): first label of every inner list
        reduced = source.list.eval(pl.element().list.first())
    else:
        # List(dtype) → dtype: first label of the list
        reduced = source.list.first()
    return df.with_columns(reduced.alias(output_col))

def _expand_multi_label(
    self, df: pl.DataFrame, input_col: str, output_col: str, input_is_list: bool
) -> pl.DataFrame:
    """Add the multi-label (inner) dimension by wrapping each label in a list.

    Args:
        df: Frame holding the column to expand.
        input_col: Column the single-label values are read from.
        output_col: Name under which the expanded column is stored.
        input_is_list: Whether ``input_col`` already carries an outer list
            dimension; if so, each element of that list is wrapped individually.

    Returns:
        The frame returned by ``df.with_columns`` with the wrapped values
        aliased to ``output_col``.
    """
    source = pl.col(input_col)
    if not input_is_list:
        # dtype → List(dtype): a null row stays null rather than becoming a list
        null_list = pl.lit(None, dtype=pl.List(self.input_label.field.dtype))
        wrapped = pl.when(source.is_not_null()).then(pl.concat_list(source)).otherwise(null_list)
    else:
        # List(dtype) → List(List(dtype)): wrap every element of the outer list
        wrapped = source.list.eval(pl.concat_list(pl.element()))
    return df.with_columns(wrapped.alias(output_col))

def _convert_is_list(
self, df: pl.DataFrame, src_col: str, output_col: str, input_is_list: bool, output_multi: bool
) -> pl.DataFrame:
"""Handle is_list conversion (outer dimension)."""
if input_is_list:
log.warning(
"Converting list to non-list for field '%s': keeping the first element only. Data may be lost.",
self.input_label.name,
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In _convert_is_list, the warning logs self.input_label.name rather than the column actually being reduced (src_col / step2_col). When multi_label conversion has already written into output_col and step 2 reads from that, the warning can reference the wrong field/column name, which is confusing for users debugging data loss. Consider logging src_col (or the original input_col passed into convert) for consistency with _convert_multi_label.

Suggested change
- self.input_label.name,
+ src_col,

Copilot uses AI. Check for mistakes.
)
# List(X) → X: take the first element
return df.with_columns(pl.col(src_col).list.first().alias(output_col))
# X → List(X): wrap each sample's value in a 1-element list, preserving nulls
if not output_multi:
# Scalar → List(scalar): use native concat_list (fast path)
return df.with_columns(
pl.when(pl.col(src_col).is_not_null())
.then(pl.concat_list(pl.col(src_col)))
.otherwise(pl.lit(None, dtype=self.output_label.field._pl_type))
.alias(output_col)
)
# List(X) → List(List(X))
target_pl_type = self.output_label.field._pl_type
return df.with_columns(
pl.col(src_col)
.map_elements(lambda x: [x] if x is not None else None, return_dtype=target_pl_type)
.alias(output_col)
)

def convert(self, df: pl.DataFrame) -> pl.DataFrame:
"""Convert label data between different multi_label/is_list configurations."""
input_col = self.input_label.name
Expand All @@ -382,55 +443,18 @@ def convert(self, df: pl.DataFrame) -> pl.DataFrame:

# Step 1: handle multi_label conversion (inner dimension)
if input_multi and not output_multi:
raise ConversionError(
f"Cannot convert multi-label to single-label for field '{input_col}': "
"this would discard all labels except the first. "
"Please apply an explicit aggregation transform before converting."
)
if not input_multi and output_multi:
if input_is_list:
# List(dtype) → List(List(dtype)): wrap each element in the list
df = df.with_columns(pl.col(input_col).list.eval(pl.concat_list(pl.element())).alias(output_col))
else:
# dtype → List(dtype): wrap the scalar value in a list, preserving nulls
df = df.with_columns(
pl.when(pl.col(input_col).is_not_null())
.then(pl.concat_list(pl.col(input_col)))
.otherwise(pl.lit(None, dtype=pl.List(self.input_label.field.dtype)))
.alias(output_col)
)
df = self._convert_multi_label(df, input_col, output_col, input_is_list)
elif not input_multi and output_multi:
df = self._expand_multi_label(df, input_col, output_col, input_is_list)

# After step 1 the multi_label dimension matches the output.
# The effective is_list state is still ``input_is_list``.

# Step 2: handle is_list conversion (outer dimension)
# Determine the column to read from (may have been updated in step 1)
step2_col = output_col if input_multi != output_multi else input_col

if input_is_list and not output_is_list:
raise ConversionError(
f"Cannot convert list to non-list for field '{input_col}': "
"this would discard all elements except the first. "
"Please apply an explicit aggregation transform before converting."
)
if not input_is_list and output_is_list:
# X → List(X): wrap each sample's value in a 1-element list, preserving nulls
if not output_multi:
# Scalar → List(scalar): use native concat_list (fast path)
df = df.with_columns(
pl.when(pl.col(step2_col).is_not_null())
.then(pl.concat_list(pl.col(step2_col)))
.otherwise(pl.lit(None, dtype=self.output_label.field._pl_type))
.alias(output_col)
)
else:
# List(X) → List(List(X))
target_pl_type = self.output_label.field._pl_type
df = df.with_columns(
pl.col(step2_col)
.map_elements(lambda x: [x] if x is not None else None, return_dtype=target_pl_type)
.alias(output_col)
)
if input_is_list != output_is_list:
# Determine the column to read from (may have been updated in step 1)
step2_col = output_col if input_multi != output_multi else input_col
df = self._convert_is_list(df, step2_col, output_col, input_is_list, output_multi)

return df

Expand Down
42 changes: 20 additions & 22 deletions src/datumaro/experimental/converters/type_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
simple value field types defined in :mod:`datumaro.experimental.fields.types`.
"""

import logging

import polars as pl

from datumaro.experimental.converters.base import ConversionError, Converter
from datumaro.experimental.converters.base import Converter
from datumaro.experimental.converters.registry import converter
from datumaro.experimental.fields.types import BoolField, NumericField, StringField
from datumaro.experimental.schema import AttributeSpec

log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# NumericField converters
# ---------------------------------------------------------------------------
Expand All @@ -28,9 +32,7 @@ class NumericFieldShapeConverter(Converter):

Supported conversions:
- ``dtype → List(dtype)``: wrap scalar in a one-element list

Unsupported (lossy) conversions:
- ``List(dtype) → dtype``: rejected to prevent silent data loss
- ``List(dtype) → dtype``: keep the first element only (lossy, logs a warning)
"""

input_numeric: AttributeSpec[NumericField]
Expand Down Expand Up @@ -63,11 +65,11 @@ def convert(self, df: pl.DataFrame) -> pl.DataFrame:
output_is_list = self.output_numeric.field.is_list

if input_is_list and not output_is_list:
raise ConversionError(
f"Cannot convert list to scalar for numeric field '{input_col}': "
"this would discard all elements except the first. "
"Please apply an explicit aggregation transform before converting."
log.warning(
"Converting list to scalar for numeric field '%s': keeping the first element only. Data may be lost.",
input_col,
)
df = df.with_columns(pl.col(input_col).list.first().alias(output_col))
if not input_is_list and output_is_list:
# dtype → List(dtype): wrap the scalar value in a list, preserving nulls
df = df.with_columns(
Expand Down Expand Up @@ -130,9 +132,7 @@ class BoolFieldShapeConverter(Converter):

Supported conversions:
- ``Boolean → List(Boolean)``: wrap scalar in a one-element list

Unsupported (lossy) conversions:
- ``List(Boolean) → Boolean``: rejected to prevent silent data loss
- ``List(Boolean) → Boolean``: keep the first element only (lossy, logs a warning)
"""

input_bool: AttributeSpec[BoolField]
Expand Down Expand Up @@ -164,11 +164,11 @@ def convert(self, df: pl.DataFrame) -> pl.DataFrame:
output_is_list = self.output_bool.field.is_list

if input_is_list and not output_is_list:
raise ConversionError(
f"Cannot convert list to scalar for boolean field '{input_col}': "
"this would discard all elements except the first. "
"Please apply an explicit aggregation transform before converting."
log.warning(
"Converting list to scalar for boolean field '%s': keeping the first element only. Data may be lost.",
input_col,
)
df = df.with_columns(pl.col(input_col).list.first().alias(output_col))
if not input_is_list and output_is_list:
# Boolean → List(Boolean): wrap the scalar value in a list, preserving nulls
df = df.with_columns(
Expand All @@ -195,9 +195,7 @@ class StringFieldShapeConverter(Converter):

Supported conversions:
- ``String → List(String)``: wrap scalar in a one-element list

Unsupported (lossy) conversions:
- ``List(String) → String``: rejected to prevent silent data loss
- ``List(String) → String``: keep the first element only (lossy, logs a warning)
"""

input_string: AttributeSpec[StringField]
Expand Down Expand Up @@ -229,11 +227,11 @@ def convert(self, df: pl.DataFrame) -> pl.DataFrame:
output_is_list = self.output_string.field.is_list

if input_is_list and not output_is_list:
raise ConversionError(
f"Cannot convert list to scalar for string field '{input_col}': "
"this would discard all elements except the first. "
"Please apply an explicit aggregation transform before converting."
log.warning(
"Converting list to scalar for string field '%s': keeping the first element only. Data may be lost.",
input_col,
)
df = df.with_columns(pl.col(input_col).list.first().alias(output_col))
if not input_is_list and output_is_list:
# String → List(String): wrap the scalar value in a list, preserving nulls
df = df.with_columns(
Expand Down
35 changes: 23 additions & 12 deletions tests/unit/experimental/converters/test_annotation_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,27 +823,28 @@ def test_label_shape_converter_conversions(

# fmt: off
@pytest.mark.parametrize(
"input_multi, input_is_list, output_multi, output_is_list, data, schema_type, error_match",
"input_multi, input_is_list, output_multi, output_is_list, data, schema_type, expected_value, expected_dtype",
[
pytest.param(True, False, False, False, {"label": [[5, 10, 15]]}, pl.List(pl.UInt32()), "Cannot convert multi-label to single-label", id="multi_to_single_scalar"), # noqa: E501
pytest.param(True, True, False, True, {"label": [[[5, 10], [15, 20]]]}, pl.List(pl.List(pl.UInt32())), "Cannot convert multi-label to single-label", id="multi_to_single_list"), # noqa: E501
pytest.param(False, True, False, False, {"label": [[10, 20, 30]]}, pl.List(pl.UInt32()), "Cannot convert list to non-list", id="list_to_scalar"), # noqa: E501
pytest.param(True, True, True, False, {"label": [[[1, 2], [3, 4], [5]]]}, pl.List(pl.List(pl.UInt32())), "Cannot convert list to non-list", id="multi_label_list_to_scalar"), # noqa: E501
pytest.param(True, True, False, False, {"label": [[[5, 10], [15, 20]]]}, pl.List(pl.List(pl.UInt32())), "Cannot convert multi-label to single-label", id="both_list_list_to_scalar"), # noqa: E501
pytest.param(True, False, False, False, {"label": [[5, 10, 15]]}, pl.List(pl.UInt32()), 5, pl.UInt32(), id="multi_to_single_scalar"), # noqa: E501
pytest.param(True, True, False, True, {"label": [[[5, 10], [15, 20]]]}, pl.List(pl.List(pl.UInt32())), [5, 15], pl.List(pl.UInt32()), id="multi_to_single_list"), # noqa: E501
pytest.param(False, True, False, False, {"label": [[10, 20, 30]]}, pl.List(pl.UInt32()), 10, pl.UInt32(), id="list_to_scalar"), # noqa: E501
pytest.param(True, True, True, False, {"label": [[[1, 2], [3, 4], [5]]]}, pl.List(pl.List(pl.UInt32())), [1, 2], pl.List(pl.UInt32()), id="multi_label_list_to_scalar"), # noqa: E501
pytest.param(True, True, False, False, {"label": [[[5, 10], [15, 20]]]}, pl.List(pl.List(pl.UInt32())), 5, pl.UInt32(), id="both_list_list_to_scalar"), # noqa: E501
],
) # fmt: on noqa: RUF028
def test_label_shape_converter_rejects_lossy_conversions(
def test_label_shape_converter_lossy_conversions(
input_multi,
input_is_list,
output_multi,
output_is_list,
data,
schema_type,
error_match,
expected_value,
expected_dtype,
caplog,
):
"""convert() raises ConversionError for lossy multi_label/is_list reductions."""
"""convert() keeps the first element and logs a warning for lossy multi_label/is_list reductions."""
from datumaro.experimental.converters import LabelShapeConverter
from datumaro.experimental.converters.base import ConversionError

converter_instance = LabelShapeConverter()

Expand All @@ -857,8 +858,18 @@ def test_label_shape_converter_rejects_lossy_conversions(
assert converter_instance.filter_output_spec() is True

df = pl.DataFrame(data, schema=pl.Schema({col_name: schema_type}))
with pytest.raises(ConversionError, match=error_match):
converter_instance.convert(df)
with caplog.at_level("WARNING"):
result_df = converter_instance.convert(df)

assert result_df[col_name].dtype == expected_dtype
result_val = result_df[col_name][0]
if hasattr(result_val, "to_list"):
assert result_val.to_list() == expected_value
else:
assert result_val == expected_value

# Verify a warning was logged
assert any("keeping the first" in msg for msg in caplog.messages)


def test_label_shape_converter_filter_returns_false():
Expand Down
Loading
Loading