Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/datasets/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3848,7 +3848,13 @@ def add_column(self, name: str, column: Union[list, np.array]) -> "IterableDatas
Returns:
`IterableDataset`
"""
return self.map(partial(add_column_fn, name=name, column=column), with_indices=True)
# Preserve existing features and extend them with the new column's inferred type.
# Without this, map() would set info.features=None (its default), losing all schema info.
new_features = None
if self._info.features is not None:
column_features = _infer_features_from_batch({name: list(column)})
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

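Inferring the features from only the first `config.DEFAULT_MAX_BATCH_SIZE` items of the column could help avoid unnecessary data copies, since `_infer_features_from_batch` copies all the data it is given into Arrow.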

Suggested change
column_features = _infer_features_from_batch({name: list(column)})
column_features = _infer_features_from_batch({name: list(column[:config.DEFAULT_MAX_BATCH_SIZE])})

new_features = Features({**self._info.features, **column_features})
return self.map(partial(add_column_fn, name=name, column=column), with_indices=True, features=new_features)

def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDataset":
"""
Expand Down
Loading