Skip to content

Commit 6a612ef

Browse files
authored
Enable stream exporters: VOC, YOLO, Datumaro, and COCO data format (#1102)
- Ticket no. 114762

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>
1 parent 32fa404 commit 6a612ef

17 files changed

Lines changed: 469 additions & 134 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
5353
- Migrate DVC v3.0.0
5454
(<https://github.com/openvinotoolkit/datumaro/pull/1072>)
5555
- Stream dataset import/export
56-
(<https://github.com/openvinotoolkit/datumaro/pull/1077>, <https://github.com/openvinotoolkit/datumaro/pull/1081>, <https://github.com/openvinotoolkit/datumaro/pull/1082>, <https://github.com/openvinotoolkit/datumaro/pull/1091>, <https://github.com/openvinotoolkit/datumaro/pull/1093>, <https://github.com/openvinotoolkit/datumaro/pull/1098>)
56+
(<https://github.com/openvinotoolkit/datumaro/pull/1077>, <https://github.com/openvinotoolkit/datumaro/pull/1081>, <https://github.com/openvinotoolkit/datumaro/pull/1082>, <https://github.com/openvinotoolkit/datumaro/pull/1091>, <https://github.com/openvinotoolkit/datumaro/pull/1093>, <https://github.com/openvinotoolkit/datumaro/pull/1098>, <https://github.com/openvinotoolkit/datumaro/pull/1102>)
5757
- Support mask annotations for CVAT data format
5858
(<https://github.com/openvinotoolkit/datumaro/pull/1078>)
5959

src/datumaro/components/dataset.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,7 @@ def export(
587587

588588
assert "ctx" not in kwargs
589589
exporter_kwargs = copy(kwargs)
590+
exporter_kwargs["stream"] = self._stream
590591
exporter_kwargs["ctx"] = ExportContext(
591592
progress_reporter=progress_reporter, error_policy=error_policy
592593
)
@@ -632,7 +633,8 @@ def export(
632633
raise e.__cause__
633634

634635
self.bind(save_dir, format, options=copy(kwargs))
635-
self.flush_changes()
636+
if not self._stream:
637+
self.flush_changes()
636638

637639
def save(self, save_dir: Optional[str] = None, **kwargs) -> None:
638640
options = dict(self._options)

src/datumaro/components/dataset_storage.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -612,9 +612,9 @@ def __init__(
612612
):
613613
if not source.is_stream:
614614
raise ValueError("source should be a stream.")
615-
super().__init__(source, infos, categories, media_type)
616-
self._subset_names = None
615+
self._subset_names = list(source.subsets().keys())
617616
self._transform_ids_for_latest_subset_names = []
617+
super().__init__(source, infos, categories, media_type)
618618

619619
def is_cache_initialized(self) -> bool:
620620
log.debug("This function has no effect on streaming.")
@@ -660,12 +660,9 @@ def get_subset(self, name: str) -> IDataset:
660660

661661
@property
662662
def subset_names(self):
663-
if self._subset_names is None:
664-
self._subset_names = {item.subset for item in self}
665-
self._transforms_for_latest_subset_names = [id(t) for t in self._transforms]
666-
elif self._transforms_for_latest_subset_names != [id(t) for t in self._transforms]:
663+
if self._transform_ids_for_latest_subset_names != [id(t) for t in self._transforms]:
667664
self._subset_names = {item.subset for item in self}
668-
self._transforms_for_latest_subset_names = [id(t) for t in self._transforms]
665+
self._transform_ids_for_latest_subset_names = [id(t) for t in self._transforms]
669666

670667
return self._subset_names
671668

src/datumaro/components/exporter.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def __init__(
182182
default_image_ext: Optional[str] = None,
183183
save_dataset_meta: bool = False,
184184
save_hashkey_meta: bool = False,
185+
stream: bool = False,
185186
ctx: Optional[ExportContext] = None,
186187
):
187188
default_image_ext = default_image_ext or self.DEFAULT_IMAGE_EXT
@@ -222,6 +223,12 @@ def __init__(
222223
else:
223224
self._patch = None
224225

226+
if stream and not self.can_stream:
227+
raise DatasetExportError(
228+
f"{self.__class__.__name__} cannot export a dataset in a stream manner"
229+
)
230+
self._stream = stream
231+
225232
self._ctx: ExportContext = ctx or NullExportContext()
226233

227234
def _find_image_ext(self, item: Union[DatasetItem, Image]):
@@ -299,6 +306,11 @@ def _check_hash_key_existence(self, item):
299306
self._save_hashkey_meta = True
300307
return
301308

309+
@property
310+
def can_stream(self) -> bool:
311+
"""Flag to indicate whether the exporter can export the dataset in a stream manner or not."""
312+
return False
313+
302314

303315
# TODO: Currently, ExportContextComponent is introduced only for Datumaro and DatumaroBinary format
304316
# for multi-processing. We need to propagate this to everywhere in Datumaro 1.2.0

src/datumaro/plugins/data_formats/arrow/exporter.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,13 @@ def __init__(
5858
num_shards: int = 1,
5959
max_shard_size: Optional[int] = None,
6060
):
61-
super().__init__(context, "", export_context)
61+
super().__init__(
62+
context=context,
63+
subset=subset,
64+
ann_file="",
65+
export_context=export_context,
66+
)
6267
self._schema = deepcopy(DatumaroArrow.SCHEMA)
63-
self._subset = subset
6468
self._writers = []
6569
self._fnames = []
6670
self._max_chunk_size = max_chunk_size
@@ -370,6 +374,7 @@ def __init__(
370374
num_shards: int = 1,
371375
max_shard_size: Optional[int] = None,
372376
max_chunk_size: int = 1000,
377+
**kwargs,
373378
):
374379
super().__init__(
375380
extractor=extractor,

0 commit comments

Comments
 (0)