Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions .github/workflows/perf-accuracy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ on:
type: choice
description: Model type to run benchmark
options:
- default # speed, balance, accuracy models only
- all # default + other models
- default # speed, balance, accuracy models only
- all # default + other models
default: default
data-size:
type: choice
description: Dataset size to run benchmark
options:
- small
- medium
- large
- all
- small
- medium
- large
- all
default: all
num-repeat:
description: Overrides default per-data-size number of repeat setting
Expand All @@ -29,9 +29,9 @@ on:
type: choice
description: The last operation to evaluate. 'optimize' means all.
options:
- train
- export
- optimize
- train
- export
- optimize
default: optimize

jobs:
Expand Down
18 changes: 9 additions & 9 deletions .github/workflows/perf-speed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ on:
type: choice
description: Model type to run benchmark
options:
- default # speed, balance, accuracy models only
- all # default + other models
- default # speed, balance, accuracy models only
- all # default + other models
default: default
data-size:
type: choice
description: Dataset size to run benchmark
options:
- small
- medium
- large
- all
- small
- medium
- large
- all
default: large
num-repeat:
description: Overrides default per-data-size number of repeat setting
Expand All @@ -29,9 +29,9 @@ on:
type: choice
description: The last operation to evaluate. 'optimize' means all.
options:
- train
- export
- optimize
- train
- export
- optimize
default: optimize

jobs:
Expand Down
10 changes: 7 additions & 3 deletions src/otx/cli/tools/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,15 +286,19 @@ def train(exit_stack: Optional[ExitStack] = None): # pylint: disable=too-many-b

resource_tracker = None
if args.track_resource_usage and not is_multigpu_child_process():
resource_tracker = ResourceTracker(args.track_resource_usage, args.gpus)
resource_tracker = ResourceTracker(
config_manager.output_path / "resource_usage.yaml", args.track_resource_usage, args.gpus
)
resource_tracker.start()
if exit_stack is not None:
exit_stack.callback(resource_tracker.stop)

task.train(
dataset, output_model, train_parameters=TrainParameters(), seed=args.seed, deterministic=args.deterministic
)

if resource_tracker is not None:
resource_tracker.stop(config_manager.output_path / "resource_usage.yaml")
if resource_tracker is not None and exit_stack is None:
resource_tracker.stop()

model_path = config_manager.output_path / "models"
save_model_data(output_model, str(model_path))
Expand Down
19 changes: 8 additions & 11 deletions src/otx/cli/utils/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,16 @@ class ResourceTracker:
"""Class to track resources usage.

Args:
output_path (Union[str, Path]): Output file path to save CPU & GPU utilization and max memory usage values.
resource_type (str, optional): Which resource to track. Available values are cpu, gpu or all now.
Defaults to "all".
gpu_ids (Optional[str]): GPU indices to record.
"""

def __init__(self, resource_type: str = "all", gpu_ids: Optional[str] = None):
def __init__(self, output_path: Union[str, Path], resource_type: str = "all", gpu_ids: Optional[str] = None):
if isinstance(output_path, str):
output_path = Path(output_path)
self.output_path = output_path
if resource_type == "all":
self._resource_type = AVAILABLE_RESOURCE_TYPE
else:
Expand All @@ -62,19 +66,12 @@ def start(self):
)
self._mem_check_proc.start()

def stop(self, output_path: Union[str, Path]):
"""Terminate a process to record resources usage.

Args:
output_path (Union[str, Path]): Output file path to save CPU & GPU utilization and max memory usage values.
"""
def stop(self):
"""Terminate a process to record resources usage."""
if self._mem_check_proc is None or not self._mem_check_proc.is_alive():
return

if isinstance(output_path, str):
output_path = Path(output_path)

self._queue.put(output_path)
self._queue.put(self.output_path)
self._mem_check_proc.join(10)
if self._mem_check_proc.exitcode is None:
self._mem_check_proc.terminate()
Expand Down
21 changes: 11 additions & 10 deletions tests/unit/cli/utils/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ def _set_up(self, mocker):
@e2e_pytest_unit
@pytest.mark.parametrize("resource_type", ("cpu", "gpu", "all", "cpu,gpu"))
@pytest.mark.parametrize("gpu_ids", (None, "0", "0,3"))
def test_init(self, resource_type, gpu_ids):
ResourceTracker(resource_type, gpu_ids)
@pytest.mark.parametrize("output_path", ("fake", Path("fake")))
def test_init(self, output_path, resource_type, gpu_ids):
ResourceTracker(output_path, resource_type, gpu_ids)

@e2e_pytest_unit
@pytest.mark.parametrize("resource_type", ("cpu", "gpu", "all", "cpu,gpu"))
Expand All @@ -41,7 +42,7 @@ def test_start(self, resource_type, gpu_ids):
expected_gpu_ids[0] = 0

# run
resource_tracker = ResourceTracker(resource_type, gpu_ids)
resource_tracker = ResourceTracker("fake_output", resource_type, gpu_ids)
resource_tracker.start()

self.mock_proc.start.assert_called_once() # check that a process to track resource usages starts
Expand All @@ -51,7 +52,7 @@ def test_start(self, resource_type, gpu_ids):

@e2e_pytest_unit
def test_start_multiple_times(self):
resource_tracker = ResourceTracker()
resource_tracker = ResourceTracker("fake_output")

# run multiple times
resource_tracker.start()
Expand All @@ -63,9 +64,9 @@ def test_start_multiple_times(self):
def test_stop(self):
output_path = Path("fake")

resource_tracker = ResourceTracker()
resource_tracker = ResourceTracker(output_path)
resource_tracker.start()
resource_tracker.stop(output_path)
resource_tracker.stop()

# check that code to terminate a process is executed properly
self.mock_queue.put.assert_called_once_with(output_path)
Expand All @@ -77,9 +78,9 @@ def test_stop_not_exit_normally(self):
output_path = Path("fake")
self.mock_proc.exitcode = None

resource_tracker = ResourceTracker()
resource_tracker = ResourceTracker(output_path)
resource_tracker.start()
resource_tracker.stop(output_path)
resource_tracker.stop()

# check that code to terminate a process is executed properly
self.mock_queue.put.assert_called_once_with(output_path)
Expand All @@ -90,8 +91,8 @@ def test_stop_not_exit_normally(self):

@e2e_pytest_unit
def test_stop_before_start(self):
resource_tracker = ResourceTracker()
resource_tracker.stop("fake")
resource_tracker = ResourceTracker("fake")
resource_tracker.stop()

# check that code to make a process done isn't called
self.mock_queue.put.assert_not_called()
Expand Down