-
Notifications
You must be signed in to change notification settings - Fork 0
Add method to generate scaling experiments to PayuManager #46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,12 @@ | |
| from pathlib import Path | ||
|
|
||
| from access.config import YAMLParser | ||
| from access.config.esm1p6_layout_input import ( | ||
| LayoutSearchConfig, | ||
| LayoutTuple, | ||
| generate_esm1p6_core_layouts_from_node_count, | ||
| generate_esm1p6_perturb_block, | ||
| ) | ||
|
|
||
| from access.profiling.cice5_parser import CICE5ProfilingParser | ||
| from access.profiling.cylc_manager import CylcRoseManager | ||
|
|
@@ -19,6 +25,10 @@ | |
| class ESM16Profiling(PayuManager): | ||
| """Handles profiling of ACCESS-ESM1.6 configurations.""" | ||
|
|
||
| @property | ||
| def model_type(self) -> str: | ||
| return "access-esm1.6" | ||
|
|
||
| def get_component_logs(self, path: Path) -> dict[str, ProfilingLog]: | ||
| """Returns available profiling logs for the components in ACCESS-ESM1.6. | ||
|
|
||
|
|
@@ -52,6 +62,16 @@ def get_component_logs(self, path: Path) -> dict[str, ProfilingLog]: | |
|
|
||
| return logs | ||
|
|
||
| def generate_core_layouts_from_node_count( | ||
| self, num_nodes: float, cores_per_node: int, layout_search_config: LayoutSearchConfig | None = None | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know the
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You need to read this in two parts:
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks! I still don't follow why the Regardless, this is just for my curiosity :)
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Scratch that, type-hinting does not actually do runtime checks - so that
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, sorry for not clarifying that part. |
||
| ) -> list: | ||
| return generate_esm1p6_core_layouts_from_node_count( | ||
| num_nodes, cores_per_node, layout_search_config=layout_search_config | ||
| ) | ||
|
|
||
| def generate_perturbation_block(self, layout: LayoutTuple, branch_name_prefix: str) -> dict: | ||
| return generate_esm1p6_perturb_block(layout, branch_name_prefix) | ||
|
|
||
|
|
||
| class RAM3Profiling(CylcRoseManager): | ||
| """Handles profiling of ACCESS-rAM3 configurations.""" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,9 +3,14 @@ | |
|
|
||
| import logging | ||
| from abc import ABC, abstractmethod | ||
| from collections.abc import Callable | ||
| from datetime import timedelta | ||
| from pathlib import Path | ||
|
|
||
| from access.config import YAMLParser | ||
| from access.config.esm1p6_layout_input import LayoutSearchConfig | ||
| from access.config.layout_config import LayoutTuple | ||
| from experiment_generator.experiment_generator import ExperimentGenerator | ||
| from experiment_runner.experiment_runner import ExperimentRunner | ||
|
|
||
| from access.profiling.experiment import ProfilingLog | ||
|
|
@@ -32,6 +37,37 @@ def get_component_logs(self, path: Path) -> dict[str, ProfilingLog]: | |
| dict[str, ProfilingLog]: Dictionary mapping component names to their ProfilingLog instances. | ||
| """ | ||
|
|
||
| @property | ||
| @abstractmethod | ||
| def model_type(self) -> str: | ||
| """Returns the model type identifier, as defined in Payu.""" | ||
|
|
||
| @abstractmethod | ||
| def generate_core_layouts_from_node_count( | ||
| self, | ||
| num_nodes: float, | ||
| cores_per_node: int, | ||
| layout_search_config: LayoutSearchConfig | None = None, | ||
| ) -> list: | ||
| """Generates core layouts from the given number of nodes. | ||
|
|
||
| Args: | ||
| num_nodes (float): Number of nodes. | ||
| cores_per_node (int): Number of cores per node. | ||
| layout_search_config (LayoutSearchConfig | None): Configuration for layout search. | ||
| """ | ||
|
|
||
| @abstractmethod | ||
| def generate_perturbation_block(self, layout: LayoutTuple, branch_name_prefix: str) -> dict: | ||
| """Generates a perturbation block for the given layout to be passed to the experiment generator. | ||
|
|
||
| Args: | ||
| layout (LayoutTuple): Core layout tuple. | ||
| branch_name_prefix (str): Branch name prefix. | ||
| Returns: | ||
| dict: Perturbation block configuration. | ||
| """ | ||
|
|
||
| @property | ||
| def nruns(self) -> int: | ||
| """Returns the number of repetitions for the Payu experiments. | ||
|
|
@@ -70,19 +106,81 @@ def startfrom_restart(self, value: str) -> None: | |
| """ | ||
| self._startfrom_restart = value | ||
|
|
||
| def generate_experiments(self, branches: list[str]) -> None: | ||
| """Generates Payu experiments for profiling data generation. | ||
| def set_control(self, repository, commit) -> None: | ||
| """Sets the control experiment from an existing Payu configuration. | ||
|
|
||
| Args: | ||
| branches (list[str]): List of branches to generate experiments for. | ||
| repository: Git repository URL or path. | ||
| commit: Git commit hash or identifier. | ||
| """ | ||
| self._repository = repository | ||
| self._control_commit = commit | ||
|
|
||
| for branch in branches: | ||
| if branch in self.experiments: | ||
| logger.info(f"Experiment for branch {branch} already exists. Skipping addition.") | ||
| else: | ||
| def generate_scaling_experiments( | ||
| self, | ||
| num_nodes_list: list[float], | ||
| control_options: dict, | ||
| cores_per_node: int, | ||
| tol_around_ctrl_ratio: float, | ||
| max_wasted_ncores_frac: float | Callable[[float], float], | ||
| walltime: float | Callable[[float], float], | ||
| ) -> None: | ||
| """Generates scaling experiments using the ExperimentGenerator. | ||
|
|
||
| Args: | ||
| num_nodes_list (list[int]): List of number of nodes to generate experiments for. | ||
| control_options (dict): Options for the control experiment. | ||
| cores_per_node (int): Number of cores per node. | ||
| tol_around_ctrl_ratio (float): Tolerance around control core ratio for layout generation. | ||
| max_wasted_ncores_frac (float | Callable[[float], float]): Maximum fraction of wasted cores allowed. | ||
| walltime (float | Callable[[float], float]): Walltime in hours for each experiment. | ||
| """ | ||
|
|
||
| generator_config = { | ||
| "model_type": self.model_type, | ||
| "repository_url": self._repository, | ||
| "start_point": self._control_commit, | ||
manodeep marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| "test_path": str(self.work_dir), | ||
| "repository_directory": self._repository_directory, | ||
| "control_branch_name": "ctrl", | ||
| "Control_Experiment": control_options, | ||
| } | ||
|
|
||
| seen_layouts = set() | ||
| seqnum = 1 | ||
| generator_config["Perturbation_Experiment"] = {} | ||
| for num_nodes in num_nodes_list: | ||
| mwf = max_wasted_ncores_frac(num_nodes) if callable(max_wasted_ncores_frac) else max_wasted_ncores_frac | ||
| layout_config = LayoutSearchConfig(tol_around_ctrl_ratio=tol_around_ctrl_ratio, max_wasted_ncores_frac=mwf) | ||
| layouts = self.generate_core_layouts_from_node_count( | ||
| num_nodes, | ||
| cores_per_node=cores_per_node, | ||
| layout_search_config=layout_config, | ||
| ) | ||
| if not layouts: | ||
| logger.warning(f"No layouts found for {num_nodes} nodes") | ||
| continue | ||
|
|
||
| layouts = [x for x in layouts if x not in seen_layouts] | ||
| seen_layouts.update(layouts) | ||
micaeljtoliveira marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| logger.info(f"Generated {len(layouts)} layouts for {num_nodes} nodes. Layouts: {layouts}") | ||
|
|
||
| # TODO: the branch name needs to be simpler and model agnostic | ||
| branch_name = f"layout-unused-cores-to-cice-{layout_config.allocate_unused_cores_to_ice}" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This reminded me - that feature is broken :(
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for creating an issue about it! |
||
| walltime_hrs = walltime(num_nodes) if callable(walltime) else walltime | ||
|
|
||
| for layout in layouts: | ||
| pert_config = self.generate_perturbation_block(layout=layout, branch_name_prefix=branch_name) | ||
| branch = pert_config["branches"][0] | ||
| pert_config["config.yaml"]["walltime"] = str(timedelta(hours=walltime_hrs)) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does the walltime need to be set explicitly? (I remember there was something odd about the walltime in experiment-generator - but may be just add a comment that clarifies the need for walltime setting)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The walltime is a value set in the payu config (config.yaml), so here we are letting the user set it differently for each perturbation by providing a function. You had a similar logic in the ESM1.6 scaling notebook, but were directly replacing the text of the yaml block: walltime_hrs += nblocks_added * (1.5 * 4.0/num_nodes) # use a 1.5 hrs time for 4-node runs, and then scale linearly
entire_block += block
entire_block = entire_block.replace("walltime: Inf", f"walltime: {int(walltime_hrs)}:00:00")
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To be clear, one could just pass it as part of the
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure why janky walltime increasing logic with the number of scaling experiments actually worked 😅 - might have had to edit by hand afterwards
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hopefully the new implementation will be less janky 😄 |
||
|
|
||
| generator_config["Perturbation_Experiment"][f"Experiment_{seqnum}"] = pert_config | ||
| self.experiments[branch] = ProfilingExperiment(self.work_dir / branch / self._repository_directory) | ||
|
|
||
| seqnum += 1 | ||
|
|
||
| ExperimentGenerator(generator_config).run() | ||
manodeep marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| def run_experiments(self) -> None: | ||
| """Runs Payu experiments for profiling data generation.""" | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.