Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies = [
"matplotlib",
"access-config-utils",
"experiment-runner",
"experiment-generator",
"payu", # Extra dependency of experiment-runner, not handled correctly in conda
]

Expand Down
20 changes: 20 additions & 0 deletions src/access/profiling/access_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
from pathlib import Path

from access.config import YAMLParser
from access.config.esm1p6_layout_input import (
LayoutSearchConfig,
LayoutTuple,
generate_esm1p6_core_layouts_from_node_count,
generate_esm1p6_perturb_block,
)

from access.profiling.cice5_parser import CICE5ProfilingParser
from access.profiling.cylc_manager import CylcRoseManager
Expand All @@ -19,6 +25,10 @@
class ESM16Profiling(PayuManager):
"""Handles profiling of ACCESS-ESM1.6 configurations."""

@property
def model_type(self) -> str:
return "access-esm1.6"

def get_component_logs(self, path: Path) -> dict[str, ProfilingLog]:
"""Returns available profiling logs for the components in ACCESS-ESM1.6.

Expand Down Expand Up @@ -52,6 +62,16 @@ def get_component_logs(self, path: Path) -> dict[str, ProfilingLog]:

return logs

def generate_core_layouts_from_node_count(
self, num_nodes: float, cores_per_node: int, layout_search_config: LayoutSearchConfig | None = None
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know the | None = None - does that mean allow "None" as an input, and then pass that "None" to the routine? (Basically, trying to figure out what the = None bit is doing)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to read this in two parts:
layout_search_config: LayoutSearchConfig | None: parameter is of type LayoutSearchConfing or None.
layout_search_config = None: the usual way of providing a default value.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I still don't follow why the =None is necessary. Either the parameter is passed as an instance of LayoutSearchConfig or it is passed as None

Regardless, this is just for my curiosity :)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Scratch that, type-hinting does not actually do runtime checks - so that =None would be required

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, sorry for not clarifying that part.

) -> list:
return generate_esm1p6_core_layouts_from_node_count(
num_nodes, cores_per_node, layout_search_config=layout_search_config
)

def generate_perturbation_block(self, layout: LayoutTuple, branch_name_prefix: str) -> dict:
return generate_esm1p6_perturb_block(layout, branch_name_prefix)


class RAM3Profiling(CylcRoseManager):
"""Handles profiling of ACCESS-rAM3 configurations."""
Expand Down
112 changes: 105 additions & 7 deletions src/access/profiling/payu_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@

import logging
from abc import ABC, abstractmethod
from collections.abc import Callable
from datetime import timedelta
from pathlib import Path

from access.config import YAMLParser
from access.config.esm1p6_layout_input import LayoutSearchConfig
from access.config.layout_config import LayoutTuple
from experiment_generator.experiment_generator import ExperimentGenerator
from experiment_runner.experiment_runner import ExperimentRunner

from access.profiling.experiment import ProfilingLog
Expand All @@ -32,6 +37,37 @@ def get_component_logs(self, path: Path) -> dict[str, ProfilingLog]:
dict[str, ProfilingLog]: Dictionary mapping component names to their ProfilingLog instances.
"""

@property
@abstractmethod
def model_type(self) -> str:
"""Returns the model type identifier, as defined in Payu."""

@abstractmethod
def generate_core_layouts_from_node_count(
self,
num_nodes: float,
cores_per_node: int,
layout_search_config: LayoutSearchConfig | None = None,
) -> list:
"""Generates core layouts from the given number of nodes.

Args:
num_nodes (float): Number of nodes.
cores_per_node (int): Number of cores per node.
layout_search_config (LayoutSearchConfig | None): Configuration for layout search.
"""

@abstractmethod
def generate_perturbation_block(self, layout: LayoutTuple, branch_name_prefix: str) -> dict:
"""Generates a perturbation block for the given layout to be passed to the experiment generator.

Args:
layout (LayoutTuple): Core layout tuple.
branch_name_prefix (str): Branch name prefix.
Returns:
dict: Perturbation block configuration.
"""

@property
def nruns(self) -> int:
"""Returns the number of repetitions for the Payu experiments.
Expand Down Expand Up @@ -70,19 +106,81 @@ def startfrom_restart(self, value: str) -> None:
"""
self._startfrom_restart = value

def generate_experiments(self, branches: list[str]) -> None:
"""Generates Payu experiments for profiling data generation.
def set_control(self, repository, commit) -> None:
"""Sets the control experiment from an existing Payu configuration.

Args:
branches (list[str]): List of branches to generate experiments for.
repository: Git repository URL or path.
commit: Git commit hash or identifier.
"""
self._repository = repository
self._control_commit = commit

for branch in branches:
if branch in self.experiments:
logger.info(f"Experiment for branch {branch} already exists. Skipping addition.")
else:
def generate_scaling_experiments(
self,
num_nodes_list: list[float],
control_options: dict,
cores_per_node: int,
tol_around_ctrl_ratio: float,
max_wasted_ncores_frac: float | Callable[[float], float],
walltime: float | Callable[[float], float],
) -> None:
"""Generates scaling experiments using the ExperimentGenerator.

Args:
num_nodes_list (list[int]): List of number of nodes to generate experiments for.
control_options (dict): Options for the control experiment.
cores_per_node (int): Number of cores per node.
tol_around_ctrl_ratio (float): Tolerance around control core ratio for layout generation.
max_wasted_ncores_frac (float | Callable[[float], float]): Maximum fraction of wasted cores allowed.
walltime (float | Callable[[float], float]): Walltime in hours for each experiment.
"""

generator_config = {
"model_type": self.model_type,
"repository_url": self._repository,
"start_point": self._control_commit,
"test_path": str(self.work_dir),
"repository_directory": self._repository_directory,
"control_branch_name": "ctrl",
"Control_Experiment": control_options,
}

seen_layouts = set()
seqnum = 1
generator_config["Perturbation_Experiment"] = {}
for num_nodes in num_nodes_list:
mwf = max_wasted_ncores_frac(num_nodes) if callable(max_wasted_ncores_frac) else max_wasted_ncores_frac
layout_config = LayoutSearchConfig(tol_around_ctrl_ratio=tol_around_ctrl_ratio, max_wasted_ncores_frac=mwf)
layouts = self.generate_core_layouts_from_node_count(
num_nodes,
cores_per_node=cores_per_node,
layout_search_config=layout_config,
)
if not layouts:
logger.warning(f"No layouts found for {num_nodes} nodes")
continue

layouts = [x for x in layouts if x not in seen_layouts]
seen_layouts.update(layouts)
logger.info(f"Generated {len(layouts)} layouts for {num_nodes} nodes. Layouts: {layouts}")

# TODO: the branch name needs to be simpler and model agnostic
branch_name = f"layout-unused-cores-to-cice-{layout_config.allocate_unused_cores_to_ice}"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This reminded me - that feature is broken :(

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for creating an issue about it!

walltime_hrs = walltime(num_nodes) if callable(walltime) else walltime

for layout in layouts:
pert_config = self.generate_perturbation_block(layout=layout, branch_name_prefix=branch_name)
branch = pert_config["branches"][0]
pert_config["config.yaml"]["walltime"] = str(timedelta(hours=walltime_hrs))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does the walltime need to be set explicitly?

(I remember there was something odd about the walltime in experiment-generator - but may be just add a comment that clarifies the need for walltime setting)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The walltime is a value set in the payu config (config.yaml), so here we are letting the user set it differently for each perturbation by providing a function.

You had a similar logic in the ESM1.6 scaling notebook, but were directly replacing the text of the yaml block:

        walltime_hrs += nblocks_added * (1.5 * 4.0/num_nodes) # use a 1.5 hrs time for 4-node runs, and then scale linearly
        entire_block += block

    entire_block = entire_block.replace("walltime: Inf", f"walltime: {int(walltime_hrs)}:00:00")

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear, one could just pass it as part of the control_options, but then it would always be the same for all perturbations.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure why janky walltime increasing logic with the number of scaling experiments actually worked 😅 - might have had to edit by hand afterwards

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hopefully the new implementation will be less janky 😄


generator_config["Perturbation_Experiment"][f"Experiment_{seqnum}"] = pert_config
self.experiments[branch] = ProfilingExperiment(self.work_dir / branch / self._repository_directory)

seqnum += 1

ExperimentGenerator(generator_config).run()

def run_experiments(self) -> None:
"""Runs Payu experiments for profiling data generation."""

Expand Down
13 changes: 13 additions & 0 deletions tests/test_access_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest import mock

from access.config import YAMLParser
from access.config.esm1p6_layout_input import LayoutSearchConfig, LayoutTuple

from access.profiling.access_models import ESM16Profiling, RAM3Profiling
from access.profiling.cice5_parser import CICE5ProfilingParser
Expand Down Expand Up @@ -52,6 +53,18 @@ def test_esm16_config_profiling(mock_is_file, mock_yaml_parse, mock_path_read_te
assert "MOM5" in logs
assert "CICE5" not in logs

assert config_profiling.model_type == "access-esm1.6"

with mock.patch("access.profiling.access_models.generate_esm1p6_core_layouts_from_node_count") as mock_generate:
layout_mock = mock.MagicMock(spec=LayoutSearchConfig)
config_profiling.generate_core_layouts_from_node_count(16, 32, layout_search_config=layout_mock)
mock_generate.assert_called_once_with(16, 32, layout_search_config=layout_mock)

with mock.patch("access.profiling.access_models.generate_esm1p6_perturb_block") as mock_generate:
layout_mock = mock.MagicMock(spec=LayoutTuple)
config_profiling.generate_perturbation_block(layout_mock, "branch_name_prefix")
mock_generate.assert_called_once_with(layout_mock, "branch_name_prefix")


def test_ram3_config_profiling():
"""Test the rAM3Profiling class."""
Expand Down
Loading
Loading