17 changes: 6 additions & 11 deletions INSTALL.md
@@ -28,7 +28,6 @@ $ python -m chakra.et_converter.et_converter\
--input_filename <input_filename>\
--output_filename <output_filename>\
--num_npus <num_npus>\
--num_dims <num_dims>\
--num_passes <num_passes>
```

@@ -38,17 +37,15 @@ $ python -m chakra.et_converter.et_converter\
--input_type FlexFlow\
--input_filename <input_filename>\
--output_filename <output_filename>\
--npu_frequency <npu_frequency>\
--num_dims <num_dims>
--npu_frequency <npu_frequency>
```

### PyTorch Execution Graphs
```shell
$ python -m chakra.et_converter.et_converter\
--input_type PyTorch\
--input_filename <input_filename>\
--output_filename <output_filename>\
--num_dims <num_dims>
--output_filename <output_filename>
```

## Execution Trace Generator (et_generator)
@@ -57,16 +54,15 @@ A user can define a new function in the generator to generate new synthetic execution traces.
You can follow the commands below to run the generator.
```shell
$ python -m chakra.et_generator.et_generator\
--num_npus <num_npus>\
--num_dims <num_dims>
--num_npus <num_npus>
```
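
As a companion to the command above, here is a hypothetical sketch of what a new generator function could look like after this change, attaching only `comm_type` and `comm_size` to a collective node (no `involved_dim`). The import paths and names (`Node`, `ChakraAttr`, `COMM_COLL_NODE`, `ALL_REDUCE`, `encode_message`) follow the usage visible elsewhere in this diff and are assumptions, not a verbatim copy of the repository's generator.

```python
# Hypothetical sketch of a new synthetic-trace function for et_generator.
# The imports below are assumptions that mirror the ChakraAttr / encode_message
# usage shown elsewhere in this diff; the real module layout may differ.
from chakra.et_def.et_def_pb2 import Node, ChakraAttr, COMM_COLL_NODE, ALL_REDUCE
from chakra.third_party.utils.protolib import encodeMessage as encode_message


def one_allreduce_per_npu(num_npus: int, comm_size: int = 65536) -> None:
    # Emit one trace file per NPU, each holding a single all-reduce node.
    # After this PR, only comm_type and comm_size are attached; the
    # involved_dim attribute is gone.
    for npu_id in range(num_npus):
        with open(f"one_allreduce.{npu_id}.et", "wb") as et:
            node = Node()
            node.id = 0
            node.name = "ALL_REDUCE"
            node.type = COMM_COLL_NODE
            node.attr.extend([
                ChakraAttr(name="comm_type", int64_val=ALL_REDUCE),
                ChakraAttr(name="comm_size", int64_val=comm_size),
            ])
            encode_message(et, node)
```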

## Execution Trace Visualizer (et_visualizer)
This tool visualizes a given execution trace (ET) by converting the ET to a graph in various supported formats: PDF, Graphviz (dot), or GraphML.
The output format is determined by the file extension (postfix) of the output filename.
For PDF and Graphviz formats, use ".pdf" and ".dot" extensions respectively.
For GraphML, which is suitable for visualizing large-scale graphs, use the ".graphml" extension.
The PDF and Graphviz formats are generated using the Graphviz library, while the GraphML format is generated using the NetworkX library.
Graphviz files can be visualized with a Graphviz viewer such as https://dreampuf.github.io/GraphvizOnline/.
For visualizing GraphML files, you can use Gephi (https://gephi.org/).
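
To make the extension-based dispatch described above concrete, the following is a minimal sketch of how such a dispatcher could be written. It is an illustration only: the function name `write_output` and the use of NetworkX plus the `graphviz` Python package are assumptions, not the actual et_visualizer implementation.

```python
# Minimal, hypothetical sketch of extension-based output dispatch for an ET
# visualizer. Assumes the trace has already been loaded into a NetworkX graph.
import os

import networkx as nx
from graphviz import Source


def write_output(graph: nx.DiGraph, output_filename: str) -> None:
    ext = os.path.splitext(output_filename)[1].lower()
    if ext == ".graphml":
        # GraphML via NetworkX; this format scales better for large traces.
        nx.write_graphml(graph, output_filename)
    elif ext == ".dot":
        # Graphviz dot text format, serialized through pydot.
        nx.nx_pydot.write_dot(graph, output_filename)
    elif ext == ".pdf":
        # Rendered PDF via the graphviz package; render() appends ".pdf"
        # to the stem it is given, so strip the extension first.
        dot_source = Source(nx.nx_pydot.to_pydot(graph).to_string())
        dot_source.render(output_filename[:-len(".pdf")], format="pdf", cleanup=True)
    else:
        raise ValueError(f"Unsupported output extension: {ext}")
```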

@@ -125,8 +121,7 @@ $ ./build/astra_analytical/build.sh -c

$ cd extern/graph_frontend/chakra/
$ python -m chakra.et_generator.et_generator\
--num_npus <num_npus>\
--num_dims <num_dims>
--num_npus <num_npus>

$ cd -
$ ./run.sh
8 changes: 0 additions & 8 deletions et_converter/et_converter.py
@@ -50,12 +50,6 @@ def main() -> None:
default=None,
required=True,
help="Output Chakra execution trace filename")
parser.add_argument(
"--num_dims",
type=int,
default=None,
required=True,
help="Number of dimensions in the network topology")
parser.add_argument(
"--num_npus",
type=int,
@@ -83,7 +77,6 @@ def main() -> None:
converter = Text2ChakraConverter(
args.input_filename,
args.output_filename,
args.num_dims,
args.num_npus,
args.num_passes,
logger)
@@ -92,7 +85,6 @@ def main() -> None:
converter = PyTorch2ChakraConverter(
args.input_filename,
args.output_filename,
args.num_dims,
logger)
converter.convert()
else:
8 changes: 1 addition & 7 deletions et_converter/pytorch2chakra_converter.py
@@ -90,7 +90,6 @@ class PyTorch2ChakraConverter:
input_filename (str): Input file name containing PyTorch execution trace.
output_filename (str): Output file name for the converted Chakra trace.
chakra_et(IO[bytes]): File handle for the Chakra execution trace output file.
num_dims (int): Number of dimensions involved in the conversion process.
logger (logging.Logger): Logger for logging information during conversion.
id_assigner (UniqueIdAssigner): Object to manage unique ID assignments.
pytorch_schema (Optional[str]): Schema info of the PyTorch trace.
@@ -114,7 +113,6 @@ def __init__(
self,
input_filename: str,
output_filename: str,
num_dims: int,
logger: logging.Logger
) -> None:
"""
@@ -124,13 +122,11 @@ def __init__(
Args:
input_filename (str): Name of the input file containing PyTorch execution trace.
output_filename (str): Name of the output file for the converted Chakra trace.
num_dims (int): Number of dimensions involved in the conversion process.
logger (logging.Logger): Logger for logging information during the conversion.
"""
self.input_filename = input_filename
self.output_filename = output_filename
self.chakra_et = None
self.num_dims = num_dims
self.logger = logger
self.id_assigner = UniqueIdAssigner()
self.initialize_attributes()
@@ -179,9 +175,7 @@ def convert(self) -> None:
ChakraAttr(name="comm_type",
int64_val=collective_comm_type),
ChakraAttr(name="comm_size",
int64_val=pytorch_gpu_node.comm_size),
ChakraAttr(name="involved_dim",
bool_list={"values": [True] * self.num_dims})])
int64_val=pytorch_gpu_node.comm_size)])

self.chakra_nodes[chakra_gpu_node.id] = chakra_gpu_node

62 changes: 0 additions & 62 deletions et_converter/text2chakra_converter.py
@@ -56,14 +56,12 @@ def __init__(
self,
input_filename: str,
output_filename: str,
num_dims: int,
num_npus: int,
num_passes: int,
logger: logging.Logger
) -> None:
self.input_filename = input_filename
self.output_filename = output_filename
self.num_dims = num_dims
self.num_npus = num_npus
self.num_passes = num_passes
self.logger = logger
@@ -187,12 +185,6 @@ def convert_microbenchmark(
bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
layer.bwd_wg_comm_type,
layer.bwd_wg_comm_size)

attr = ChakraAttr(name="involved_dim")
for _ in range(self.num_dims):
attr.bool_list.values.append(True)
bwd_wg_comm_node.attr.append(attr)

encode_message(g, bwd_wg_comm_node)

def convert_data_parallel(
@@ -236,10 +228,6 @@ def convert_data_parallel(
bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
layer.bwd_wg_comm_type,
layer.bwd_wg_comm_size)
attr = ChakraAttr(name="involved_dim")
for _ in range(self.num_dims):
attr.bool_list.values.append(True)
bwd_wg_comm_node.attr.append(attr)

self.add_parent(bwd_wg_comm_node, bwd_wg_comp_node)
layer.bwd_wg_comm_node = bwd_wg_comm_node
@@ -283,10 +271,6 @@ def convert_model_parallel(
fwd_comm_node = self.get_comm_coll_node(layer.name,
layer.fwd_comm_type,
layer.fwd_comm_size)
attr = ChakraAttr(name="involved_dim")
for _ in range(self.num_dims):
attr.bool_list.values.append(True)
fwd_comm_node.attr.append(attr)
layer.fwd_comm_node = fwd_comm_node
self.add_parent(fwd_comm_node, fwd_comp_node)
encode_message(g, fwd_comm_node)
@@ -310,10 +294,6 @@ def convert_model_parallel(
bwd_ig_comm_node = self.get_comm_coll_node(layer.name,
layer.bwd_ig_comm_type,
layer.bwd_ig_comm_size)
attr = ChakraAttr(name="involved_dim")
for _ in range(self.num_dims):
attr.bool_list.values.append(True)
bwd_ig_comm_node.attr.append(attr)
self.add_parent(bwd_ig_comm_node, bwd_ig_comp_node)
layer.bwd_ig_comm_node = bwd_ig_comm_node
encode_message(g, bwd_ig_comm_node)
@@ -354,11 +334,6 @@ def convert_hybrid_data_model(
fwd_comm_node = self.get_comm_coll_node(layer.name,
layer.fwd_comm_type,
layer.fwd_comm_size)
attr = ChakraAttr(name="involved_dim")
attr.bool_list.values.append(True)
for _ in range(self.num_dims - 1):
attr.bool_list.values.append(False)
fwd_comm_node.attr.append(attr)
self.add_parent(fwd_comm_node, fwd_comp_node)
layer.fwd_comm_node = fwd_comm_node
encode_message(g, fwd_comm_node)
@@ -382,11 +357,6 @@ def convert_hybrid_data_model(
bwd_ig_comm_node = self.get_comm_coll_node(layer.name + "_IG_COMM_",
layer.bwd_ig_comm_type,
layer.bwd_ig_comm_size)
attr = ChakraAttr(name="involved_dim")
attr.bool_list.values.append(True)
for _ in range(self.num_dims - 1):
attr.bool_list.values.append(False)
bwd_ig_comm_node.attr.append(attr)
self.add_parent(bwd_ig_comm_node, bwd_ig_comp_node)
layer.bwd_ig_comm_node = bwd_ig_comm_node
encode_message(g, bwd_ig_comm_node)
@@ -400,11 +370,6 @@ def convert_hybrid_data_model(
bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
layer.bwd_wg_comm_type,
layer.bwd_wg_comm_size)
attr = ChakraAttr(name="involved_dim")
attr.bool_list.values.append(False)
for _ in range(self.num_dims - 1):
attr.bool_list.values.append(True)
bwd_wg_comm_node.attr.append(attr)
self.add_parent(bwd_wg_comm_node, bwd_wg_comp_node)
layer.bwd_wg_comm_node = bwd_wg_comm_node
encode_message(g, bwd_wg_comm_node)
@@ -439,11 +404,6 @@ def convert_hybrid_model_data(
fwd_comm_node = self.get_comm_coll_node(layer.name,
layer.fwd_comm_type,
layer.fwd_comm_size)
attr = ChakraAttr(name="involved_dim")
attr.bool_list.values.append(False)
for _ in range(self.num_dims - 1):
attr.bool_list.values.append(True)
fwd_comm_node.attr.append(attr)
self.add_parent(fwd_comm_node, fwd_comp_node)
layer.fwd_comm_node = fwd_comm_node
encode_message(g, fwd_comm_node)
@@ -465,11 +425,6 @@ def convert_hybrid_model_data(
bwd_ig_comm_node = self.get_comm_coll_node(layer.name,
layer.bwd_ig_comm_type,
layer.bwd_ig_comm_size)
attr = ChakraAttr(name="involved_dim")
attr.bool_list.values.append(False)
for _ in range(self.num_dims - 1):
attr.bool_list.values.append(True)
bwd_ig_comm_node.attr.append(attr)
self.add_parent(bwd_ig_comm_node, bwd_ig_comp_node)
layer.bwd_ig_comm_node = bwd_ig_comm_node
encode_message(g, bwd_ig_comm_node)
@@ -483,11 +438,6 @@ def convert_hybrid_model_data(
bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
layer.bwd_wg_comm_type,
layer.bwd_wg_comm_size)
attr = ChakraAttr(name="involved_dim")
attr.bool_list.values.append(True)
for _ in range(self.num_dims - 1):
attr.bool_list.values.append(False)
bwd_wg_comm_node.attr.append(attr)
self.add_parent(bwd_wg_comm_node, bwd_wg_comp_node)
layer.bwd_wg_comm_node = bwd_wg_comm_node
encode_message(g, bwd_wg_comm_node)
@@ -529,10 +479,6 @@ def convert_hybrid_dlrm(
fwd_comm_node = self.get_comm_coll_node(layer.name,
layer.fwd_comm_type,
layer.fwd_comm_size)
attr = ChakraAttr(name="involved_dim")
for _ in range(self.num_dims):
attr.bool_list.values.append(True)
fwd_comm_node.attr.append(attr)
self.add_parent(fwd_comm_node, fwd_comp_node)
layer.fwd_comm_node = fwd_comm_node
encode_message(g, fwd_comm_node)
@@ -559,10 +505,6 @@ def convert_hybrid_dlrm(
bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
layer.bwd_wg_comm_type,
layer.bwd_wg_comm_size)
attr = ChakraAttr(name="involved_dim")
for _ in range(self.num_dims):
attr.bool_list.values.append(True)
bwd_wg_comm_node.attr.append(attr)
self.add_parent(bwd_wg_comm_node, bwd_wg_comp_node)
layer.bwd_wg_comm_node = bwd_wg_comm_node
encode_message(g, bwd_wg_comm_node)
@@ -579,10 +521,6 @@ def convert_hybrid_dlrm(
bwd_ig_comm_node = self.get_comm_coll_node(layers[0].name,
layers[0].bwd_ig_comm_type,
layers[0].bwd_ig_comm_size)
attr = ChakraAttr(name="involved_dim")
for _ in range(self.num_dims):
attr.bool_list.values.append(True)
bwd_ig_comm_node.attr.append(attr)
if bwd_ig_comp_node is None:
raise ValueError("bwd_ig_comp_node is None")
self.add_parent(bwd_ig_comm_node, bwd_ig_comp_node)
14 changes: 0 additions & 14 deletions et_feeder/et_feeder_node.cpp
@@ -22,12 +22,6 @@ ETFeederNode::ETFeederNode(std::shared_ptr<ChakraProtoMsg::Node> node) {
} else if (attr_name == "comm_type") {
this->comm_type_ =
static_cast<ChakraProtoMsg::CollectiveCommType>(attr.int64_val());
} else if (attr_name == "involved_dim") {
this->involved_dim_.clear();
for (const bool val : attr.bool_list().values()) {
this->involved_dim_.push_back(val);
}
this->involved_dim_size_ = this->involved_dim_.size();
} else if (attr_name == "comm_priority") {
this->comm_priority_ = static_cast<uint32_t>(attr.int32_val());
} else if (attr_name == "comm_size") {
@@ -109,14 +103,6 @@ ChakraProtoMsg::CollectiveCommType ETFeederNode::comm_type() {
return comm_type_;
}

uint32_t ETFeederNode::involved_dim_size() {
return involved_dim_size_;
}

bool ETFeederNode::involved_dim(int i) {
return involved_dim_[i];
}

uint32_t ETFeederNode::comm_priority() {
return comm_priority_;
}
4 changes: 0 additions & 4 deletions et_feeder/et_feeder_node.h
@@ -28,8 +28,6 @@ class ETFeederNode {
uint32_t tensor_loc();
uint64_t tensor_size();
ChakraProtoMsg::CollectiveCommType comm_type();
uint32_t involved_dim_size();
bool involved_dim(int i);
uint32_t comm_priority();
uint64_t comm_size();
uint32_t comm_src();
@@ -55,8 +53,6 @@ class ETFeederNode {
uint32_t tensor_loc_;
uint64_t tensor_size_;
ChakraProtoMsg::CollectiveCommType comm_type_;
uint32_t involved_dim_size_;
std::vector<bool> involved_dim_;
uint32_t comm_priority_;
uint64_t comm_size_;
uint32_t comm_src_;