diff --git a/INSTALL.md b/INSTALL.md
index 4c74ca00..f0f3d51b 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -28,7 +28,6 @@ $ python -m chakra.et_converter.et_converter\
     --input_filename <input filename>\
     --output_filename <output filename>\
     --num_npus <number of npus>\
-    --num_dims <number of dims>\
     --num_passes <number of passes>
 ```
 
@@ -38,8 +37,7 @@ $ python -m chakra.et_converter.et_converter\
     --input_type FlexFlow\
     --input_filename <input filename>\
     --output_filename <output filename>\
-    --npu_frequency <npu frequency>\
-    --num_dims <number of dims>
+    --npu_frequency <npu frequency>
 ```
 
 ### PyTorch Execution Graphs
@@ -47,8 +45,7 @@ $ python -m chakra.et_converter.et_converter\
     --input_type PyTorch\
     --input_filename <input filename>\
-    --output_filename <output filename>\
-    --num_dims <number of dims>
+    --output_filename <output filename>
 ```
 
 ## Execution Trace Generator (et_generator)
@@ -57,16 +54,15 @@ A user can define a new function in the generator to generate new synthetic exec
 You can follow the commands below to run the generator.
 ```shell
 $ python -m chakra.et_generator.et_generator\
-    --num_npus <number of npus>\
-    --num_dims <number of dims>
+    --num_npus <number of npus>
 ```
 
 ## Execution Trace Visualizer (et_visualizer)
 This tool visualizes a given execution trace (ET) by converting the ET to a graph in various supported formats: PDF, Graphviz (dot), or GraphML.
 The output format is determined by the file extension (postfix) of the output filename.
 For PDF and Graphviz formats, use ".pdf" and ".dot" extensions respectively.
-For GraphML, suitable for visualizing large-scale graphs, use the ".graphml" extension. 
-The PDF and Graphviz formats are generated using the Graphviz library, while the GraphML format is generated using the NetworkX library. 
+For GraphML, suitable for visualizing large-scale graphs, use the ".graphml" extension.
+The PDF and Graphviz formats are generated using the Graphviz library, while the GraphML format is generated using the NetworkX library.
 
 Graphviz files can be visualized with a Graphviz viewer such as https://dreampuf.github.io/GraphvizOnline/.
 For visualizing GraphML files, you can use Gephi (https://gephi.org/).
@@ -125,8 +121,7 @@ $ ./build/astra_analytical/build.sh -c
 
 $ cd extern/graph_frontend/chakra/
 $ python -m chakra.et_generator.et_generator\
-    --num_npus <number of npus>\
-    --num_dims <number of dims>
+    --num_npus <number of npus>
 $ cd -
 
 $ ./run.sh
diff --git a/et_converter/et_converter.py b/et_converter/et_converter.py
index c1748109..7c8ead39 100644
--- a/et_converter/et_converter.py
+++ b/et_converter/et_converter.py
@@ -50,12 +50,6 @@ def main() -> None:
         default=None,
         required=True,
         help="Output Chakra execution trace filename")
-    parser.add_argument(
-        "--num_dims",
-        type=int,
-        default=None,
-        required=True,
-        help="Number of dimensions in the network topology")
     parser.add_argument(
         "--num_npus",
         type=int,
@@ -83,7 +77,6 @@ def main() -> None:
         converter = Text2ChakraConverter(
             args.input_filename,
             args.output_filename,
-            args.num_dims,
             args.num_npus,
             args.num_passes,
             logger)
@@ -92,7 +85,6 @@ def main() -> None:
         converter = PyTorch2ChakraConverter(
             args.input_filename,
             args.output_filename,
-            args.num_dims,
             logger)
         converter.convert()
     else:
diff --git a/et_converter/pytorch2chakra_converter.py b/et_converter/pytorch2chakra_converter.py
index 43a80730..c21886e1 100644
--- a/et_converter/pytorch2chakra_converter.py
+++ b/et_converter/pytorch2chakra_converter.py
@@ -90,7 +90,6 @@ class PyTorch2ChakraConverter:
         input_filename (str): Input file name containing PyTorch execution trace.
         output_filename (str): Output file name for the converted Chakra trace.
         chakra_et(IO[bytes]): File handle for the Chakra execution trace output file.
-        num_dims (int): Number of dimensions involved in the conversion process.
         logger (logging.Logger): Logger for logging information during conversion.
         id_assigner (UniqueIdAssigner): Object to manage unique ID assignments.
         pytorch_schema (Optional[str]): Schema info of the PyTorch trace.
@@ -114,7 +113,6 @@ def __init__(
         self,
         input_filename: str,
         output_filename: str,
-        num_dims: int,
         logger: logging.Logger
     ) -> None:
         """
@@ -124,13 +122,11 @@ def __init__(
         Args:
             input_filename (str): Name of the input file containing PyTorch execution trace.
             output_filename (str): Name of the output file for the converted Chakra trace.
-            num_dims (int): Number of dimensions involved in the conversion process.
             logger (logging.Logger): Logger for logging information during the conversion.
         """
         self.input_filename = input_filename
         self.output_filename = output_filename
         self.chakra_et = None
-        self.num_dims = num_dims
         self.logger = logger
         self.id_assigner = UniqueIdAssigner()
         self.initialize_attributes()
@@ -179,9 +175,7 @@ def convert(self) -> None:
                     ChakraAttr(name="comm_type",
                                int64_val=collective_comm_type),
                     ChakraAttr(name="comm_size",
-                               int64_val=pytorch_gpu_node.comm_size),
-                    ChakraAttr(name="involved_dim",
-                               bool_list={"values": [True] * self.num_dims})])
+                               int64_val=pytorch_gpu_node.comm_size)])
 
             self.chakra_nodes[chakra_gpu_node.id] = chakra_gpu_node
diff --git a/et_converter/text2chakra_converter.py b/et_converter/text2chakra_converter.py
index 892e95b6..3d38547a 100644
--- a/et_converter/text2chakra_converter.py
+++ b/et_converter/text2chakra_converter.py
@@ -56,14 +56,12 @@ def __init__(
         self,
         input_filename: str,
         output_filename: str,
-        num_dims: int,
         num_npus: int,
         num_passes: int,
         logger: logging.Logger
     ) -> None:
         self.input_filename = input_filename
         self.output_filename = output_filename
-        self.num_dims = num_dims
         self.num_npus = num_npus
         self.num_passes = num_passes
         self.logger = logger
@@ -187,12 +185,6 @@ def convert_microbenchmark(
                 bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
                                                            layer.bwd_wg_comm_type,
                                                            layer.bwd_wg_comm_size)
-
-                attr = ChakraAttr(name="involved_dim")
-                for _ in range(self.num_dims):
-                    attr.bool_list.values.append(True)
-                bwd_wg_comm_node.attr.append(attr)
-
                 encode_message(g, bwd_wg_comm_node)
 
     def convert_data_parallel(
@@ -236,10 +228,6 @@ def convert_data_parallel(
                 bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
                                                            layer.bwd_wg_comm_type,
                                                            layer.bwd_wg_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                for _ in range(self.num_dims):
-                    attr.bool_list.values.append(True)
-                bwd_wg_comm_node.attr.append(attr)
                 self.add_parent(bwd_wg_comm_node, bwd_wg_comp_node)
                 layer.bwd_wg_comm_node = bwd_wg_comm_node
@@ -283,10 +271,6 @@ def convert_model_parallel(
                 fwd_comm_node = self.get_comm_coll_node(layer.name,
                                                         layer.fwd_comm_type,
                                                         layer.fwd_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                for _ in range(self.num_dims):
-                    attr.bool_list.values.append(True)
-                fwd_comm_node.attr.append(attr)
                 layer.fwd_comm_node = fwd_comm_node
                 self.add_parent(fwd_comm_node, fwd_comp_node)
                 encode_message(g, fwd_comm_node)
@@ -310,10 +294,6 @@ def convert_model_parallel(
                 bwd_ig_comm_node = self.get_comm_coll_node(layer.name,
                                                            layer.bwd_ig_comm_type,
                                                            layer.bwd_ig_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                for _ in range(self.num_dims):
-                    attr.bool_list.values.append(True)
-                bwd_ig_comm_node.attr.append(attr)
                 self.add_parent(bwd_ig_comm_node, bwd_ig_comp_node)
                 layer.bwd_ig_comm_node = bwd_ig_comm_node
                 encode_message(g, bwd_ig_comm_node)
@@ -354,11 +334,6 @@ def convert_hybrid_data_model(
                 fwd_comm_node = self.get_comm_coll_node(layer.name,
                                                         layer.fwd_comm_type,
                                                         layer.fwd_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                attr.bool_list.values.append(True)
-                for _ in range(self.num_dims - 1):
-                    attr.bool_list.values.append(False)
-                fwd_comm_node.attr.append(attr)
                 self.add_parent(fwd_comm_node, fwd_comp_node)
                 layer.fwd_comm_node = fwd_comm_node
                 encode_message(g, fwd_comm_node)
@@ -382,11 +357,6 @@ def convert_hybrid_data_model(
                 bwd_ig_comm_node = self.get_comm_coll_node(layer.name + "_IG_COMM_",
                                                            layer.bwd_ig_comm_type,
                                                            layer.bwd_ig_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                attr.bool_list.values.append(True)
-                for _ in range(self.num_dims - 1):
-                    attr.bool_list.values.append(False)
-                bwd_ig_comm_node.attr.append(attr)
                 self.add_parent(bwd_ig_comm_node, bwd_ig_comp_node)
                 layer.bwd_ig_comm_node = bwd_ig_comm_node
                 encode_message(g, bwd_ig_comm_node)
@@ -400,11 +370,6 @@ def convert_hybrid_data_model(
                 bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
                                                            layer.bwd_wg_comm_type,
                                                            layer.bwd_wg_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                attr.bool_list.values.append(False)
-                for _ in range(self.num_dims - 1):
-                    attr.bool_list.values.append(True)
-                bwd_wg_comm_node.attr.append(attr)
                 self.add_parent(bwd_wg_comm_node, bwd_wg_comp_node)
                 layer.bwd_wg_comm_node = bwd_wg_comm_node
                 encode_message(g, bwd_wg_comm_node)
@@ -439,11 +404,6 @@ def convert_hybrid_model_data(
                 fwd_comm_node = self.get_comm_coll_node(layer.name,
                                                         layer.fwd_comm_type,
                                                         layer.fwd_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                attr.bool_list.values.append(False)
-                for _ in range(self.num_dims - 1):
-                    attr.bool_list.values.append(True)
-                fwd_comm_node.attr.append(attr)
                 self.add_parent(fwd_comm_node, fwd_comp_node)
                 layer.fwd_comm_node = fwd_comm_node
                 encode_message(g, fwd_comm_node)
@@ -465,11 +425,6 @@ def convert_hybrid_model_data(
                 bwd_ig_comm_node = self.get_comm_coll_node(layer.name,
                                                            layer.bwd_ig_comm_type,
                                                            layer.bwd_ig_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                attr.bool_list.values.append(False)
-                for _ in range(self.num_dims - 1):
-                    attr.bool_list.values.append(True)
-                bwd_ig_comm_node.attr.append(attr)
                 self.add_parent(bwd_ig_comm_node, bwd_ig_comp_node)
                 layer.bwd_ig_comm_node = bwd_ig_comm_node
                 encode_message(g, bwd_ig_comm_node)
@@ -483,11 +438,6 @@ def convert_hybrid_model_data(
                 bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
                                                            layer.bwd_wg_comm_type,
                                                            layer.bwd_wg_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                attr.bool_list.values.append(True)
-                for _ in range(self.num_dims - 1):
-                    attr.bool_list.values.append(False)
-                bwd_wg_comm_node.attr.append(attr)
                 self.add_parent(bwd_wg_comm_node, bwd_wg_comp_node)
                 layer.bwd_wg_comm_node = bwd_wg_comm_node
                 encode_message(g, bwd_wg_comm_node)
@@ -529,10 +479,6 @@ def convert_hybrid_dlrm(
                 fwd_comm_node = self.get_comm_coll_node(layer.name,
                                                         layer.fwd_comm_type,
                                                         layer.fwd_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                for _ in range(self.num_dims):
-                    attr.bool_list.values.append(True)
-                fwd_comm_node.attr.append(attr)
                 self.add_parent(fwd_comm_node, fwd_comp_node)
                 layer.fwd_comm_node = fwd_comm_node
                 encode_message(g, fwd_comm_node)
@@ -559,10 +505,6 @@ def convert_hybrid_dlrm(
                 bwd_wg_comm_node = self.get_comm_coll_node(layer.name,
                                                            layer.bwd_wg_comm_type,
                                                            layer.bwd_wg_comm_size)
-                attr = ChakraAttr(name="involved_dim")
-                for _ in range(self.num_dims):
-                    attr.bool_list.values.append(True)
-                bwd_wg_comm_node.attr.append(attr)
                 self.add_parent(bwd_wg_comm_node, bwd_wg_comp_node)
                 layer.bwd_wg_comm_node = bwd_wg_comm_node
                 encode_message(g, bwd_wg_comm_node)
@@ -579,10 +521,6 @@ def convert_hybrid_dlrm(
             bwd_ig_comm_node = self.get_comm_coll_node(layers[0].name,
                                                        layers[0].bwd_ig_comm_type,
                                                        layers[0].bwd_ig_comm_size)
-            attr = ChakraAttr(name="involved_dim")
-            for _ in range(self.num_dims):
-                attr.bool_list.values.append(True)
-            bwd_ig_comm_node.attr.append(attr)
             if bwd_ig_comp_node is None:
                 raise ValueError("bwd_ig_comp_node is None")
             self.add_parent(bwd_ig_comm_node, bwd_ig_comp_node)
diff --git a/et_feeder/et_feeder_node.cpp b/et_feeder/et_feeder_node.cpp
index aa7b6d9b..547ef6fd 100644
--- a/et_feeder/et_feeder_node.cpp
+++ b/et_feeder/et_feeder_node.cpp
@@ -22,12 +22,6 @@ ETFeederNode::ETFeederNode(std::shared_ptr<ChakraProtoMsg::Node> node) {
     } else if (attr_name == "comm_type") {
       this->comm_type_ =
          static_cast<ChakraProtoMsg::CollectiveCommType>(attr.int64_val());
-    } else if (attr_name == "involved_dim") {
-      this->involved_dim_.clear();
-      for (const bool val : attr.bool_list().values()) {
-        this->involved_dim_.push_back(val);
-      }
-      this->involved_dim_size_ = this->involved_dim_.size();
     } else if (attr_name == "comm_priority") {
       this->comm_priority_ = static_cast<uint32_t>(attr.int32_val());
     } else if (attr_name == "comm_size") {
@@ -109,14 +103,6 @@ ChakraProtoMsg::CollectiveCommType ETFeederNode::comm_type() {
   return comm_type_;
 }
 
-uint32_t ETFeederNode::involved_dim_size() {
-  return involved_dim_size_;
-}
-
-bool ETFeederNode::involved_dim(int i) {
-  return involved_dim_[i];
-}
-
 uint32_t ETFeederNode::comm_priority() {
   return comm_priority_;
 }
diff --git a/et_feeder/et_feeder_node.h b/et_feeder/et_feeder_node.h
index 3e738eff..7d688e4f 100644
--- a/et_feeder/et_feeder_node.h
+++ b/et_feeder/et_feeder_node.h
@@ -28,8 +28,6 @@ class ETFeederNode {
   uint32_t tensor_loc();
   uint64_t tensor_size();
   ChakraProtoMsg::CollectiveCommType comm_type();
-  uint32_t involved_dim_size();
-  bool involved_dim(int i);
   uint32_t comm_priority();
   uint64_t comm_size();
   uint32_t comm_src();
@@ -55,8 +53,6 @@ class ETFeederNode {
   uint32_t tensor_loc_;
   uint64_t tensor_size_;
   ChakraProtoMsg::CollectiveCommType comm_type_;
-  uint32_t involved_dim_size_;
-  std::vector<bool> involved_dim_;
   uint32_t comm_priority_;
   uint64_t comm_size_;
   uint32_t comm_src_;
diff --git a/utils/et_generator/et_generator.py b/utils/et_generator/et_generator.py
index 8f22dbfc..61f893c4 100644
--- a/utils/et_generator/et_generator.py
+++ b/utils/et_generator/et_generator.py
@@ -50,10 +50,6 @@ def get_comm_type_attr(comm_type: int) -> ChakraAttr:
     return ChakraAttr(name="comm_type", int64_val=comm_type)
 
 
-def get_involved_dim_attr(num_dims: int) -> ChakraAttr:
-    return ChakraAttr(name="involved_dim", bool_list=BoolList(values=[True] * num_dims))
-
-
 def one_metadata_node_all_types(num_npus: int) -> None:
     for npu_id in range(num_npus):
         output_filename = f"one_metadata_node_all_types.{npu_id}.et"
@@ -196,7 +192,7 @@ def two_comp_nodes_dependent(num_npus: int, runtime: int) -> None:
         encode_message(et, child_node)
 
 
-def one_comm_coll_node_allreduce(num_npus: int, num_dims: int, comm_size: int) -> None:
+def one_comm_coll_node_allreduce(num_npus: int, comm_size: int) -> None:
     for npu_id in range(num_npus):
         output_filename = f"one_comm_coll_node_allreduce.{npu_id}.et"
         with open(output_filename, "wb") as et:
@@ -206,12 +202,10 @@ def one_comm_coll_node_allreduce(num_npus: int, num_dims: int, comm_size: int) -
             node.attr.append(ChakraAttr(name="is_cpu_op", bool_val=False))
             node.attr.append(get_comm_type_attr(ALL_REDUCE))
             node.attr.append(ChakraAttr(name="comm_size", uint64_val=comm_size))
-            attr = get_involved_dim_attr(num_dims)
-            node.attr.append(attr)
             encode_message(et, node)
 
 
-def one_comm_coll_node_alltoall(num_npus: int, num_dims: int, comm_size: int) -> None:
+def one_comm_coll_node_alltoall(num_npus: int, comm_size: int) -> None:
     for npu_id in range(num_npus):
         output_filename = f"one_comm_coll_node_alltoall.{npu_id}.et"
         with open(output_filename, "wb") as et:
@@ -221,12 +215,10 @@ def one_comm_coll_node_alltoall(num_npus: int, num_dims: int, comm_size: int) ->
             node.attr.append(ChakraAttr(name="is_cpu_op", bool_val=False))
             node.attr.append(get_comm_type_attr(ALL_TO_ALL))
             node.attr.append(ChakraAttr(name="comm_size", uint64_val=comm_size))
-            attr = get_involved_dim_attr(num_dims)
-            node.attr.append(attr)
             encode_message(et, node)
 
 
-def one_comm_coll_node_allgather(num_npus: int, num_dims: int, comm_size: int) -> None:
+def one_comm_coll_node_allgather(num_npus: int, comm_size: int) -> None:
     for npu_id in range(num_npus):
         output_filename = f"one_comm_coll_node_allgather.{npu_id}.et"
         with open(output_filename, "wb") as et:
@@ -236,12 +228,10 @@ def one_comm_coll_node_allgather(num_npus: int, num_dims: int, comm_size: int) -
             node.attr.append(ChakraAttr(name="is_cpu_op", bool_val=False))
             node.attr.append(get_comm_type_attr(ALL_GATHER))
             node.attr.append(ChakraAttr(name="comm_size", uint64_val=comm_size))
-            attr = get_involved_dim_attr(num_dims)
-            node.attr.append(attr)
             encode_message(et, node)
 
 
-def one_comm_coll_node_reducescatter(num_npus: int, num_dims: int, comm_size: int) -> None:
+def one_comm_coll_node_reducescatter(num_npus: int, comm_size: int) -> None:
     for npu_id in range(num_npus):
         output_filename = f"one_comm_coll_node_reducescatter.{npu_id}.et"
         with open(output_filename, "wb") as et:
@@ -251,8 +241,6 @@ def one_comm_coll_node_reducescatter(num_npus: int, num_dims: int, comm_size: in
             node.attr.append(ChakraAttr(name="is_cpu_op", bool_val=False))
             node.attr.append(get_comm_type_attr(REDUCE_SCATTER))
             node.attr.append(ChakraAttr(name="comm_size", uint64_val=comm_size))
-            attr = get_involved_dim_attr(num_dims)
-            node.attr.append(attr)
             encode_message(et, node)
 
 
@@ -266,12 +254,6 @@ def main() -> None:
         default=64,
         help="Number of NPUs"
     )
-    parser.add_argument(
-        "--num_dims",
-        type=int,
-        default=2,
-        help="Number of dimensions in the network topology"
-    )
     parser.add_argument(
         "--default_runtime",
         type=int,
@@ -301,10 +283,10 @@ def main() -> None:
     two_comp_nodes_independent(args.num_npus, args.default_runtime)
     two_comp_nodes_dependent(args.num_npus, args.default_runtime)
 
-    one_comm_coll_node_allreduce(args.num_npus, args.num_dims, args.default_comm_size)
-    one_comm_coll_node_alltoall(args.num_npus, args.num_dims, args.default_comm_size)
-    one_comm_coll_node_allgather(args.num_npus, args.num_dims, args.default_comm_size)
-    one_comm_coll_node_reducescatter(args.num_npus, args.num_dims, args.default_comm_size)
+    one_comm_coll_node_allreduce(args.num_npus, args.default_comm_size)
+    one_comm_coll_node_alltoall(args.num_npus, args.default_comm_size)
+    one_comm_coll_node_allgather(args.num_npus, args.default_comm_size)
+    one_comm_coll_node_reducescatter(args.num_npus, args.default_comm_size)
 
 
 if __name__ == "__main__":
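
After this patch, a collective-communication node carries only `comm_type` and `comm_size`; the network topology (and any notion of its dimensions) is no longer encoded in the trace. For reference, here is a minimal, self-contained sketch of the post-change node emission, mirroring `one_comm_coll_node_allreduce` in `utils/et_generator/et_generator.py`. The import paths and aliases below are assumptions based on this repository's layout, not part of the patch itself.

```python
# Hypothetical sketch (not part of this patch): writes one COMM_COLL_NODE per
# NPU the way one_comm_coll_node_allreduce() does after this change. The node
# carries only comm_type and comm_size; no "involved_dim" bool_list is
# attached. Import paths are assumed from this repository's layout.
from chakra.third_party.utils.protolib import encodeMessage as encode_message
from chakra.et_def.et_def_pb2 import (
    Node as ChakraNode,
    AttributeProto as ChakraAttr,
    COMM_COLL_NODE,
    ALL_REDUCE,
)


def write_allreduce_traces(num_npus: int, comm_size: int) -> None:
    for npu_id in range(num_npus):
        # One serialized execution trace file per NPU, as in et_generator.
        with open(f"allreduce_sketch.{npu_id}.et", "wb") as et:
            node = ChakraNode()
            node.id = 0
            node.name = "COMM_COLL_NODE"
            node.type = COMM_COLL_NODE
            node.attr.append(ChakraAttr(name="is_cpu_op", bool_val=False))
            node.attr.append(ChakraAttr(name="comm_type", int64_val=ALL_REDUCE))
            node.attr.append(ChakraAttr(name="comm_size", uint64_val=comm_size))
            encode_message(et, node)


if __name__ == "__main__":
    write_allreduce_traces(num_npus=2, comm_size=65536)
```

Dropping `involved_dim` decouples trace generation from the physical topology: the same trace can be replayed on networks with any number of dimensions, which is presumably why the attribute, and all the `num_dims` plumbing that fed it, is removed in one pass. Consumers that previously called `ETFeederNode::involved_dim()` would now have to derive dimension usage from the simulator's own topology configuration rather than from the trace.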