Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .github/workflows/pr-test-npu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
- name: Run test intranode
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 2300
HCCL_BUFFSIZE: 3000
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py

Expand All @@ -71,7 +71,8 @@ jobs:
- name: Run test low latency
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 1913
HCCL_BUFFSIZE: 3000
MOE_ENABLE_TOPK_NEG_ONE: 1
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=1
Expand Down Expand Up @@ -114,6 +115,7 @@ jobs:
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 3000
MOE_ENABLE_TOPK_NEG_ONE: 1
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py

Expand Down Expand Up @@ -160,14 +162,15 @@ jobs:
- name: Run test intranode
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 2300
HCCL_BUFFSIZE: 3000
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py

- name: Run test low latency
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 1913
HCCL_BUFFSIZE: 3000
MOE_ENABLE_TOPK_NEG_ONE: 1
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=1
Expand Down Expand Up @@ -210,6 +213,7 @@ jobs:
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 3000
MOE_ENABLE_TOPK_NEG_ONE: 1
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py

Expand Down
18 changes: 17 additions & 1 deletion tests/python/deepep/test_internode.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import os
import random
import time
from typing import Optional

Expand Down Expand Up @@ -34,13 +35,28 @@ def test_main(
group: dist.ProcessGroup,
):
# Settings
num_tokens, hidden = args.num_tokens, args.hidden
base_num_tokens, hidden = args.num_tokens, args.hidden
num_topk, num_experts = args.num_topk, args.num_experts
enable_diagnose = args.enable_diagnose
num_servers = num_ranks // num_local_ranks
num_nodes = num_servers
expert_token_nums_type = int(os.getenv("MOE_EXPERT_TOKEN_NUMS_TYPE", 1))

fluctuation_percentage = 0.1
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

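A switch needs to be added to enable or disable this dynamic batch size (BS) testing, so that the random fluctuation of num_tokens is applied only when it is explicitly requested.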

min_fluctuation = 2

if base_num_tokens < 10:
fluctuation = random.randint(-min_fluctuation, min_fluctuation)
num_tokens = base_num_tokens + fluctuation
else:
fluctuation = random.uniform(
1 - fluctuation_percentage, 1 + fluctuation_percentage
)
num_tokens = int(base_num_tokens * fluctuation)

# Ensure num_tokens is at least 1
num_tokens = max(num_tokens, 1)

assert num_experts % num_ranks == 0 and num_nodes >= 2
assert num_tokens <= MAX_BATCH_SIZE
if local_rank == 0:
Expand Down
18 changes: 17 additions & 1 deletion tests/python/deepep/test_intranode.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import os
import random
import time
from typing import Optional

Expand Down Expand Up @@ -30,12 +31,27 @@ def test_main(
group: dist.ProcessGroup,
):
# Settings
num_tokens, hidden = args.num_tokens, args.hidden
base_num_tokens, hidden = args.num_tokens, args.hidden
num_topk, num_experts = args.num_topk, args.num_experts
enable_diagnose = args.enable_diagnose
num_servers = num_ranks // num_local_ranks
expert_token_nums_type = int(os.getenv("MOE_EXPERT_TOKEN_NUMS_TYPE", 1))

fluctuation_percentage = 0.1
min_fluctuation = 2

if base_num_tokens < 10:
fluctuation = random.randint(-min_fluctuation, min_fluctuation)
num_tokens = base_num_tokens + fluctuation
else:
fluctuation = random.uniform(
1 - fluctuation_percentage, 1 + fluctuation_percentage
)
num_tokens = int(base_num_tokens * fluctuation)

# Ensure num_tokens is at least 1
num_tokens = max(num_tokens, 1)

assert num_experts % num_ranks == 0
if local_rank == 0:
print(
Expand Down
Loading