Skip to content

Commit

Permalink
Update Megatron-LM scripts and integration for latest Docker container. (#55)
Browse files Browse the repository at this point in the history

* Update megatron-lm and scripts for new container.
  • Loading branch information
tgale96 committed Dec 11, 2023
1 parent 059542d commit 396de2a
Show file tree
Hide file tree
Showing 18 changed files with 18 additions and 18 deletions.
2 changes: 1 addition & 1 deletion exp/dmoe/dmoe_125m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MOE_ARGUMENTS} \
${MODEL_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/dmoe/dmoe_356m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ EVALUATION_ARGUMENTS="\
--eval-interval 1000"


python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MOE_ARGUMENTS} \
${MODEL_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/dmoe/dmoe_46m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MOE_ARGUMENTS} \
${MODEL_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/dmoe/dmoe_760m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MOE_ARGUMENTS} \
${MODEL_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_125m_1gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_125m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_1315m_1gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_1315m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_356m_1gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_356m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_46m_1gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_46m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_760m_1gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/gpt2/gpt2_760m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MODEL_ARGUMENTS} \
${TRAINING_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/moe/moe_125m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MOE_ARGUMENTS} \
${MODEL_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/moe/moe_356m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MOE_ARGUMENTS} \
${MODEL_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion exp/moe/moe_46m_8gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ EVALUATION_ARGUMENTS="\
--log-interval 100 \
--eval-interval 1000"

python -m torch.distributed.launch ${DISTRIBUTED_ARGUMENTS} \
torchrun ${DISTRIBUTED_ARGUMENTS} \
third_party/Megatron-LM/pretrain_gpt.py \
${MOE_ARGUMENTS} \
${MODEL_ARGUMENTS} \
Expand Down
2 changes: 1 addition & 1 deletion third_party/Megatron-LM

0 comments on commit 396de2a

Please sign in to comment.