Set additional flags for a3 and a3plus
michelle-yooh committed Jun 5, 2024
1 parent d185209 commit eafc6af
Showing 7 changed files with 48 additions and 82 deletions.
7 changes: 0 additions & 7 deletions MaxText/configs/a3/llama_2_7b/16vm.sh
@@ -1,7 +1,4 @@
echo "Running 16vm.sh"
# Example command to invoke this script
# bash MaxText/configs/a3/llama_2_7b/16vm.sh
#
# Example command to invoke this script via XPK
# python3 xpk/xpk.py workload create --cluster ${CLUSTER_NAME} \
# --workload ${WORKLOAD_NAME} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME} \
@@ -20,10 +17,6 @@ for ARGUMENT in "$@"; do
export "$KEY"="$VALUE"
done

export XLA_PYTHON_CLIENT_MEM_FRACTION=0.85
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_FUSED_ATTN=1
export NCCL_DEBUG=VERSION
export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_triton_gemm=false --xla_gpu_graph_level=0
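Only the loop header and the export line of the per-script argument loop are visible in the hunk above. As a point of reference, here is a minimal sketch of how such a KEY=VALUE loop is typically written; the IFS split line is an assumption, since it is not part of the visible diff.

# Hedged sketch of the argument loop used by these configs; the split line is
# assumed, only the "for" header and the export appear in the hunk above.
for ARGUMENT in "$@"; do
    IFS='=' read -r KEY VALUE <<< "$ARGUMENT"   # e.g. "RUN_NAME=my_run" becomes KEY=RUN_NAME, VALUE=my_run
    export "$KEY"="$VALUE"
done

Invoked as, for example, bash MaxText/configs/a3/llama_2_7b/16vm.sh RUN_NAME=my_run OUTPUT_PATH=my_bucket (hypothetical values), each pair is exported as an environment variable before the rest of the script runs.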
8 changes: 1 addition & 7 deletions MaxText/configs/a3/llama_2_7b/1vm.sh
@@ -1,7 +1,5 @@
echo "Running 1vm.sh"
# Example command to invoke this script
# bash MaxText/configs/a3/llama_2_7b/1vm.sh
#

# Example command to invoke this script via XPK
# python3 xpk/xpk.py workload create --cluster ${CLUSTER_NAME} \
# --workload ${WORKLOAD_NAME} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME} \
@@ -20,10 +18,6 @@ for ARGUMENT in "$@"; do
export "$KEY"="$VALUE"
done

export XLA_PYTHON_CLIENT_MEM_FRACTION=0.85
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_FUSED_ATTN=1
export NCCL_DEBUG=VERSION
export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
--xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true
8 changes: 1 addition & 7 deletions MaxText/configs/a3/llama_2_7b/2vm.sh
@@ -1,7 +1,5 @@
echo "Running 2vm.sh"
# Example command to invoke this script
# bash MaxText/configs/a3/llama_2_7b/2vm.sh
#

# Example command to invoke this script via XPK
# python3 xpk/xpk.py workload create --cluster ${CLUSTER_NAME} \
# --workload ${WORKLOAD_NAME} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME} \
@@ -20,10 +18,6 @@ for ARGUMENT in "$@"; do
export "$KEY"="$VALUE"
done

export XLA_PYTHON_CLIENT_MEM_FRACTION=0.85
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_FUSED_ATTN=1
export NCCL_DEBUG=VERSION
export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_triton_gemm=false --xla_gpu_graph_level=0
7 changes: 0 additions & 7 deletions MaxText/configs/a3/llama_2_7b/4vm.sh
@@ -1,7 +1,4 @@
echo "Running 4vm.sh"
# Example command to invoke this script
# bash MaxText/configs/a3/llama_2_7b/4vm.sh
#
# Example command to invoke this script via XPK
# python3 xpk/xpk.py workload create --cluster ${CLUSTER_NAME} \
# --workload ${WORKLOAD_NAME} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME} \
@@ -20,10 +17,6 @@ for ARGUMENT in "$@"; do
export "$KEY"="$VALUE"
done

export XLA_PYTHON_CLIENT_MEM_FRACTION=0.85
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_FUSED_ATTN=1
export NCCL_DEBUG=VERSION
export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
--xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true
7 changes: 0 additions & 7 deletions MaxText/configs/a3/llama_2_7b/8vm.sh
@@ -1,7 +1,4 @@
echo "Running 8vm.sh"
# Example command to invoke this script
# bash MaxText/configs/a3/llama_2_7b/8vm.sh
#
# Example command to invoke this script via XPK
# python3 xpk/xpk.py workload create --cluster ${CLUSTER_NAME} \
# --workload ${WORKLOAD_NAME} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME} \
@@ -20,10 +17,6 @@ for ARGUMENT in "$@"; do
export "$KEY"="$VALUE"
done

export XLA_PYTHON_CLIENT_MEM_FRACTION=0.85
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_FUSED_ATTN=1
export NCCL_DEBUG=VERSION
export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_triton_gemm=false --xla_gpu_graph_level=0
5 changes: 1 addition & 4 deletions end_to_end/gpu/a3/test_llama2_7b.sh
@@ -1,5 +1,6 @@
#!/bin/bash


# This file is both an integration test that runs once a day on an A3 and documentation for how to get started with Llama2-7b

# The flow of this file is as follows:
@@ -47,10 +48,6 @@ for ARGUMENT in "$@"; do
export "$KEY"="$VALUE"
done

export XLA_PYTHON_CLIENT_MEM_FRACTION=0.85
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_FUSED_ATTN=1
export NCCL_DEBUG=VERSION

export XLA_FLAGS="--xla_dump_to=$BASE_OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
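The XLA_FLAGS exports in these scripts spread many flags across several lines inside a single quoted string. XLA's flag parser splits the environment variable on whitespace, so the embedded newlines act only as separators between flags. A minimal sketch, assuming BASE_OUTPUT_PATH and RUN_NAME have already been exported by the caller:

# Minimal sketch: one quoted value, individual flags separated by any whitespace.
# BASE_OUTPUT_PATH and RUN_NAME are assumed to be set by the caller.
export XLA_FLAGS="--xla_dump_to=$BASE_OUTPUT_PATH/$RUN_NAME/HLO_dumps/
    --xla_gpu_enable_latency_hiding_scheduler=true
    --xla_gpu_enable_triton_gemm=false"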
88 changes: 45 additions & 43 deletions gpu_multi_process_run.sh
@@ -16,55 +16,57 @@ export JAX_COORDINATOR_PORT=$JAX_COORDINATOR_PORT
export JAX_COORDINATOR_ADDRESS=$JAX_COORDINATOR_ADDRESS

set_nccl_gpudirect_tcpx_specific_configuration() {
if [[ "$USE_GPUDIRECT" == "tcpx" ]]; then
echo "Using GPUDirect-TCPX"
if [[ "$USE_GPUDIRECT" == "tcpx" ]] || [[ "$USE_GPUDIRECT" == "fastrak" ]]; then
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_CROSS_NIC=0
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_DEBUG=INFO
export NCCL_NET_GDR_LEVEL=PIX
export NCCL_P2P_PXN_LEVEL=0
export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV,TUNING,NET,VERSION
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/tcpx/lib64"
export NCCL_GPUDIRECTTCPX_FORCE_ACK=0
export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000
export NCCL_DYNAMIC_CHUNK_SIZE=524288
export NCCL_P2P_NET_CHUNKSIZE=524288
export NCCL_P2P_PCI_CHUNKSIZE=524288
export NCCL_P2P_NVL_CHUNKSIZE=1048576
export NCCL_NSOCKS_PERTHREAD=4
export NCCL_SOCKET_NTHREADS=1
export NCCL_MAX_NCHANNELS=12
export NCCL_MIN_NCHANNELS=12
export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000
export NCCL_SOCKET_IFNAME=eth0
export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,124-139;eth2:22-35,124-139;eth3:74-87,178-191;eth4:74-87,178-191"
export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4
export NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0
export NCCL_NET_GDR_LEVEL=PIX
export NCCL_NVLS_ENABLE=0
elif [[ "$USE_GPUDIRECT" == "fastrak" ]]; then
echo "Using GPUDirect-TCPFasTrak"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/tcpxo/lib64"
export NCCL_FASTRAK_CTRL_DEV=eth0
export NCCL_FASTRAK_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
export NCCL_SOCKET_IFNAME=eth0
export NCCL_CROSS_NIC=0
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=4
export NCCL_DYNAMIC_CHUNK_SIZE=524288
export NCCL_P2P_NET_CHUNKSIZE=524288
export NCCL_P2P_PCI_CHUNKSIZE=524288
export NCCL_P2P_NVL_CHUNKSIZE=1048576
export NCCL_FASTRAK_NUM_FLOWS=2
export NCCL_FASTRAK_USE_SNAP=1
export NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL=0
export NCCL_BUFFSIZE=8388608
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_NET_GDR_LEVEL=PIX
export NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=0
export NCCL_FASTRAK_USE_LLCM=1
export NCCL_P2P_PCI_CHUNKSIZE=524288
export NCCL_PROTO=Simple
export NCCL_SOCKET_IFNAME=eth0
export NVTE_FUSED_ATTN=1
export TF_CPP_MAX_LOG_LEVEL=100
export TF_CPP_VMODULE=profile_guided_latency_estimator=10
export XLA_PYTHON_CLIENT_MEM_FRACTION=0.85

if [[ "$USE_GPUDIRECT" == "tcpx" ]]; then
echo "Using GPUDirect-TCPX"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/tcpx/lib64"
export NCCL_ALGO=Ring
export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV,TUNING,NET,VERSION
export NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0
export NCCL_GPUDIRECTTCPX_FORCE_ACK=0
export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000
export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,124-139;eth2:22-35,124-139;eth3:74-87,178-191;eth4:74-87,178-191"
export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4
export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000
export NCCL_MAX_NCHANNELS=12
export NCCL_MIN_NCHANNELS=12
export NCCL_NSOCKS_PERTHREAD=4
export NCCL_P2P_PXN_LEVEL=0
export NCCL_SOCKET_NTHREADS=1
elif [[ "$USE_GPUDIRECT" == "fastrak" ]]; then
echo "Using GPUDirect-TCPFasTrak"
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/tcpxo/lib64"
export NCCL_ALGO=Ring,Tree
export NCCL_BUFFSIZE=8388608
export NCCL_FASTRAK_CTRL_DEV=eth0
export NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL=0
export NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=0
export NCCL_FASTRAK_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
export NCCL_FASTRAK_NUM_FLOWS=2
export NCCL_FASTRAK_USE_LLCM=1
export NCCL_FASTRAK_USE_SNAP=1
export NCCL_MIN_NCHANNELS=4
export NCCL_TUNER_CONFIG_PATH=/usr/local/nvidia/lib64/a3plus_tuner_config.textproto
export NCCL_TUNER_PLUGIN=libnccl-tuner.so
fi
else
echo "NOT using GPUDirect"
fi
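After this change, set_nccl_gpudirect_tcpx_specific_configuration applies a shared set of exports to both transports and a nested branch then adds the tcpx-specific or fastrak-specific variables. A minimal usage sketch follows; it assumes USE_GPUDIRECT is supplied by the launcher and that tcpx corresponds to A3 while fastrak corresponds to A3+ (a3plus) machines.

# Hedged usage sketch: pick the transport, then apply the configuration.
# USE_GPUDIRECT is assumed to arrive as a KEY=VALUE argument or a launcher env var.
export USE_GPUDIRECT=fastrak                     # assumed mapping: "tcpx" on A3, "fastrak" on A3+
set_nccl_gpudirect_tcpx_specific_configuration   # exports the common and transport-specific NCCL vars
echo "NCCL_TUNER_PLUGIN=${NCCL_TUNER_PLUGIN:-unset}"   # the tuner plugin is only set on the fastrak path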
