Checklist / 检查清单
Bug Description / Bug 描述
在一线 AI 服务使能过程中发现:Atlas A3 设备的网络调度不支持 NPU 背靠背直连场景下的 HCCS API 接口。两台机器通过灵衢互联时,训练可以正常拉起;而两台机器通过 NPU 背靠背直连时,使用同样的脚本无法拉起。
#!/usr/bin/env bash
# Launch script for one node (NODE_RANK=1) of a two-node `swift sft` full-parameter
# fine-tuning run with 16 devices per node.
#
# What it does, in order:
#   1. Points the HF / ModelScope / Triton caches and TMPDIR at per-host directories.
#   2. Exports the collective-communication and debug environment variables.
#   3. Exports the distributed-launch settings (master address/port, node count, rank).
#   4. Runs `swift sft` and mirrors stdout+stderr into a per-node log file.
set -euo pipefail

# ---- per-host cache directories -------------------------------------------
# The cache root is keyed by hostname so that nodes do not write into the
# same cache directories.
HOST_TAG=$(hostname)
cache_root="/sharedata/.cache/${HOST_TAG}"
export HF_HOME="${cache_root}/huggingface"
export HF_DATASETS_CACHE="${cache_root}/hf_datasets"
export MODELSCOPE_CACHE="${cache_root}/modelscope"
export HUGGINGFACE_HUB_CACHE="${cache_root}/huggingface/hub"
export TRITON_CACHE_DIR="${cache_root}/triton"
export TMPDIR="/tmp/${HOST_TAG}"
mkdir -p "$HF_HOME" "$HF_DATASETS_CACHE" "$MODELSCOPE_CACHE" "$HUGGINGFACE_HUB_CACHE" "$TRITON_CACHE_DIR" "$TMPDIR"

#===========================================================================
# NOTE(review): the NCCL_* variables configure NCCL; on Ascend NPUs the
# collective backend is HCCL, which presumably needs its own interface
# setting (e.g. HCCL_SOCKET_IFNAME) — confirm against the CANN documentation.
export NCCL_SOCKET_IFNAME=enp196s0f0 # replace with the actual NIC name
export NCCL_IB_DISABLE=1             # disable IB when there is no InfiniBand
export NCCL_P2P_DISABLE=0
export GLOO_SOCKET_IFNAME=enp196s0f0 # likewise, replace with the actual NIC name
#===========================================================================
export HCCL_DEBUG=INFO
export HCCL_DEBUG_SUBSYS=INIT,NET
export TORCH_DISTRIBUTED_DEBUG=DETAIL

#================== distributed config =====================================
export MASTER_ADDR=195.27.8.2
export MASTER_PORT=29999
export NNODES=2
export NODE_RANK=1
export NPROC_PER_NODE=16
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15

#=================== training config ========================================
export RUN_NAME=qwen3-0.6b-music-train-cus
MODEL_PATH="./qwen3-0.6b-music"
OUTPUT_DIR="${RUN_NAME}"
mkdir -p "${OUTPUT_DIR}"

echo "Starting multi-node training..."
echo "MASTER_ADDR=${MASTER_ADDR}"
echo "MASTER_PORT=${MASTER_PORT}"
echo "NNODES=${NNODES}, NODE_RANK=${NODE_RANK}, NPROC_PER_NODE=${NPROC_PER_NODE}"
sleep 5

# Everything from the two variable prefixes through `tee` is ONE pipeline.
# Each trailing backslash is required: without it the shell runs `swift sft`
# with no arguments and then tries to execute `--model ...` as a command.
# The log name is derived from NODE_RANK so it always matches the rank
# exported above (train_node_1.log for the current value).
# NOTE(review): the lowercase nproc_per_node and CUDA_VISIBLE_DEVICES prefixes
# are kept as written; whether the launcher reads them on Ascend hardware is
# not visible here — NPROC_PER_NODE and ASCEND_RT_VISIBLE_DEVICES are already
# exported above.
nproc_per_node=16 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 \
swift sft \
  --model "${MODEL_PATH}" \
  --tuner_type full \
  --model_type qwen3 \
  --template qwen3 \
  --dataset './Muse_train/train_cn.jsonl' './Muse_train/train_en.jsonl' \
  --val_dataset './Muse_train/val.jsonl' \
  --num_train_epochs 5 \
  --learning_rate 5e-4 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 4 \
  --eval_steps 100 \
  --save_steps 1000 \
  --save_total_limit 20 \
  --save_only_model true \
  --logging_steps 10 \
  --max_length 9000 \
  --output_dir "${OUTPUT_DIR}" \
  --warmup_ratio 0.005 \
  --dataloader_num_workers 32 \
  --dataset_num_proc 16 \
  --deepspeed zero2 \
  --max_grad_norm 1.0 \
  --weight_decay 0.1 \
  --report_to tensorboard \
  2>&1 | tee "${OUTPUT_DIR}/train_node_${NODE_RANK}.log"
How to Reproduce / 如何复现
Additional Information / 补充信息
No response
Checklist / 检查清单
Bug Description / Bug 描述
在一线 AI 服务使能过程中发现:Atlas A3 设备的网络调度不支持 NPU 背靠背直连场景下的 HCCS API 接口。两台机器通过灵衢互联时,训练可以正常拉起;而两台机器通过 NPU 背靠背直连时,使用同样的脚本无法拉起。
#!/usr/bin/env bash
# Launch script for one node (NODE_RANK=1) of a two-node `swift sft` full-parameter
# fine-tuning run with 16 devices per node.
#
# What it does, in order:
#   1. Points the HF / ModelScope / Triton caches and TMPDIR at per-host directories.
#   2. Exports the collective-communication and debug environment variables.
#   3. Exports the distributed-launch settings (master address/port, node count, rank).
#   4. Runs `swift sft` and mirrors stdout+stderr into a per-node log file.
set -euo pipefail

# ---- per-host cache directories -------------------------------------------
# The cache root is keyed by hostname so that nodes do not write into the
# same cache directories.
HOST_TAG=$(hostname)
cache_root="/sharedata/.cache/${HOST_TAG}"
export HF_HOME="${cache_root}/huggingface"
export HF_DATASETS_CACHE="${cache_root}/hf_datasets"
export MODELSCOPE_CACHE="${cache_root}/modelscope"
export HUGGINGFACE_HUB_CACHE="${cache_root}/huggingface/hub"
export TRITON_CACHE_DIR="${cache_root}/triton"
export TMPDIR="/tmp/${HOST_TAG}"
mkdir -p "$HF_HOME" "$HF_DATASETS_CACHE" "$MODELSCOPE_CACHE" "$HUGGINGFACE_HUB_CACHE" "$TRITON_CACHE_DIR" "$TMPDIR"

#===========================================================================
# NOTE(review): the NCCL_* variables configure NCCL; on Ascend NPUs the
# collective backend is HCCL, which presumably needs its own interface
# setting (e.g. HCCL_SOCKET_IFNAME) — confirm against the CANN documentation.
export NCCL_SOCKET_IFNAME=enp196s0f0 # replace with the actual NIC name
export NCCL_IB_DISABLE=1             # disable IB when there is no InfiniBand
export NCCL_P2P_DISABLE=0
export GLOO_SOCKET_IFNAME=enp196s0f0 # likewise, replace with the actual NIC name
#===========================================================================
export HCCL_DEBUG=INFO
export HCCL_DEBUG_SUBSYS=INIT,NET
export TORCH_DISTRIBUTED_DEBUG=DETAIL

#================== distributed config =====================================
export MASTER_ADDR=195.27.8.2
export MASTER_PORT=29999
export NNODES=2
export NODE_RANK=1
export NPROC_PER_NODE=16
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15

#=================== training config ========================================
export RUN_NAME=qwen3-0.6b-music-train-cus
MODEL_PATH="./qwen3-0.6b-music"
OUTPUT_DIR="${RUN_NAME}"
mkdir -p "${OUTPUT_DIR}"

echo "Starting multi-node training..."
echo "MASTER_ADDR=${MASTER_ADDR}"
echo "MASTER_PORT=${MASTER_PORT}"
echo "NNODES=${NNODES}, NODE_RANK=${NODE_RANK}, NPROC_PER_NODE=${NPROC_PER_NODE}"
sleep 5

# Everything from the two variable prefixes through `tee` is ONE pipeline.
# Each trailing backslash is required: without it the shell runs `swift sft`
# with no arguments and then tries to execute `--model ...` as a command.
# The log name is derived from NODE_RANK so it always matches the rank
# exported above (train_node_1.log for the current value).
# NOTE(review): the lowercase nproc_per_node and CUDA_VISIBLE_DEVICES prefixes
# are kept as written; whether the launcher reads them on Ascend hardware is
# not visible here — NPROC_PER_NODE and ASCEND_RT_VISIBLE_DEVICES are already
# exported above.
nproc_per_node=16 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 \
swift sft \
  --model "${MODEL_PATH}" \
  --tuner_type full \
  --model_type qwen3 \
  --template qwen3 \
  --dataset './Muse_train/train_cn.jsonl' './Muse_train/train_en.jsonl' \
  --val_dataset './Muse_train/val.jsonl' \
  --num_train_epochs 5 \
  --learning_rate 5e-4 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 4 \
  --eval_steps 100 \
  --save_steps 1000 \
  --save_total_limit 20 \
  --save_only_model true \
  --logging_steps 10 \
  --max_length 9000 \
  --output_dir "${OUTPUT_DIR}" \
  --warmup_ratio 0.005 \
  --dataloader_num_workers 32 \
  --dataset_num_proc 16 \
  --deepspeed zero2 \
  --max_grad_norm 1.0 \
  --weight_decay 0.1 \
  --report_to tensorboard \
  2>&1 | tee "${OUTPUT_DIR}/train_node_${NODE_RANK}.log"
How to Reproduce / 如何复现
Additional Information / 补充信息
No response