#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HYDRA_FULL_ERROR=1
export CLOSE_MATMUL_K_SHIFT=1

GPUS_PER_NODE=16
MASTER_ADDR=localhost
MASTER_PORT=6005
NNODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

torchrun $DISTRIBUTED_ARGS cli/train_dpo.py \
    --config-name dpo_qwen3_30b_a3b_A3 \
    | tee logs/RL_dpo_qwen3_30b_a3b_rank${NODE_RANK}.log