# This is the default configuration of TransferQueue. Users may modify the default values
# and call transfer_queue.init(conf) to override the config entries.
# Prometheus metrics exporter.
metrics:
  # Set to true to turn the exporter on (off by default).
  enabled: false
  # HTTP port for the /metrics endpoint (0 = auto-assign a free port).
  port: 0
# TransferQueue controller settings.
controller:
  # User-defined sampler (string config). Users can pass a sampler instance
  # to override this value.
  sampler: SequentialSampler
  # Whether to return an empty BatchMeta instead of blocking the request
  # when not enough data is available.
  polling_mode: false
  # ZMQ server IP & ports (automatically generated during init).
  zmq_info: null
backend:
  # Pluggable storage/transport backend of TransferQueue. Choose from:
  # SimpleStorage, Yuanrong, MooncakeStore, ...
  storage_backend: SimpleStorage

  # SimpleStorage: ZMQ-based in-memory storage for out-of-the-box usage.
  SimpleStorage:
    # Maximum number of experience samples to hold across all storage units.
    total_storage_size: 100000
    # Number of distributed storage units.
    # Recommended: >= 2 x number of nodes for load balancing.
    num_data_storage_units: 2
    # ZMQ server IP & ports (automatically generated during init).
    zmq_info: null

  # MooncakeStore: high-performance KV-based hierarchical storage
  # that supports RDMA transport between GPU and DRAM.
  MooncakeStore:
    # Whether TransferQueue should automatically start the Mooncake metadata server.
    # WARNING: When set to `true`, TQ will attempt to terminate any existing
    # mooncake_master process.
    auto_init: true
    # Address of the metadata coordination server.
    metadata_server: localhost:50050
    # Address of the Mooncake master server.
    master_server_address: localhost:50051
    # Local host address visible to the Mooncake cluster.
    # Set to "" to auto-detect using Ray's node IP.
    local_hostname: ""
    # Transport protocol. Choose from: tcp, rdma.
    protocol: tcp
    # Global memory segment size in bytes **per client** for mounting (default: 4GB).
    global_segment_size: 4294967296
    # Local buffer size in bytes **per client** (default: 1GB).
    local_buffer_size: 1073741824
    # Network device name.
    # Set to "" to let Mooncake auto-select available devices.
    device_name: ""

  # Yuanrong backend settings.
  Yuanrong:
    # Whether to let TQ automatically initialize Yuanrong.
    auto_init: true
    # Datasystem worker port.
    worker_port: 31501
    # Metastore service port.
    metastore_port: 2379
    # Whether to enable NPU transport.
    enable_yr_npu_transport: false
    # Whether to enable host RDMA (H2H) transport via UCX.
    # Requires RDMA NIC hardware and the rdma-core driver.
    # See https://pages.openeuler.openatom.cn/openyuanrong-datasystem/docs/zh-cn/latest/best_practices/best_practices_for_rdma.html
    enable_rdma: false
    # UCX environment variables passed to the dscli subprocess.
    # Precedence: ucx_env_vars > parent env > TQ default
    # (UCX_TLS=rc_x when enable_rdma=true).
    #   UCX_TLS: RDMA transport mode. "rc_x" (default when RDMA is enabled
    #     and the variable is unset), "rc" (compatible), "ud" (low-latency),
    #     "dc" (large-scale).
    #   UCX_LOG_FILE: Path to the UCX log file. Requires UCX_LOG_LEVEL to be set.
    #   UCX_LOG_LEVEL: FATAL, ERROR, WARN, INFO, DEBUG, TRACE.
    #     Use DEBUG/TRACE for troubleshooting.
    #   UCX_NET_DEVICES: RDMA device name (e.g., mlx5_0:1). Required in multi-NIC setups.
    #   UCX_TCP_CM_ROUTE: TCP control-flow interface for UCX connection setup.
    # Example:
    #   ucx_env_vars: { UCX_TLS: rc_x, UCX_LOG_FILE: /tmp/ucx.log, UCX_LOG_LEVEL: ERROR }
    ucx_env_vars: {}
    # Additional arguments for the Yuanrong worker.
    # Recommended options for NPU environments:
    #   --remote_h2d_device_ids  Enable RH2D for efficient cross-node data transfer.
    #                            Specify NPU device IDs (comma-separated).
    #   --enable_huge_tlb        Enable huge page memory to improve performance.
    #                            Required for >21GB shared memory on 910B.
    #     Before enabling, OS configuration is required (root privilege):
    #       sysctl -w vm.nr_hugepages=<count>  (each page is 2MB, e.g. 65536 for 128GB)
    #       ulimit -l unlimited                (allow pinning enough memory for RDMA/Ascend)
    # Example: "--shared_memory_size_mb 16384 --remote_h2d_device_ids 0,1,2,3 --enable_huge_tlb true"
    worker_args: "--shared_memory_size_mb 8192"
# For RayStore:
RayStore: