From 7e67a9d18ce01dec4917319ef9ff4a9fc70d8461 Mon Sep 17 00:00:00 2001
From: Eusford_0526 <sunzijian4@huawei.com>
Date: Mon, 12 Jan 2026 17:15:54 +0800
Subject: [PATCH] local reconstruction via lighthouse

---
 src/Makefile             |    2 +-
 src/bootstrap.cc         |  475 +++++++--
 src/channel.cc           |   86 +-
 src/graph/connect.cc     | 1013 ++++++++++---------
 src/include/bootstrap.h  |   74 +-
 src/include/channel.h    |    1 +
 src/include/comm.h       |   46 +-
 src/include/lighthouse.h |   34 +
 src/include/scale.h      |   55 +
 src/include/serialize.h  |  507 ++++++++++
 src/init.cc              | 2079 +++++++++++++++++++++++++++++++++-----
 src/lighthouse.cc        |  339 +++++++
 src/misc/shmutils.cc     |   22 +-
 src/misc/socket.cc       | 1960 +++++++++++++++++------------------
 src/nccl.h.in            |  524 +++++-----
 src/scale.cc             |  673 ++++++++++++
 16 files changed, 5778 insertions(+), 2112 deletions(-)
 create mode 100644 src/include/lighthouse.h
 create mode 100644 src/include/scale.h
 create mode 100644 src/include/serialize.h
 create mode 100644 src/lighthouse.cc
 create mode 100644 src/scale.cc

diff --git a/src/Makefile b/src/Makefile
index eab662e..a9e9406 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
 INCEXPORTS  := nccl.h
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
+	init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc scale.cc lighthouse.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc) \
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index f053372..77712e0 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -1,14 +1,15 @@
 /*************************************************************************
- * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
 
 #include "nccl.h"
 #include "core.h"
 #include "utils.h"
 #include "bootstrap.h"
 #include "net.h"
+#include "lighthouse.h"
 #include <unistd.h>
 #include <sys/types.h>
 #include "proxy.h"
@@ -42,11 +43,33 @@
   } while (0)
 
 #define BOOTSTRAP_PID(i, n) (((i) + (n)) % (n))
+
+// struct ncclCommTrans {
+//   struct ncclTopoRanks* peerTopo; // length: nRanks
+//   struct ncclPeerInfo* peerInfo;
+//   int* nodesFirstRank; // length: nRanks
+//   int* nodesTreePatterns; // length: nRanks
+//   int* ringPrev; // length: nRanks*MAXCHANNELS
+//   int* ringNext; // length: nRanks*MAXCHANNELS
+//   int* peerRings; // length: nRanks*MAXCHANNELS
+//   void* bootstrap;
+//   int nRanks;  // number of GPUs in communicator
+//   int cudaDev;
+//   int* rankToNode;
+//   struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+// };
+
 // returns the first rank associated to the root. must have root >=0
 // if root >= n_roots, it does NOT assume periodicity
 static int firstRankFromRoot(int root, int n_ranks, int nRoots) {
   return root * (n_ranks / nRoots) + std::min(root, n_ranks % nRoots);
 }
+
+// typedef struct {
+//   int socket_fd;
+//   int rank_id;
+//   char ip_address[INET_ADDRSTRLEN];
+// } RankConnection;
 // returns the root of a rank, must have rank >=0
 // if rank >= n_ranks, it does NOT assume periodicity
 static int rootIdFromRank(int rank, int nRanks, int nRoots) {
@@ -79,6 +102,7 @@ static int isFirstFromRoot(int rank, int root, int nRanks, int nRoots) {
 struct bootstrapRootArgs {
   struct ncclSocket* listenSock;
   uint64_t magic;
+  int sock;
 };
 
 /* Init functions */
@@ -103,7 +127,7 @@ ncclResult_t bootstrapNetInit() {
           return ncclInvalidArgument;
         }
         NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
-                                               &nIfs));
+                                              &nIfs));
         if (nIfs <= 0) {
           WARN("NET/Socket : No usable listening interface found");
           pthread_mutex_unlock(&bootstrapNetLock);
@@ -128,6 +152,7 @@ ncclResult_t bootstrapNetInit() {
   return ncclSuccess;
 }
 
+
 /* Socket Interface Selection type */
 enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
 
@@ -153,7 +178,7 @@ static ncclResult_t netDereg(ncclNet_t* net, void* comm, void** handle) {
   return ncclSuccess;
 }
 static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int size, void* dataHandle, int tag, void** sendReq,
-                             int* done) {
+                            int* done) {
   if (*done) return ncclSuccess;
   if (!*sendReq) {
     NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq));
@@ -167,7 +192,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz
   return ncclSuccess;
 }
 static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int size, void* dataHandle, int tag, void** recvReq,
-                             int* done) {
+                            int* done) {
   if (*done) return ncclSuccess;
   if (!*recvReq) {
     size_t size64 = size;
@@ -218,7 +243,7 @@ static ncclResult_t socketRecv(struct ncclSocket* sock, void* data, int size) {
   return ncclSuccess;
 }
 static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock,
-                                   void* recvData, int recvSize) {
+                                  void* recvData, int recvSize) {
   int senderRecvSize;
   NCCLCHECK(ncclSocketSendRecv(sendSock, &sendSize, sizeof(int), recvSock, &senderRecvSize, sizeof(int)));
   if (senderRecvSize > recvSize) {
@@ -267,7 +292,8 @@ fail:
   (void)ncclSocketClose(&sock);
   return res;
 }
-static void* bootstrapRoot(void* rargs) {
+void printNcclSocketAddress(union ncclSocketAddress *addr);
+static void* bootstrapRoot(void* rargs) {// NOTE(review): author's note says this part may be duplicated/redundant - clarify or remove
   uint64_t timers[BOOTSTRAP_INIT_ROOT_N] = {0};
   struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs;
   struct ncclSocket* listenSock = args->listenSock;
@@ -278,11 +304,14 @@ static void* bootstrapRoot(void* rargs) {
   int nrecv = 0, n2send = 0;
   struct extInfo info;
   union ringConnectInfo* rankInfo = NULL;
+  union ncclSocketAddress* nextPeerAddrInfo = NULL;
   union ncclSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange
   // get zeros for comparison
   char zeroHandle[NCCL_NET_HANDLE_MAXSIZE];
   union ncclSocketAddress zeroAddress;
   union ringConnectInfo zeroInfo;
+  struct LhTxn* lhTxn = NULL;
+  struct LhState* lhState = NULL;
   memset(&zeroAddress, 0, sizeof(union ncclSocketAddress));
   memset(&zeroHandle, 0, NCCL_NET_HANDLE_MAXSIZE);
   memset(&zeroInfo, 0, sizeof(union ringConnectInfo));
@@ -309,6 +338,7 @@ static void* bootstrapRoot(void* rargs) {
       nrecv = n2send + ((nroots > 1) ? 1 : 0);
       NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv), res, out);
       NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nrecv), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&nextPeerAddrInfo, nrecv), res, out);
     }
 
     if (nranks != info.nranks || nroots != info.nroots || iroot != info.iroot) {
@@ -328,8 +358,13 @@ static void* bootstrapRoot(void* rargs) {
     int prev = (nroots > 1) ? (localId - 1) : BOOTSTRAP_PID(localId - 1, nrecv);
     if (prev >= 0 && prev < n2send && memcmp(&zeroAddress, &rankAddressesRoot[prev], sizeof(union ncclSocketAddress)) != 0) {
       NCCLCHECKGOTO(rootSend(&rankAddressesRoot[prev], magic, &info.connectInfo), res, out);
+      memcpy(&nextPeerAddrInfo[localId], &info.connectInfo.addr, sizeof(union ncclSocketAddress));
     } else {
       memcpy(&rankInfo[localId], &info.connectInfo, sizeof(union ringConnectInfo));
+      // nextPeerAddrInfo holds nrecv entries (same as rankInfo), so it is indexed by localId in both
+      // branches: the global info.rank can exceed nrecv as soon as more than one root is used.
+      // NOTE(review): with a single root localId presumably equals info.rank - confirm.
+      memcpy(&nextPeerAddrInfo[localId], &info.connectInfo.addr, sizeof(union ncclSocketAddress));
     }
     // if the next rank has checked in, send the newly received info, if not save the addr for later
     // for nroots >=1, I will always own the information of the next connection
@@ -339,10 +374,38 @@ static void* bootstrapRoot(void* rargs) {
       NCCLCHECKGOTO(rootSend(&info.listenRootAddress, magic, &rankInfo[next]), res, out);
     } else {
       memcpy(rankAddressesRoot + localId, &info.listenRootAddress, sizeof(union ncclSocketAddress));
+      //memcpy(&nextPeerAddrInfo[info.rank], &info.listenRootAddress, sizeof(union ncclSocketAddress));
     }
     ++c;
     TRACE(NCCL_BOOTSTRAP, "Received connect from rank %d total %d/%d", info.rank, c, nrecv);
   } while (c < nrecv);
+  INFO(NCCL_INIT,"bootstrapRoot nrecv %d",nrecv);
+  // Debug dump of at most the first two entries: the table only holds nrecv entries and nrecv can be 1.
+  for (int i = 0; i < nrecv && i < 2; ++i) printNcclSocketAddress(&nextPeerAddrInfo[i]);
+  // Store the collected addresses in the lighthouse state (load, initialize, save) within one txnBegin/txnEnd pair.
+  if (txnBegin(LH_STATE_PATH, 1, &lhTxn) != 0) {
+    WARN("lighthouse: txnBegin failed");
+    res = ncclInternalError;
+    goto out;
+  }
+  if (txnLoad(lhTxn, &lhState) != 0) {
+    WARN("lighthouse: txnLoad failed");
+    res = ncclInternalError;
+    goto out;
+  }
+  initialize(lhState, nextPeerAddrInfo, nrecv, magic);
+  printLhState(lhState);
+  if (txnSave(lhTxn, lhState) != 0) {
+    WARN("lighthouse: txnSave failed");
+    res = ncclInternalError;
+    goto out;
+  }
+  if (txnEnd(lhTxn) != 0) {
+    WARN("lighthouse: txnEnd failed");
+    res = ncclInternalError;
+    goto out;
+  }
+
   TRACE(NCCL_BOOTSTRAP, "COLLECTED ALL %d HANDLES", nrecv);
   BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_RECV]);
 
@@ -352,6 +415,7 @@ static void* bootstrapRoot(void* rargs) {
   for (int r = 0; r < n2send; ++r) {
     // use nrecv to periodize: if 1 root, we will send the first one to the last one, if >1 roots we will send the additional one we have received
     int next = BOOTSTRAP_PID(r + 1, nrecv);
+    //printNcclSocketAddress(&rankInfo[next].addr);
     if (memcmp(&zeroAddress, &rankAddressesRoot[r], sizeof(union ncclSocketAddress)) != 0 &&
         memcmp(&zeroInfo, &rankInfo[next], sizeof(union ringConnectInfo)) != 0) {
       NCCLCHECKGOTO(rootSend(&rankAddressesRoot[r], magic, &rankInfo[next]), res, out);
@@ -359,6 +423,7 @@ static void* bootstrapRoot(void* rargs) {
   }
   BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_SEND]);
   TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "Root timings (wait %f, recv %f, send %f)", timers[BOOTSTRAP_INIT_ROOT_WAIT] / 1e9, timers[BOOTSTRAP_INIT_ROOT_RECV] / 1e9, timers[BOOTSTRAP_INIT_ROOT_SEND] / 1e9);
+  INFO(NCCL_INIT,"bootstrapRoot DONE"); 
 out:
   if (listenSock != NULL) {
     (void)ncclSocketClose(listenSock);
@@ -369,6 +434,10 @@ out:
   if (rankAddressesRoot)
     free(rankAddressesRoot);
   free(rargs);
+  // free(NULL) is a no-op, so these pointers need no guards.
+  free(nextPeerAddrInfo); // allocated next to rankInfo/rankAddressesRoot but was never released
+  free(lhTxn);
+  free(lhState);
 
   TRACE(NCCL_BOOTSTRAP, "DONE");
   return NULL;
@@ -419,57 +488,57 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
   return ncclSuccess;
 }
 
-struct unexConn {
-  int peer;
-  int tag;
-  struct ncclSocket sock;
-  struct unexConn* next;
-};
-
-struct bootstrapRing_t {
-  union {
-    struct {
-      void *sendComm, *recvComm;
-      ncclNetDeviceHandle_t *sendDevHandle, *recvDevHandle;
-    } net;
-    struct {
-      struct ncclSocket recv;
-      struct ncclSocket send;
-    } socket;
-  };
-};
-struct bootstrapListen_t {
-  struct ncclSocket peerSocket; // socket for peers to contact me in P2P
-  union {
-    struct {
-      int dev;
-      void* comm;
-      char handle[NCCL_NET_HANDLE_MAXSIZE];
-    } net;
-    struct ncclSocket socket; // socket to be used for the ring
-  };
-};
-
-struct bootstrapState {
-  struct bootstrapRing_t ring;
-  struct bootstrapListen_t listen;
-  ncclNet_t* net;
-  uint64_t* peerProxyAddressesUDS;
-  union ncclSocketAddress* peerProxyAddresses;
-  union ncclSocketAddress* peerP2pAddresses;
-  struct unexConn* unexpectedConnections;
-  int cudaDev;
-  int rank;
-  int nranks;
-  uint64_t magic;
-  volatile uint32_t* abortFlag;
-};
+// struct unexConn {
+//   int peer;
+//   int tag;
+//   struct ncclSocket sock;
+//   struct unexConn* next;
+// };
+
+// struct bootstrapRing_t {
+//   union {
+//     struct {
+//       void *sendComm, *recvComm;
+//       ncclNetDeviceHandle_t *sendDevHandle, *recvDevHandle;
+//     } net;
+//     struct {
+//       struct ncclSocket recv;
+//       struct ncclSocket send;
+//     } socket;
+//   };
+// };
+// struct bootstrapListen_t {
+//   struct ncclSocket peerSocket; // socket for peers to contact me in P2P
+//   union {
+//     struct {
+//       int dev;
+//       void* comm;
+//       char handle[NCCL_NET_HANDLE_MAXSIZE];
+//     } net;
+//     struct ncclSocket socket; // socket to be used for the ring
+//   };
+// };
+
+// struct bootstrapState {
+//   struct bootstrapRing_t ring;
+//   struct bootstrapListen_t listen;
+//   ncclNet_t* net;
+//   uint64_t* peerProxyAddressesUDS;
+//   union ncclSocketAddress* peerProxyAddresses;
+//   union ncclSocketAddress* peerP2pAddresses;
+//   struct unexConn* unexpectedConnections;
+//   int cudaDev;
+//   int rank;
+//   int nranks;
+//   uint64_t magic;
+//   volatile uint32_t* abortFlag;
+// };
 #define STATE_RING(s, f) (s->ring.f)
 #define STATE_LISTEN(s, f) (s->listen.f)
 
 // helper functions
-static ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t magic, struct ncclSocket* socket, union ncclSocketAddress* addr,
-                                       ncclSocketType type) {
+ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t magic, struct ncclSocket* socket, union ncclSocketAddress* addr,
+                                      ncclSocketType type) {
   NCCLCHECK(ncclSocketInit(socket, &bootstrapNetIfAddr, magic, type, comm->abortFlag));
   NCCLCHECK(ncclSocketListen(socket));
   NCCLCHECK(ncclSocketGetAddr(socket, addr));
@@ -536,8 +605,8 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
 }
 
 static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
-                                   void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
-                                   void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {
+                                  void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
+                                  void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {
 
   int abortCounter = 0;
   do {
@@ -549,13 +618,26 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
   } while (!*sendComm || !*recvComm);
   return ncclSuccess;
 }
-static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket* sendSocket, struct ncclSocket* listenSock, struct ncclSocket* recvSocket, uint64_t magic, volatile uint32_t* abortFlag) {
+ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket* sendSocket, struct ncclSocket* listenSock, struct ncclSocket* recvSocket, uint64_t magic, volatile uint32_t* abortFlag) {
   NCCLCHECK(ncclSocketInit(sendSocket, addr, magic, ncclSocketTypeBootstrap, abortFlag));
   NCCLCHECK(ncclSocketConnect(sendSocket));
   NCCLCHECK(ncclSocketInit(recvSocket));
   NCCLCHECK(ncclSocketAccept(recvSocket, listenSock));
   return ncclSuccess;
 }
+// Connect-only half of socketRingConnect: initializes sendSocket as a bootstrap socket for addr and connects it (no accept).
+ncclResult_t socketRingConnectPrev(ncclSocketAddress* addr, struct ncclSocket* sendSocket, uint64_t magic, volatile uint32_t* abortFlag) {
+  NCCLCHECK(ncclSocketInit(sendSocket, addr, magic, ncclSocketTypeBootstrap, abortFlag));
+  NCCLCHECK(ncclSocketConnect(sendSocket));
+  return ncclSuccess;
+}
+// Accept-only half of socketRingConnect: initializes recvSocket and accepts one connection on listenSock (no connect).
+ncclResult_t socketRingConnectNext(struct ncclSocket* listenSock, struct ncclSocket* recvSocket) {
+  NCCLCHECK(ncclSocketInit(recvSocket));
+  NCCLCHECK(ncclSocketAccept(recvSocket, listenSock));
+  return ncclSuccess;
+}
+
 static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* state,
                                 union ncclSocketAddress* peerAddresss,
                                 union ncclSocketAddress* peerProxy, uint64_t* peerUDS,
@@ -619,7 +701,58 @@ NCCL_PARAM(StaggerThreshold, "UID_STAGGER_THRESHOLD", 256);
 
 NCCL_PARAM(RasEnable, "RAS_ENABLE", 1);
 
-ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
+// Debug helper: logs the address family, IP and port of *addr at INFO level (NCCL_INIT); a NULL
+// addr only logs a notice. Formats carry no trailing "\n" because the NCCL logger appends one.
+void printNcclSocketAddress(union ncclSocketAddress *addr) {
+    if (addr == NULL) {
+        INFO(NCCL_INIT,"地址为空");
+        return;
+    }
+
+    // Pick the union member to read from the address family.
+    switch (addr->sa.sa_family) {
+        case AF_INET: {
+            // IPv4 address: use the sin member.
+            struct sockaddr_in *ipv4 = &addr->sin;
+            char ip_str[INET_ADDRSTRLEN];
+
+            // Convert the IP address to text; never log an uninitialized buffer if that fails.
+            if (inet_ntop(AF_INET, &(ipv4->sin_addr), ip_str, INET_ADDRSTRLEN) == NULL) ip_str[0] = '\0';
+
+            INFO(NCCL_INIT,"IPv4 地址信息:");
+            INFO(NCCL_INIT,"  地址族: AF_INET");
+            INFO(NCCL_INIT,"  IP地址: %s", ip_str);
+            INFO(NCCL_INIT,"  端口号: %d (网络字节序: 0x%x)",
+                   ntohs(ipv4->sin_port),  // converted to host byte order
+                   ipv4->sin_port);
+            break;
+        }
+        case AF_INET6: {
+            // IPv6 address: use the sin6 member.
+            struct sockaddr_in6 *ipv6 = &addr->sin6;
+            char ip_str[INET6_ADDRSTRLEN];
+
+            // Convert the IP address to text; never log an uninitialized buffer if that fails.
+            if (inet_ntop(AF_INET6, &(ipv6->sin6_addr), ip_str, INET6_ADDRSTRLEN) == NULL) ip_str[0] = '\0';
+
+            INFO(NCCL_INIT,"IPv6 地址信息:");
+            INFO(NCCL_INIT,"  地址族: AF_INET6");
+            INFO(NCCL_INIT,"  IP地址: %s", ip_str);
+            INFO(NCCL_INIT,"  端口号: %d (网络字节序: 0x%x)",
+                   ntohs(ipv6->sin6_port),  // converted to host byte order
+                   ipv6->sin6_port);
+            INFO(NCCL_INIT,"  流标签: %u", ntohl(ipv6->sin6_flowinfo));
+            INFO(NCCL_INIT,"  作用域ID: %u", ipv6->sin6_scope_id);
+            break;
+        }
+        default:
+            // Unknown address family: only the raw family value is logged.
+            INFO(NCCL_INIT,"未知地址类型 (sa_family: %d)", addr->sa.sa_family);
+            break;
+    }
+}
+
+ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {// 
   ncclResult_t result = ncclSuccess;
   int rank = comm->rank;
   int nranks = comm->nRanks;
@@ -702,7 +835,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
   BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RECV]);
   NCCLCHECK(ncclSocketInit(&sock));
   NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
-  NCCLCHECK(socketRecv(&sock, &nextPeer, sizeof(nextPeer)));
+  NCCLCHECK(socketRecv(&sock, &nextPeer, sizeof(nextPeer)));// NOTE(review): author's open question - the rank already connected to the root, so why is a second connection made in the reverse direction to deliver nextPeer?
   NCCLCHECK(ncclSocketClose(&sock));
   NCCLCHECK(ncclSocketClose(&listenSockRoot));
   BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RECV]);
@@ -710,9 +843,11 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
   // accept and connect the ring network
   if (ncclParamBootstrapNetEnable()) {
     NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle,
-                             &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
-                             &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
+                            &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
+                            &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
   } else {
+    INFO(NCCL_INIT,"nextPeer.addr");
+    printNcclSocketAddress(&nextPeer.addr);
     NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag));
   }
 
@@ -759,17 +894,18 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
 
   if (ncclParamRasEnable() == 1 && performRasAddRanks) {
     if (ncclRasAddRanks(rasRanks, nranks) != ncclSuccess)
-      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+      INFO(NCCL_INIT|NCCL_RAS|NCCL_INIT, "Continuing in spite of a RAS initialization error");
   }
+  INFO(NCCL_INIT,"bootstrapInit DONE"); // this line runs in bootstrapInit; the old text named bootstrapInitNew
 
   BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_TOTAL]);
   TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks);
-  INFO(NCCL_BOOTSTRAP | NCCL_PROFILE, "Bootstrap timings total %f (create %f, send %f, recv %f, ring %f, delay %f)", timers[BOOTSTRAP_INIT_TIME_TOTAL] / 1e9,
-       timers[BOOTSTRAP_INIT_TIME_CREATE] / 1e9,
-       timers[BOOTSTRAP_INIT_TIME_SEND] / 1e9,
-       timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9,
-       timers[BOOTSTRAP_INIT_TIME_RING] / 1e9,
-       timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9);
+  INFO(NCCL_BOOTSTRAP | NCCL_PROFILE | NCCL_INIT, "Bootstrap timings total %f (create %f, send %f, recv %f, ring %f, delay %f)", timers[BOOTSTRAP_INIT_TIME_TOTAL] / 1e9,
+      timers[BOOTSTRAP_INIT_TIME_CREATE] / 1e9,
+      timers[BOOTSTRAP_INIT_TIME_SEND] / 1e9,
+      timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9,
+      timers[BOOTSTRAP_INIT_TIME_RING] / 1e9,
+      timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9);
 exit:
   return result;
 fail:
@@ -777,6 +913,140 @@ fail:
   goto exit;
 }
 
+
+// Refreshes the bootstrap peer tables after the communicator has been resized. The ring
+// sockets are not (re)connected here; only ringAllInfo and the proxy/RAS setup are run.
+//   comm      - communicator; comm->bootstrap must already point to a bootstrapState and
+//               comm->rank / comm->nRanks must already hold the new values.
+//   isNewRank - true:  allocate the per-peer tables, create the proxy and P2P listen
+//                      sockets, initialize RAS and start the proxy service.
+//               false: grow the existing per-peer tables (nranks-1 -> nranks entries) and
+//                      point the existing proxy state at the reallocated tables.
+// Returns ncclSuccess, or the first error reported by a callee.
+ncclResult_t bootstrapInitNew(ncclComm_t comm,bool isNewRank) {
+  ncclResult_t result = ncclSuccess;
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  struct bootstrapState *state = (bootstrapState *)comm->bootstrap;
+  struct ncclSocket *proxySocket = NULL;
+  struct rasRankInit* rasRanks = nullptr;
+  bool performRasAddRanks = true;
+
+  if (state == NULL) {
+    WARN("bootstrapInitNew: comm->bootstrap is not set");
+    return ncclInvalidArgument;
+  }
+  state->rank = rank;
+  state->nranks = nranks;
+  state->cudaDev = comm->cudaDev;
+  state->abortFlag = comm->abortFlag;
+  state->net = comm->ncclNet;
+
+  if(!isNewRank){
+      INFO(NCCL_INIT,"bootstrapInitNew nranks: %d", state->nranks);
+      // NOTE(review): assumes ncclRealloc(ptr, oldCount, newCount) keeps the old entries - confirm.
+      NCCLCHECK(ncclRealloc(&state->peerProxyAddresses, nranks-1, nranks));
+      NCCLCHECKGOTO(ncclRealloc(&state->peerProxyAddressesUDS, nranks-1, nranks), result, fail);
+      NCCLCHECKGOTO(ncclRealloc(&state->peerP2pAddresses, nranks-1, nranks), result, fail);
+  }else{
+      NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
+      NCCLCHECK(ncclCalloc(&proxySocket, 1));
+      NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), result, fail);
+
+      NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), result, fail);
+      NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), result, fail);
+
+      // create a socket for others to reach out (P2P)
+      union ncclSocketAddress peerSocketAddress;
+      NCCLCHECKGOTO(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap), result, fail);
+      NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), result, fail);
+      memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
+  }
+
+  // Initialize RAS (joining rank only)
+  if (isNewRank && ncclParamRasEnable() == 1) {
+    // The RAS thread will take care of freeing the memory allocated below.
+    // Take the fail path on error: proxySocket is already allocated here and must not leak.
+    NCCLCHECKGOTO(ncclCalloc(&rasRanks, nranks), result, fail);
+    memcpy(&rasRanks[rank].addr, &bootstrapNetIfAddr, sizeof(rasRanks[rank].addr));
+    rasRanks[rank].pid = getpid();
+    rasRanks[rank].cudaDev = comm->cudaDev;
+    rasRanks[rank].nvmlDev = comm->nvmlDev;
+    rasRanks[rank].hostHash = getHostHash();
+    rasRanks[rank].pidHash = getPidHash();
+    if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) {
+      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+      // We should still participate in the ringAllInfo below as the peers will be waiting for us.
+      // Just make sure that the address is clearly invalid...
+      memset(rasRanks+rank, '\0', sizeof(*rasRanks));
+      performRasAddRanks = false;
+    }
+  }
+
+  // NOTE(review): rasRanks is not handed to ringAllInfo (nullptr below), so only this rank's
+  // entry is populated when ncclRasAddRanks runs - confirm this is intended.
+  NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, nullptr), result, fail);
+
+  if (isNewRank){
+      // Create the service proxy and get the UDS
+      // NOTE(review): kept as NCCLCHECK because it is not visible here whether ncclProxyInit
+      // retains proxySocket when it fails; freeing it in that case could be a double free.
+      NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
+      if (ncclParamRasEnable() == 1 && performRasAddRanks) {
+        if (ncclRasAddRanks(rasRanks, nranks) != ncclSuccess)
+          INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+      }
+  }else{
+    // Existing rank: comm->proxyState is reused, only the tables it references are refreshed.
+    comm->proxyState->listenSock->state = ncclSocketStateReady;
+    comm->proxyState->peerAddresses = state->peerProxyAddresses;
+    comm->proxyState->peerAddressesUDS = state->peerProxyAddressesUDS;
+  }
+  INFO(NCCL_INIT, "rank %d nranks %d - bootstrap new DONE", rank, nranks);
+
+exit:
+  return result;
+fail:
+  if (proxySocket)
+    free(proxySocket);
+  goto exit;
+}
+
+// Logs a byte range as a single INFO line of the form "<prefix>0x<hex digits> (size: N bytes)".
+//   prefix - optional text emitted before "0x"; may be NULL.
+//   data   - first byte to dump; a NULL pointer dumps no bytes.
+//   size   - number of bytes at data; always reported in the suffix.
+// A dump that does not fit the local buffer is truncated and marked with "..."; previously the
+// whole message was dropped without any output. No trailing "\n": the NCCL logger appends one.
+void printBinaryData(const char* prefix, const void* data, size_t size) {
+  static const char hexDigits[] = "0123456789abcdef";
+  const unsigned char *bytes = (const unsigned char *)data;
+  char buffer[8192];
+  // Space kept free at the end of the buffer for the "..." marker,
+  // the " (size: N bytes)" suffix and the terminating NUL.
+  const size_t reserved = 64;
+  const size_t limit = sizeof(buffer) - reserved;
+  const size_t count = (bytes != NULL) ? size : 0;
+
+  // Prefix followed by the "0x" marker. snprintf truncates an over-long prefix,
+  // in which case the write position is clamped to the last usable byte.
+  int len = snprintf(buffer, limit, "%s0x", prefix ? prefix : "");
+  if (len < 0) return;
+  size_t pos = ((size_t)len < limit) ? (size_t)len : limit - 1;
+
+  // Two hex digits per byte, written directly instead of one snprintf call per byte.
+  size_t i = 0;
+  for (; i < count && pos + 2 < limit; i++) {
+    buffer[pos++] = hexDigits[bytes[i] >> 4];
+    buffer[pos++] = hexDigits[bytes[i] & 0x0f];
+  }
+
+  // i < count means the loop stopped early because the buffer was full.
+  snprintf(buffer + pos, sizeof(buffer) - pos, "%s (size: %zu bytes)", (i < count) ? "..." : "", size);
+
+  INFO(NCCL_INIT | NCCL_PROFILE, "%s", buffer);
+}
+
 ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
   ncclResult_t ret = ncclSuccess;
   int rank = comm->rank;
@@ -786,6 +1056,8 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
   union ringConnectInfo nextPeer;
   struct ncclSocket* proxySocket = NULL;
   struct bootstrapState* state;
+  struct LhTxn* lhTxn = NULL;
+  struct LhState* lhState = NULL;
 
   NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail);
   state->rank = rank;
@@ -820,14 +1092,48 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
   // Get addr from next rank using the parent's connections
   NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
   NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
+
+  if (rank == 0 || rank == nranks - 1) { // only the first and last rank update the lighthouse state
+    if (txnBegin(LH_STATE_PATH, 1, &lhTxn) != 0) {
+      WARN("lighthouse: txnBegin failed");
+      ret = ncclInternalError;
+      goto fail;
+    }
+    if (txnLoad(lhTxn, &lhState) != 0) {
+      WARN("lighthouse: txnLoad failed");
+      ret = ncclInternalError;
+      goto fail;
+    }
+    if (rank == 0) {
+      setFirstRank(lhState, &info.addr, rank, nranks);
+      setMagic(lhState, magic);
+    } else {
+      setLastRank(lhState, &info.addr, rank, nranks);
+    }
+    updateVersion(lhState);
+    printLhState(lhState);
+    if (txnSave(lhTxn, lhState) != 0) {
+      WARN("lighthouse: txnSave failed");
+      ret = ncclInternalError;
+      goto fail;
+    }
+    if (txnEnd(lhTxn) != 0) {
+      WARN("lighthouse: txnEnd failed");
+      ret = ncclInternalError;
+      goto fail;
+    }
+    free(lhState); free(lhTxn); lhState = NULL; lhTxn = NULL; // only `fail:` freed these, so they leaked on success
+  }
+
   if (ncclParamBootstrapNetEnable()) {
     NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle,
-                                 &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
-                                 &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
+                                &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
+                                &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
                   ret, fail);
   } else {
     NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag));
   }
+  
 
   NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail);
   memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
@@ -855,6 +1161,10 @@ exit:
   return ret;
 fail:
   free(proxySocket);
+  if (lhTxn)
+    free(lhTxn);
+  if (lhState)
+    free(lhState);
   goto exit;
 }
 
@@ -987,9 +1297,9 @@ static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, void* recvC
   NCCLCHECKGOTO(netReg(net, sendComm, data, nranks * size, &sendDataHandle), res, exit);
   NCCLCHECKGOTO(netReg(net, recvComm, data, nranks * size, &recvDataHandle), res, exit);
   /* Simple ring based AllGather
-   * At each step i receive data from (rank-i-1) from prev
-   * and send previous step's data from (rank-i) to next
-   */
+  * At each step i receive data from (rank-i-1) from prev
+  * and send previous step's data from (rank-i) to next
+  */
   TRACE(NCCL_BOOTSTRAP, "NetRingAllGather started");
   BOOTSTRAP_PROF_OPEN(tFirst);
   for (int i = 0; i < nranks - 1; i++) {
@@ -1016,9 +1326,9 @@ static ncclResult_t socketRingAllGather(struct ncclSocket* sendSock, struct nccl
   ncclResult_t res = ncclSuccess;
   uint64_t tFirst = 0, tRest = 0;
   /* Simple ring based AllGather
-   * At each step i receive data from (rank-i-1) from prev
-   * and send previous step's data from (rank-i) to next
-   */
+  * At each step i receive data from (rank-i-1) from prev
+  * and send previous step's data from (rank-i) to next
+  */
   TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started");
   BOOTSTRAP_PROF_OPEN(tFirst);
   for (int i = 0; i < nranks - 1; i++) {
@@ -1063,10 +1373,10 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i
   if (nranks == 1)
     return ncclSuccess;
   /* Simple [intra] process barrier
-   *
-   * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet,
-   * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
-   */
+  *
+  * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet,
+  * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
+  */
   int data[1] = {0};
   for (int mask = 1; mask < nranks; mask <<= 1) {
     int src = (rank - mask + nranks) % nranks;
@@ -1185,3 +1495,4 @@ ncclResult_t bootstrapAbort(void* commState) {
   NCCLCHECK(bootstrapClose(commState));
   return ncclSuccess;
 }
+
diff --git a/src/channel.cc b/src/channel.cc
index c2b8841..e47a00b 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -1,8 +1,8 @@
 /*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
 
 #include "channel.h"
 #include "param.h"
@@ -62,6 +62,80 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
   return ncclSuccess;
 }
 
+/* Re-create channel `channelId` of `comm`: the channel id is (re)assigned
+ * unconditionally and every peer table is allocated anew. The shared host and
+ * device peer tables get comm->nRanks entries; the per-channel peer pointers,
+ * device peer pointers and ring rank buffers are sized from nRanks/nPeers.
+ *
+ * NOTE(review): the previous sharedRes->peers[channelId] and
+ * sharedRes->devPeers[channelId] are dropped without being freed, and
+ * sharedRes->tpNRanks is overwritten; confirm the caller releases the old
+ * tables and that every comm->topParentRanks[r] is < comm->nRanks.
+ *
+ * Returns ncclSuccess or the first error; once the device stream has been
+ * acquired it is released on every path, including failures. */
+ncclResult_t initChannelNew(struct ncclComm* comm, int channelId) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclChannel* channel = &comm->channels[channelId];
+  struct ncclSharedResources* sharedRes = comm->sharedRes;
+  const int nRanks = comm->nRanks;
+  const int nvlsRanks = comm->localRanks;
+  const int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
+  cudaStream_t deviceStream;
+
+  INFO(NCCL_INIT, "start channel:channelId %d channel %p", channelId, channel);
+  INFO(NCCL_INIT, "initChannel comm %p channelId %d nRanks %d nvlsRanks %d", comm, channelId, nRanks, nvlsRanks);
+
+  channel->id = channelId;
+  channel->workFifoProduced = 0;
+  sharedRes->tpNRanks = nRanks;
+  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
+
+  /* Forget any previous tables so that everything below is allocated fresh. */
+  sharedRes->peers[channelId] = NULL;
+  sharedRes->devPeers[channelId] = NULL;
+  channel->peers = NULL;
+  channel->devPeers = NULL;
+  channel->devPeersHostPtr = NULL;
+
+  // Allocate everything related to sharedRes with ncclCalloc as this can be
+  // shared between communicators hence should not be tied to comm.
+  NCCLCHECKGOTO(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks), ret, fail);
+  // The extra on nRanks+1 is for collnet root (i.e. network)
+  channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer*>(&comm->memPermanent, nPeers);
+  for (int r = 0; r < nRanks; r++) {
+    channel->peers[r] = sharedRes->peers[channelId] + comm->topParentRanks[r];
+    channel->peers[r]->send[0].transportResources = NULL;
+    channel->peers[r]->recv[0].transportResources = NULL;
+    ncclAtomicRefCountIncrement(&channel->peers[r]->refCount);
+  }
+
+  NCCLCHECKGOTO(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream), ret, fail);
+  /* channel->devPeers is not shared, so just free it when calling commFree() */
+  NCCLCHECKGOTO(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream), ret, fail);
+  ncclCommPushCudaFree(comm, channel->devPeers);
+  NCCLCHECKGOTO(ncclCalloc(&channel->devPeersHostPtr, nPeers), ret, fail);
+  for (int r = 0; r < nRanks; r++) {
+    uintptr_t addr = (uintptr_t)(sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
+    NCCLCHECKGOTO(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream), ret, fail);
+    channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr;
+  }
+
+  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
+  NCCLCHECKGOTO(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream), ret, fail);
+  ncclCommPushCudaFree(comm, channel->devRingUserRanks);
+
+  /* guarantee addr has been copied into channel->devPeers */
+  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false), ret, exit);
+  NCCLCHECKGOTO(ncclStrongStreamSynchronize(&sharedRes->deviceStream), ret, exit);
+exit:
+  return ret;
+fail:
+  (void)ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false);
+  goto exit;
+}
+
+
 ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
   struct ncclChannel* channel = &comm->channels[channelId];
   struct ncclSharedResources* sharedRes = comm->sharedRes;
@@ -147,8 +221,8 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) {
   int nPeers = nRanks + collnetNRanks + nvlsNRanks;
   /* channel peers are only valid when async init thread completes commAlloc() and
-   * the channel is initialized with initChannel(); if either is not done, this channel
-   * should never be free. */
+  * the channel is initialized with initChannel(); if either is not done, this channel
+  * should never be free. */
   if (channel->id == -1 || channel->peers == NULL) return ncclSuccess;
 
   // Free transport proxy resources
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index 152739b..3d9046c 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -4,515 +4,522 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "comm.h"
-#include "device.h"
-#include "graph.h"
-#include "transport.h"
-#include "trees.h"
-#include "rings.h"
-#include "topo.h"
-
-/******************************************************************/
-/********************* Internode connection ***********************/
-/******************************************************************/
-
-ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
-  int rank = comm->rank;
-  int localRanks = comm->topo->nodes[GPU].count;
-  int nChannels = comm->nChannels;
-
-  topoRanks->nvlsHeadNum = 0;
-  for (int c=0; c<nChannels; c++) {
-    struct ncclChannel* channel = comm->channels+c;
-    channel->ring.prev = channel->ring.next = -1;
-    channel->tree.up = -1;
-    channel->collnetChain.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
-    channel->collnetDirect.out = -1;
-    channel->collnetDirect.headRank = -1;
-    channel->collnetDirect.nHeads = 0;
-    channel->collnetDirect.shift = 0;
-    for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
-    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
-    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
-
-    int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
-    int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
-    int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
-
-    for (int i=0; i<localRanks; i++) {
-      if (ringIntra[i] == rank) {
-        topoRanks->ringRecv[c] = ringIntra[0];
-        topoRanks->ringSend[c] = ringIntra[localRanks-1];
-        topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
-        topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
-      }
-      if (treeIntra[i] == rank) {
-        int parentIndex = 0;
-        int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
-        int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
-
-        topoRanks->treeToParent[c] = treeIntra[parentIndex];
-        topoRanks->treeToChild0[c] = treeIntra[child0Index];
-        topoRanks->treeToChild1[c] = treeIntra[child1Index];
-        channel->tree.up         = i == 0 ? -1 : treeIntra[i-1];
-        channel->tree.down[0]    = i == localRanks-1 ? -1 : treeIntra[i+1];
-      }
-      if (collNetIntra[i] == rank) {
-        channel->collnetChain.up      = i == 0 ? comm->nRanks : collNetIntra[i-1];
-        channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
-      }
-    }
-  }
-  // Duplicate channels trees
-  struct ncclChannel* channel0 = comm->channels;
-  struct ncclChannel* channel1 = channel0+nChannels;
-  memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
-
-  // Get nvls heads and the number of heads. Duplicate head is not allowed.
-  for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
-    bool addHead = true;
-    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
-
-    for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
-      if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
-        addHead = false;
-        break;
-      }
-    }
-    if (addHead) {
-      topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
-    }
-  }
-  memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);
-
-  return ncclSuccess;
-}
-
-static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
-  int nChannels = comm->nChannels;
-  int nNodes = comm->nNodes;
-  for (int c=0; c<nChannels; c++) {
-    int* recv = ringRecv+c*comm->nNodes;
-    int* send = ringSend+c*comm->nNodes;
-    int* prev = ringPrev+c*comm->nRanks;
-    int* next = ringNext+c*comm->nRanks;
-    for (int n=0; n<nNodes; n++) {
-      int recvRank = recv[n];
-      int prevSendRank = send[(n-1+nNodes)%nNodes];
-      prev[recvRank] = prevSendRank;
-      int sendRank = send[n];
-      int nextRecvRank = recv[(n+1)%nNodes];
-      next[sendRank] = nextRecvRank;
-    }
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
- for (int n=0; n<nNodes; n++) indexes[n] = ranks[n];
- return ncclSuccess;
-}
-
-static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
-  if (u == -1) return ncclSuccess;
-  tree->up = indexes[u];
-  return ncclSuccess;
-}
-
-static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
-  if (d == -1) return ncclSuccess;
-  int x = 0;
-  while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
-  if (x == NCCL_MAX_TREE_ARITY) {
-    WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
-    return ncclInternalError;
-  }
-  tree->down[x] = indexes[d];
-  return ncclSuccess;
-}
-
-static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
-  const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
-
-  // Compute tree depth. Not an exact value but a good approximation in most
-  // cases
-  int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
-
-  int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
-  int* ttp, *ttc0, *ttc1;
-  NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
-  for (int c=0; c<nChannels; c++) {
-     struct ncclChannel* channel0 = comm->channels+c;
-     struct ncclChannel* channel1 = channel0+nChannels;
-     ttp = treeToParent+c*comm->nNodes;
-     ttc0 = treeToChild0+c*comm->nNodes;
-     ttc1 = treeToChild1+c*comm->nNodes;
-     if (comm->rank == ttp[node]) {
-       NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
-       NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
+ #include "comm.h"
+ #include "device.h"
+ #include "graph.h"
+ #include "transport.h"
+ #include "trees.h"
+ #include "rings.h"
+ #include "topo.h"
+ 
+ /******************************************************************/
+ /********************* Internode connection ***********************/
+ /******************************************************************/
+ 
+ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
+   int rank = comm->rank;
+   int localRanks = comm->topo->nodes[GPU].count;
+   int nChannels = comm->nChannels;
+ 
+   topoRanks->nvlsHeadNum = 0;
+   for (int c=0; c<nChannels; c++) {
+     struct ncclChannel* channel = comm->channels+c;
+     channel->ring.prev = channel->ring.next = -1;
+     channel->tree.up = -1;
+     channel->collnetChain.up = -1;
+     for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
+     for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
+     channel->collnetDirect.out = -1;
+     channel->collnetDirect.headRank = -1;
+     channel->collnetDirect.nHeads = 0;
+     channel->collnetDirect.shift = 0;
+     for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
+     for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
+     for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
+ 
+     int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
+     int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
+     int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
+ 
+     for (int i=0; i<localRanks; i++) {
+       if (ringIntra[i] == rank) {
+         topoRanks->ringRecv[c] = ringIntra[0];
+         topoRanks->ringSend[c] = ringIntra[localRanks-1];
+         topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
+         topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
+       }
+       if (treeIntra[i] == rank) {
+         int parentIndex = 0;
+         int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
+         int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
+ 
+         topoRanks->treeToParent[c] = treeIntra[parentIndex];
+         topoRanks->treeToChild0[c] = treeIntra[child0Index];
+         topoRanks->treeToChild1[c] = treeIntra[child1Index];
+         channel->tree.up         = i == 0 ? -1 : treeIntra[i-1];
+         channel->tree.down[0]    = i == localRanks-1 ? -1 : treeIntra[i+1];
+       }
+       if (collNetIntra[i] == rank) {
+         channel->collnetChain.up      = i == 0 ? comm->nRanks : collNetIntra[i-1];
+         channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
+       }
      }
-     if (comm->rank == ttc0[node]) {
-       NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
-       NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
+   }
+   // Duplicate channels trees
+   struct ncclChannel* channel0 = comm->channels;
+   struct ncclChannel* channel1 = channel0+nChannels;
+   memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
+ 
+   // Get nvls heads and the number of heads. Duplicate head is not allowed.
+   for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
+     bool addHead = true;
+     int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
+ 
+     for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
+       if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
+         addHead = false;
+         break;
+       }
      }
-     if (comm->rank == ttc1[node]) {
-       NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
-       NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
+     if (addHead) {
+       topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
      }
-     if (comm->rank == ttp[node] ||
-         comm->rank == ttc0[node] ||
-         comm->rank == ttc1[node]) {
-       INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c,           channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
-       INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
+   }
+   memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);
+ 
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
+   int nChannels = comm->nChannels;
+   int nNodes = comm->nNodes;
+   for (int c=0; c<nChannels; c++) {
+     int* recv = ringRecv+c*comm->nNodes;
+     int* send = ringSend+c*comm->nNodes;
+     int* prev = ringPrev+c*comm->nRanks;
+     int* next = ringNext+c*comm->nRanks;
+     for (int n=0; n<nNodes; n++) {
+       int recvRank = recv[n];
+       int prevSendRank = send[(n-1+nNodes)%nNodes];
+       prev[recvRank] = prevSendRank;
+       int sendRank = send[n];
+       int nextRecvRank = recv[(n+1)%nNodes];
+       next[sendRank] = nextRecvRank;
      }
-     channel0->tree.depth = channel1->tree.depth = depth;
-  }
+   }
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
+  for (int n=0; n<nNodes; n++) indexes[n] = ranks[n];
   return ncclSuccess;
-}
-
-static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
-  int rank = comm->rank;
-  int localRanks = comm->localRanks;
-  int nHeads = 0;
-  int *heads;
-  NCCLCHECK(ncclCalloc(&heads, localRanks));
-  // Find all head ranks
-  // Head index is always 0
-  for (int c=0; c<collNetGraph->nChannels; c++) {
-    int* collNetIntra = collNetGraph->intra+c*localRanks;
-    int head = collNetIntra[0];
-    for (int h=0; h<nHeads; h++) if (heads[h] == head) head = -1;
-    if (head != -1) heads[nHeads++] = collNetIntra[0];
-  }
-  // For all channels
-  for (int c=0; c<comm->nChannels; c++) {
-    struct ncclChannel* channel = comm->channels+c;
-    char line[1024];
-    sprintf(line, "CollNetDirect channel %d rank %d ", c, rank);
-    int nDown = 0;
-    for (int i=0; i<nHeads; i++) {
-      if (rank == heads[i]) { // is head
-        channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
-        channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
-        int* collNetIntra = collNetGraph->intra+i*localRanks;
-        sprintf(line+strlen(line), "down ");
-        for (int r=0; r<localRanks; r++) {
-          if (collNetIntra[r] == rank) continue;
-          channel->collnetDirect.down[nDown++] = collNetIntra[r];  // connect to all peers
-          sprintf(line+strlen(line), " %d ", collNetIntra[r]);
-        }
-        sprintf(line+strlen(line), "nDown %d ", nDown);
-        break;
+ }
+ 
+ static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
+   if (u == -1) return ncclSuccess;
+   tree->up = indexes[u];
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
+   if (d == -1) return ncclSuccess;
+   int x = 0;
+   while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
+   if (x == NCCL_MAX_TREE_ARITY) {
+     WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
+     return ncclInternalError;
+   }
+   tree->down[x] = indexes[d];
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
+   const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
+ 
+   // Compute tree depth. Not an exact value but a good approximation in most
+   // cases
+   int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
+ 
+   int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
+   int* ttp, *ttc0, *ttc1;
+   NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
+   for (int c=0; c<nChannels; c++) {
+      struct ncclChannel* channel0 = comm->channels+c;
+      struct ncclChannel* channel1 = channel0+nChannels;
+      ttp = treeToParent+c*comm->nNodes;
+      ttc0 = treeToChild0+c*comm->nNodes;
+      ttc1 = treeToChild1+c*comm->nNodes;
+      if (comm->rank == ttp[node]) {
+        NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
+        NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
       }
-    }
-    // Connect to all heads
-    int nUp = 0;
-    sprintf(line+strlen(line), "up ");
-    for (int h=0; h<nHeads; h++) {
-      if (rank == heads[h]) continue;
-      channel->collnetDirect.up[nUp++] = heads[h];
-      sprintf(line+strlen(line), " %d ", heads[h]);
-    }
-    sprintf(line+strlen(line), "heads ");
-    { // heads[] is the list of heads ordered in head order startubg with self
-      int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank;
-      for (int h1=0; h1 < nHeads; h1++) {
-        int h = (h0+h1)%nHeads;
-        channel->collnetDirect.heads[h1] = heads[h];
-        sprintf(line+strlen(line), " %d ", heads[h]);
+      if (comm->rank == ttc0[node]) {
+        NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
+        NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
       }
-    }
-    channel->collnetDirect.nHeads = nHeads;
-    // nHeads should always be greater than 0.
-    // coverity[divide_by_zero]
-    channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
-    channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
-    sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
-    sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
-    INFO(NCCL_GRAPH, "%s", line);
-    channel->collnetChain.depth = comm->nRanks/comm->nNodes;
-  }
-  free(heads);
-  return ncclSuccess;
-}
-
-static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) {
-  int headRank = -1;
-  if (nHeads == 0) {
-    comm->nvlsChannels = 0;
-    return ncclSuccess;
-  }
-
-  for (int h = 0; h < nHeads; h++) {
-    if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
-  }
-
-  for (int c=0; c<comm->nChannels; c++) {
-    struct ncclChannel* channel = comm->channels+c;
-    channel->nvls.nHeads = nHeads;
-    for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
-    for (int h=nHeads; h<NCCL_MAX_NVLS_ARITY; h++) channel->nvls.up[h] = -1;
-    channel->nvls.down = comm->nRanks+1+headRank;
-    channel->nvls.out = -1;       // NVLS+SHARP not yet implemented.
-    channel->nvls.headRank = headRank;
-    channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
-    if (comm->config.collnetEnable && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
-  }
-  if (comm->nNodes == 1) return ncclSuccess;
-
-  // Connect Trees
-  int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
-  int pc0, pc1; // ignored
-  NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node,
-        &tree0Parent, &tree0Child0, &tree0Child1, &pc0,
-        &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
-
-  int* heads = NULL;
-  int treeUp[2] = { -1, -1 };
-  int treeDown0[2] = { -1, -1 };
-  int treeDown1[2] = { -1, -1 };
-
-  if (comm->node == 0) {
-    for (int h=0; h<nHeads; h++) {
-      char line[1024];
-      sprintf(line, "NVLS Head %2d:", h);
-      heads = nvlsHeads+h*comm->nNodes;
-      for (int n=0; n<comm->nNodes && n<20; n++) {
-        sprintf(line+strlen(line), " %2d", heads[n]);
+      if (comm->rank == ttc1[node]) {
+        NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
+        NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
       }
-      INFO(NCCL_INIT, "%s", line);
-    }
-  }
-
-  // Find the heads where I'm the head rank and retain tree up/down
-  for (int h=0; h<nHeads; h++) {
-    heads = nvlsHeads+h*comm->nNodes;
-    if (heads[comm->node] == comm->rank) {
-      treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent];
-      treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
-      treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
-      treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
-      treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
-      treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
-      break;
-    }
-  }
-  // Set prev/next in all channels (NVLS compute channels work
-  // orthogonally to NVLS search channels).
-  for (int c=0; c<comm->nChannels; c++) {
-    struct ncclChannel* channel = comm->channels+c;
-    channel->nvls.treeUp = treeUp[c%2];
-    channel->nvls.treeDown[0] = channel->nvls.down;
-    int ix = 1;
-    if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2];
-    if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2];
-  }
-
-  struct ncclNvls* nvls0 = &comm->channels[0].nvls;
-  struct ncclNvls* nvls1 = &comm->channels[1].nvls;
-  INFO(NCCL_GRAPH, "NVLS Trees : %d/%d/%d->%d->%d %d/%d/%d->%d->%d",
-      nvls0->treeDown[0], nvls0->treeDown[1], nvls0->treeDown[2], comm->rank, nvls0->treeUp,
-      nvls1->treeDown[0], nvls1->treeDown[1], nvls1->treeDown[2], comm->rank, nvls1->treeUp);
-  return ncclSuccess;
-}
-
-// Legacy naming
-NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
-NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
-// New naming
-NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
-NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
-
-int ncclMinNchannels() {
-  int minNchannels = 0;
-  if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
-  if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
-  if (minNchannels > MAXCHANNELS) {
-    INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
-    minNchannels = MAXCHANNELS;
-  }
-  if (minNchannels < 0) minNchannels = 0;
-  return minNchannels;
-}
-
-extern int64_t ncclParamWorkArgsBytes();
-
-int ncclMaxNchannels() {
-  int maxNchannels = MAXCHANNELS;
-  if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings();
-  if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
-  maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()));
-  if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
-  if (maxNchannels < 1) {
-    INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a maximum of %d channels, setting it to 1", maxNchannels);
-    maxNchannels = 1;
-  }
-  return maxNchannels;
-}
-
-static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
-  int nranks = comm->nRanks;
-  int c;
-  for (c=start; c<end; c++) {
-    memcpy(ringPrev+c*nranks, ringPrev+(c-start)*nranks, nranks*sizeof(int));
-    memcpy(ringNext+c*nranks, ringNext+(c-start)*nranks, nranks*sizeof(int));
-    memcpy(comm->channels+c, comm->channels+c-start, sizeof(struct ncclChannel));
-  }
-  return c;
-}
-
-void exchangeValues(int* v0, int* v1) {
-  int tmp = *v1;
-  *v1 = *v0;
-  *v0 = tmp;
-}
-
-NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
-
-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
-  // Gather data from all ranks
-  ncclResult_t ret = ncclSuccess;
-  int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
-  int nranks = comm->nRanks;
-  int nNodes = comm->nNodes;
-  int nChannels = comm->nChannels;
-  int minHeadNum = INT_MAX;
-  int shared = parent && parent->nvlsSupport  && parent->shareResources;
-  NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
-  NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
-
-  // Alternate rings to avoid crossing rails
-  if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
-    for (int r=0; r<comm->nRanks; r++) {
-      if (comm->rankToNode[r] % 2 == 1) {
-        // Exchange rings
-        for (int c=0; c<nChannels; c+=2) {
-          exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
-          exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
-          exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
-          exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
-        }
+      if (comm->rank == ttp[node] ||
+          comm->rank == ttc0[node] ||
+          comm->rank == ttc1[node]) {
+        INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c,           channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
+        INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
       }
-    }
-  }
-
-  for (int c=0; c<nChannels;c++) {
-    for (int n=0; n<nNodes; n++) {
-      int r = firstRanks[n];
-      ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
-      ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
-      treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
-      treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
-      treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
-    }
-    for (int r=0; r<nranks; r++) {
-      ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
-      ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
-    }
-  }
-
-  for (int n = 0; n < nNodes; n++) {
-    int r = firstRanks[n];
-    if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
-      minHeadNum = allTopoRanks[r]->nvlsHeadNum;
-  }
-
-  for (int c = 0; c < minHeadNum; c++) {
-    for (int n = 0; n < nNodes; n++) {
-      int r = firstRanks[n];
-      nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
-    }
-  }
-
-  // Connect rings and trees. This should also duplicate the channels.
-  NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail);
-  NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
-
-  // Duplicate ringPrev/ringNext for ncclBuildRing
-  memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
-  memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
-
-  // Set ring prev/next for my rank
-  for (int c=0; c<nChannels; c++) {
-    struct ncclChannel* channel0 = comm->channels+c;
-    struct ncclChannel* channel1 = channel0+nChannels;
-    channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
-    channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
-  }
-
-  // Duplication should be complete now
-  nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
-
-  // Setup CollNet
-  if (comm->config.collnetEnable) {
-    struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
-    // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
-    if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
-      int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
-      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
-    }
-    NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
-  }
-
-  // Use 4 compute channels per search channel to reach peak BW on <8 PPN
-  if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
-     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
-  }
-
-  // Double the number of channels when using unpack networking (greater than 1 node)
-  // We won't automatically double past 16 channels, users can specify 32 if they want
-  if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
-     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
-  }
-
-  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
-  // We permit combining max, then min, to only use the first channels, then duplicate them.
-  if (comm->sharedRes->owner != comm) {
-    /* child comm #channels cannot exceed top parent #channels. */
-    nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
-    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
-  } else {
-    nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
-    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
-  }
-
-  comm->collChannels = comm->nChannels;
-#if CUDART_VERSION >= 12010
-  // Support maximal channel usage for aggregation
-  if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
-    comm->nvlsChannels = parent->nvlsResources->nChannels;
-  }
-  if (comm->nChannels < comm->nvlsChannels) {
-    nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
-  }
-  NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
-#endif
-  if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
-    nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
-    comm->collChannels = std::min(comm->collChannels, comm->nChannels);
-  }
-
-  // Create rings array and check all is fine
-  NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
-
-exit:
-  if (ringRecv) free(ringRecv);
-  if (ringSend) free(ringSend);
-  if (ringPrev) free(ringPrev);
-  if (ringNext) free(ringNext);
-  if (treeToParent) free(treeToParent);
-  if (treeToChild0) free(treeToChild0);
-  if (treeToChild1) free(treeToChild1);
-  if (nvlsHeads) free(nvlsHeads);
-  return ret;
-fail:
-  goto exit;
-}
+      channel0->tree.depth = channel1->tree.depth = depth;
+   }
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) { // Fills collnetDirect (and collnetChain.depth) on every channel of comm from collNetGraph.
+   int rank = comm->rank;
+   int localRanks = comm->localRanks;
+   int nHeads = 0; // number of distinct head ranks collected in heads[]
+   int *heads;
+   NCCLCHECK(ncclCalloc(&heads, localRanks)); // localRanks entries; released by free(heads) before the final return
+   // Find all head ranks
+   // Head index is always 0: the head of a graph channel is the first entry of its intra[] slice
+   for (int c=0; c<collNetGraph->nChannels; c++) {
+     int* collNetIntra = collNetGraph->intra+c*localRanks; // intra[] is read as localRanks entries per graph channel
+     int head = collNetIntra[0];
+     for (int h=0; h<nHeads; h++) if (heads[h] == head) head = -1; // already recorded: mark as duplicate
+     if (head != -1) heads[nHeads++] = collNetIntra[0]; // NOTE(review): assumes at most localRanks distinct heads
+   }
+   // For all channels
+   for (int c=0; c<comm->nChannels; c++) {
+     struct ncclChannel* channel = comm->channels+c;
+     char line[1024]; // log line, appended to with unbounded sprintf; NOTE(review): assumes all rank lists fit in 1024 bytes
+     sprintf(line, "CollNetDirect channel %d rank %d ", c, rank);
+     int nDown = 0;
+     for (int i=0; i<nHeads; i++) {
+       if (rank == heads[i]) { // is head
+         channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
+         channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
+         int* collNetIntra = collNetGraph->intra+i*localRanks; // NOTE(review): intra[] is indexed by head index i here, by graph channel c above — confirm they coincide
+         sprintf(line+strlen(line), "down ");
+         for (int r=0; r<localRanks; r++) {
+           if (collNetIntra[r] == rank) continue; // skip self
+           channel->collnetDirect.down[nDown++] = collNetIntra[r];  // connect to all peers
+           sprintf(line+strlen(line), " %d ", collNetIntra[r]);
+         }
+         sprintf(line+strlen(line), "nDown %d ", nDown);
+         break; // heads[] holds distinct ranks, so at most one entry matches
+       }
+     }
+     // Connect to all heads
+     int nUp = 0;
+     sprintf(line+strlen(line), "up ");
+     for (int h=0; h<nHeads; h++) {
+       if (rank == heads[h]) continue; // a head does not list itself as an up peer
+       channel->collnetDirect.up[nUp++] = heads[h];
+       sprintf(line+strlen(line), " %d ", heads[h]);
+     }
+     sprintf(line+strlen(line), "heads ");
+     { // heads[] is the list of heads ordered in head order starting with self
+       int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank; // headRank is only assigned above when this rank is a head; -1 falls back to 0
+       for (int h1=0; h1 < nHeads; h1++) {
+         int h = (h0+h1)%nHeads; // rotate so that entry 0 is heads[h0]
+         channel->collnetDirect.heads[h1] = heads[h];
+         sprintf(line+strlen(line), " %d ", heads[h]);
+       }
+     }
+     channel->collnetDirect.nHeads = nHeads;
+     // nHeads should always be greater than 0.
+     // coverity[divide_by_zero]
+     channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
+     channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2; // 1 only when this rank has neither up nor down peers
+     sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
+     sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
+     INFO(NCCL_GRAPH, "%s", line);
+     channel->collnetChain.depth = comm->nRanks/comm->nNodes; // integer division; presumably ranks per node — confirm for uneven nodes
+   }
+   free(heads);
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) { // Fills the nvls fields of every channel; nvlsHeads is read as nvlsHeads[head*comm->nNodes + node].
+   int headRank = -1; // stays -1 when this rank is not a head on its node
+   if (nHeads == 0) {
+     comm->nvlsChannels = 0; // no heads: NVLS gets no channels
+     return ncclSuccess;
+   }
+ 
+   for (int h = 0; h < nHeads; h++) {
+     if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h; // no break: the last matching head index wins
+   }
+ 
+   for (int c=0; c<comm->nChannels; c++) {
+     struct ncclChannel* channel = comm->channels+c;
+     channel->nvls.nHeads = nHeads;
+     for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h; // ids start past the real ranks — presumably virtual NVLS peers; confirm
+     for (int h=nHeads; h<NCCL_MAX_NVLS_ARITY; h++) channel->nvls.up[h] = -1; // unused slots
+     channel->nvls.down = comm->nRanks+1+headRank; // evaluates to nRanks when headRank is -1
+     channel->nvls.out = -1;       // NVLS+SHARP not yet implemented.
+     channel->nvls.headRank = headRank;
+     channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; // defaults; overwritten below when nNodes > 1
+     if (comm->config.collnetEnable && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; // heads get out = nRanks when collnet is enabled
+   }
+   if (comm->nNodes == 1) return ncclSuccess; // single node: no inter-node tree to connect
+ 
+   // Connect Trees
+   int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1; // node indices filled by ncclGetDtree; -1 is tested below as "none"
+   int pc0, pc1; // ignored
+   NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node,
+         &tree0Parent, &tree0Child0, &tree0Child1, &pc0,
+         &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
+ 
+   int* heads = NULL; // view of one head's per-node rank list inside nvlsHeads
+   int treeUp[2] = { -1, -1 }; // index 0: tree 0, index 1: tree 1
+   int treeDown0[2] = { -1, -1 };
+   int treeDown1[2] = { -1, -1 };
+ 
+   if (comm->node == 0) { // logging only, done on node 0
+     for (int h=0; h<nHeads; h++) {
+       char line[1024];
+       sprintf(line, "NVLS Head %2d:", h);
+       heads = nvlsHeads+h*comm->nNodes;
+       for (int n=0; n<comm->nNodes && n<20; n++) { // at most the first 20 nodes are printed
+         sprintf(line+strlen(line), " %2d", heads[n]);
+       }
+       INFO(NCCL_INIT, "%s", line);
+     }
+   }
+ 
+   // Find the heads where I'm the head rank and retain tree up/down
+   for (int h=0; h<nHeads; h++) {
+     heads = nvlsHeads+h*comm->nNodes;
+     if (heads[comm->node] == comm->rank) {
+       treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent]; // translate node index to that node's rank for this head
+       treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
+       treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
+       treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
+       treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
+       treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
+       break; // only the first head this rank owns is used
+     }
+   }
+   // Set prev/next in all channels (NVLS compute channels work
+   // orthogonally to NVLS search channels).
+   for (int c=0; c<comm->nChannels; c++) {
+     struct ncclChannel* channel = comm->channels+c;
+     channel->nvls.treeUp = treeUp[c%2]; // even channels use tree 0, odd channels tree 1
+     channel->nvls.treeDown[0] = channel->nvls.down; // slot 0 always holds nvls.down
+     int ix = 1; // next free treeDown slot
+     if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2];
+     if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2];
+   }
+ 
+   struct ncclNvls* nvls0 = &comm->channels[0].nvls;
+   struct ncclNvls* nvls1 = &comm->channels[1].nvls; // NOTE(review): read even when comm->nChannels < 2 — assumes channels[] always has at least two entries; confirm
+   INFO(NCCL_GRAPH, "NVLS Trees : %d/%d/%d->%d->%d %d/%d/%d->%d->%d",
+       nvls0->treeDown[0], nvls0->treeDown[1], nvls0->treeDown[2], comm->rank, nvls0->treeUp,
+       nvls1->treeDown[0], nvls1->treeDown[1], nvls1->treeDown[2], comm->rank, nvls1->treeUp);
+   return ncclSuccess;
+ }
+ 
+ // Legacy naming (default -2 is tested as "not set" by ncclMinNchannels/ncclMaxNchannels)
+ NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
+ NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
+ // New naming (checked after the legacy names, so it overrides them when both are set)
+ NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
+ NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
+ 
+ int ncclMinNchannels() { // Returns the requested minimum channel count, clamped to [0, MAXCHANNELS]; 0 when neither parameter is set.
+   int minNchannels = 0;
+   if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); // legacy name
+   if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); // new name, overrides the legacy value
+   if (minNchannels > MAXCHANNELS) {
+     INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
+     minNchannels = MAXCHANNELS;
+   }
+   if (minNchannels < 0) minNchannels = 0; // negative requests are silently raised to 0
+   return minNchannels;
+ }
+ 
+ extern int64_t ncclParamWorkArgsBytes();
+ 
+ int ncclMaxNchannels() { // Returns the maximum channel count to use, always within [1, MAXCHANNELS].
+   int maxNchannels = MAXCHANNELS;
+   if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings(); // legacy name
+   if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels(); // new name, overrides the legacy value
+   maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())); // also capped by the limit derived from the work-args size
+   if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
+   if (maxNchannels < 1) {
+     INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a maximum of %d channels, setting it to 1", maxNchannels); // NOTE(review): the value logged has already been capped above, so it may differ from what the user set
+     maxNchannels = 1;
+   }
+   return maxNchannels;
+ }
+ 
+ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) { // Replicates the first `start` channels into [start, end); returns end when end > start, otherwise start.
+   int nranks = comm->nRanks; // each channel owns nranks entries in ringPrev/ringNext
+   int c;
+   for (c=start; c<end; c++) { // source c-start may itself be an earlier copy, so the first `start` channels repeat cyclically
+     memcpy(ringPrev+c*nranks, ringPrev+(c-start)*nranks, nranks*sizeof(int));
+     memcpy(ringNext+c*nranks, ringNext+(c-start)*nranks, nranks*sizeof(int));
+     memcpy(comm->channels+c, comm->channels+c-start, sizeof(struct ncclChannel)); // byte-wise copy of the whole channel struct
+   }
+   return c; // no bounds check here: callers must keep `end` within the arrays
+ }
+ 
+ void exchangeValues(int* v0, int* v1) { // Swaps the two ints pointed to by v0 and v1.
+   int tmp = *v1;
+   *v1 = *v0;
+   *v0 = tmp;
+ }
+ 
+ NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1); // default 1; read through ncclParamUnpackDoubleNChannels() in ncclTopoPostset
+ 
+ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
+   // Gather the per-rank topology data in allTopoRanks, connect rings/trees/CollNet/NVLS on comm's channels, then build rings[].
+   ncclResult_t ret = ncclSuccess;
+   int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
+   int nranks = comm->nRanks;
+   int nNodes = comm->nNodes;
+   int nChannels = comm->nChannels; // local channel count, kept in sync with comm->nChannels below
+   int minHeadNum = INT_MAX; // smallest nvlsHeadNum over all nodes
+   int shared = parent && parent->nvlsSupport  && parent->shareResources;
+   NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); // nothing allocated yet, so a plain return is fine here
+   NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
+   NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
+   NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
+   NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
+   NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
+   NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
+   NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
+ 
+   // Alternate rings to avoid crossing rails
+   if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
+     for (int r=0; r<comm->nRanks; r++) {
+       if (comm->rankToNode[r] % 2 == 1) { // only ranks on odd-numbered nodes
+         // Exchange rings
+         for (int c=0; c<nChannels; c+=2) { // swap each channel pair (c, c^1) in place inside allTopoRanks[r]
+           exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
+           exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
+           exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
+           exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
+         }
+       }
+     }
+   }
+ 
+   for (int c=0; c<nChannels;c++) {
+     for (int n=0; n<nNodes; n++) { // per-node values are taken from each node's first rank
+       int r = firstRanks[n];
+       ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
+       ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
+       treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
+       treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
+       treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
+     }
+     for (int r=0; r<nranks; r++) { // prev/next are per rank
+       ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
+       ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
+     }
+   }
+ 
+   for (int n = 0; n < nNodes; n++) {
+     int r = firstRanks[n];
+     if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
+       minHeadNum = allTopoRanks[r]->nvlsHeadNum;
+   }
+ 
+   for (int c = 0; c < minHeadNum; c++) {
+     for (int n = 0; n < nNodes; n++) {
+       int r = firstRanks[n];
+       nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
+     }
+   }
+ 
+   // Connect rings and trees. This should also duplicate the channels.
+   NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail);
+   NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
+ 
+   // Duplicate ringPrev/ringNext for ncclBuildRing
+   memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); // NOTE(review): needs 2*nChannels <= MAXCHANNELS to stay inside the buffer; confirm upstream guarantee
+   memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
+ 
+   // Set ring prev/next for my rank
+   for (int c=0; c<nChannels; c++) {
+     struct ncclChannel* channel0 = comm->channels+c;
+     struct ncclChannel* channel1 = channel0+nChannels; // the duplicate of channel c
+     channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
+     channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
+   }
+ 
+   // Duplication should be complete now
+   nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
+ 
+   // Setup CollNet
+   if (comm->config.collnetEnable) {
+     struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
+     // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
+     if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
+       int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
+       nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
+     }
+     NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
+   }
+ 
+   // Use 4 compute channels per search channel to reach peak BW on <8 PPN
+   if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
+      nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
+   }
+ 
+   // Double the number of channels when using unpack networking (greater than 1 node)
+   // We won't automatically double past 16 channels, users can specify 32 if they want
+   if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
+      nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
+   }
+ 
+   // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
+   // We permit combining max, then min, to only use the first channels, then duplicate them.
+   if (comm->sharedRes->owner != comm) {
+     /* child comm #channels cannot exceed top parent #channels. */
+     nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
+     nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
+   } else {
+     nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
+     nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
+   }
+ 
+   comm->collChannels = comm->nChannels;
+ #if CUDART_VERSION >= 12010
+   // Support maximal channel usage for aggregation
+   if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
+     comm->nvlsChannels = parent->nvlsResources->nChannels;
+   }
+   if (comm->nChannels < comm->nvlsChannels) {
+     nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
+   }
+   NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
+ #endif
+   if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
+     nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
+     comm->collChannels = std::min(comm->collChannels, comm->nChannels);
+   }
+   // Keep communicator-owned copies of the final prev/next tables: the local
+   // ringPrev/ringNext buffers are freed at exit.
+   NCCLCHECKGOTO(ncclCalloc(&comm->ringPrev, comm->nRanks * MAXCHANNELS), ret, fail); // NOTE(review): overwrites any earlier comm->ringPrev without freeing it — confirm this runs once per comm
+   NCCLCHECKGOTO(ncclCalloc(&comm->ringNext, comm->nRanks * MAXCHANNELS), ret, fail);
+   // Source and destination tables all hold nranks*MAXCHANNELS ints.
+   memcpy(comm->ringPrev, ringPrev, sizeof(int)*nranks*MAXCHANNELS);
+   memcpy(comm->ringNext, ringNext, sizeof(int)*nranks*MAXCHANNELS);
+   // Create rings array and check all is fine
+   NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
+ 
+ exit:
+   if (ringRecv) free(ringRecv); // single cleanup path for success and failure
+   if (ringSend) free(ringSend);
+   if (ringPrev) free(ringPrev);
+   if (ringNext) free(ringNext);
+   if (treeToParent) free(treeToParent);
+   if (treeToChild0) free(treeToChild0);
+   if (treeToChild1) free(treeToChild1);
+   if (nvlsHeads) free(nvlsHeads);
+   return ret;
+ fail:
+   goto exit;
+ }
+ 
\ No newline at end of file
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 85e33f6..d171370 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,8 +1,8 @@
 /*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
 
 #ifndef NCCL_BOOTSTRAP_H_
 #define NCCL_BOOTSTRAP_H_
@@ -16,10 +16,69 @@ struct ncclBootstrapHandle {
 };
 static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
 
+// extern union ncclSocketAddress bootstrapNetIfAddr;
+
+struct unexConn { // Node of a singly linked list; bootstrapState::unexpectedConnections points to such a node
+  int peer; // presumably the rank the connection came from — confirm in bootstrap.cc
+  int tag; // presumably the message tag it was sent with — confirm in bootstrap.cc
+  struct ncclSocket sock; // the socket, stored by value
+  struct unexConn *next; // next node in the list
+};
+
+struct bootstrapRing_t { // Ring endpoints, stored as one of two variants sharing storage; this struct has no field saying which one is active
+  union {
+    struct {
+      void *sendComm, *recvComm; // opaque comm pointers
+      ncclNetDeviceHandle_t *sendDevHandle, *recvDevHandle;
+    } net; // variant with comm pointers and device handles
+    struct {
+      struct ncclSocket recv;
+      struct ncclSocket send;
+    } socket; // variant with a pair of ncclSocket objects
+  };
+};
+struct bootstrapListen_t { // Listening endpoints: a peer socket plus one ring endpoint (net or socket variant)
+  struct ncclSocket peerSocket; // socket for peers to contact me in P2P
+  union { // only one of net / socket is meaningful at a time; no discriminator is stored here
+    struct {
+      int dev;
+      void *comm; // opaque comm pointer
+      char handle[NCCL_NET_HANDLE_MAXSIZE]; // raw handle bytes
+    } net;
+    struct ncclSocket socket; // socket to be used for the ring
+  };
+};
+
+struct bootstrapState { // Per-communicator bootstrap state; presumably the object behind the void* commState arguments below — confirm
+  struct bootstrapRing_t ring;
+  struct bootstrapListen_t listen;
+  ncclNet_t *net; // NOTE(review): possibly what selects the net vs socket union variants — confirm in bootstrap.cc
+  uint64_t *peerProxyAddressesUDS; // array; presumably one entry per rank — confirm
+  union ncclSocketAddress *peerProxyAddresses; // array; presumably one entry per rank — confirm
+  union ncclSocketAddress *peerP2pAddresses; // array; presumably one entry per rank — confirm
+  struct unexConn *unexpectedConnections; // head of the unexConn linked list
+  int cudaDev;
+  int rank;
+  int nranks;
+  uint64_t magic;
+  volatile uint32_t *abortFlag; // pointer to a flag owned elsewhere; presumably polled to abort blocking operations — confirm
+};
+
+// typedef struct {
+//   int socket_fd;
+//   int rank_id;
+//   char ip_address[INET_ADDRSTRLEN];
+// } RankConnection;
+#define STATE_RING(s, f) ((s)->ring.f) /* field f of s->ring; s is parenthesised so any pointer expression expands correctly */
+#define STATE_LISTEN(s, f) ((s)->listen.f) /* field f of s->listen; same parenthesisation */
+ncclResult_t socketRingConnectPrev(ncclSocketAddress* addr, struct ncclSocket* sendSocket, uint64_t magic, volatile uint32_t* abortFlag);
+ncclResult_t socketRingConnectNext(struct ncclSocket* listenSock, struct ncclSocket* recvSocket);
+ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket* sendSocket, struct ncclSocket* listenSock, struct ncclSocket* recvSocket, uint64_t magic, volatile uint32_t* abortFlag);
 ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
 ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
+ncclResult_t bootstrapInitNew(ncclComm_t comm,bool isNewRank);
 ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
@@ -31,4 +90,11 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
 ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
 ncclResult_t bootstrapClose(void* commState);
 ncclResult_t bootstrapAbort(void* commState);
+ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t magic, struct ncclSocket* socket, union ncclSocketAddress* addr,
+                                      ncclSocketType type);
+                                     
+
+void printBinaryData(const char* prefix, const void* data, size_t size);
+
+
 #endif
diff --git a/src/include/channel.h b/src/include/channel.h
index ee9aa6d..21fe34b 100644
--- a/src/include/channel.h
+++ b/src/include/channel.h
@@ -12,6 +12,7 @@
 #include <algorithm>
 
 ncclResult_t initChannel(struct ncclComm* comm, int channelid);
+ncclResult_t initChannelNew(struct ncclComm* comm, int channelid);
 ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
 ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
diff --git a/src/include/comm.h b/src/include/comm.h
index 1378e07..f2a3809 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -1,8 +1,8 @@
 /*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
 
 #ifndef NCCL_COMM_H_
 #define NCCL_COMM_H_
@@ -417,8 +417,44 @@ typedef enum ncclGroupTaskType {
   ncclGroupTaskTypeNum = 2,
 } ncclGroupTaskType_t;
 
+struct ncclCommTrans { // Subset of communicator state; its fields mirror members of struct ncclComm. Presumably the form handed to a joining rank (see initTransportsNewRank) — confirm
+  struct ncclTopoRanks* peerTopo;// length nRanks
+  struct ncclPeerInfo* peerInfo; // NOTE(review): length not stated here; the commented copy in serialize.h says nRanks+1 — confirm
+  int* nodesFirstRank;// length nRanks
+  int* nodesTreePatterns;// length nRanks
+  int* ringPrev;// length nRanks*MAXCHANNELS
+  int* ringNext;// length nRanks*MAXCHANNELS
+  int* peerRings;// length nRanks*MAXCHANNELS
+  void* bootstrap; // opaque here; presumably a struct bootstrapState* — confirm
+  int nRanks;  // number of GPUs in communicator
+  int cudaDev; 
+  int cpuArch;
+  int cpuVendor;
+  int nNodes;
+  uint64_t commHash;
+  int* rankToNode; // presumably length nRanks — confirm
+  struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; // stored by value, one per algorithm
+};
+
 struct ncclComm {
   uint64_t startMagic;
+  // struct ncclTopoRanks peerTopo[8];
+  // int nodesFirstRank[16];
+  // int nodesTreePatterns[16];
+  // int ringPrev[9 * MAXCHANNELS];
+  // int ringNext[9 * MAXCHANNELS];
+  // //char peersXml[TOTAL_SIZE];
+  // int peerRings[9 * MAXCHANNELS];
+  struct ncclTopoRanks* peerTopo;// length nRanks
+  int* nodesFirstRank;// length nRanks
+  int* nodesTreePatterns;// length nRanks
+  int* ringPrev;// length nRanks*MAXCHANNELS
+  int* ringNext;// length nRanks*MAXCHANNELS
+  int* peerRings;// length nRanks*MAXCHANNELS
+  // void* bootstrap;
+  // int nRanks;  // number of GPUs in communicator
+  // int cudaDev; 
+  // int* rankToNode;
   struct ncclMemoryStack memPermanent, memScoped;
   // List of destructors to run when comm is destructed
   struct ncclDestructor* destructorHead;
@@ -515,7 +551,7 @@ struct ncclComm {
   int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
 
   /* This attribute can indicate the states of communicators and return code of
-   * asynchronous NCCL operations. */
+  * asynchronous NCCL operations. */
   ncclResult_t asyncResult;
 
   // Flag to ask NCCL kernels to abort
diff --git a/src/include/lighthouse.h b/src/include/lighthouse.h
new file mode 100644
index 0000000..bb31fc9
--- /dev/null
+++ b/src/include/lighthouse.h
@@ -0,0 +1,34 @@
+#ifndef LIGHTHOUSE_H_
+#define LIGHTHOUSE_H_
+
+#include "socket.h"
+
+#include <stdint.h>
+
+#define LH_STATE_PATH "/tmp/lighthouse_state" // fixed location under /tmp; presumably the `path` passed to the txn* functions — confirm
+
+struct LhTxn; // opaque to includers: only forward-declared in this header
+struct LhState; // opaque to includers: only forward-declared in this header
+
+int txnWaitForVersion(const char* path, uint64_t expected_version, int timeout_ms); // NOTE(review): the meaning of the int results in this header (0 on success?) is not visible here — confirm in lighthouse.cc
+int txnBegin(const char* path, int write, struct LhTxn** out); // hands back a transaction through *out
+int txnLoad(struct LhTxn* txn, struct LhState** out); // hands back a state object through *out; ownership is not visible here
+int txnSave(struct LhTxn* txn, const struct LhState* state);
+int txnEnd(struct LhTxn* txn);
+
+int initialize(struct LhState* state, const union ncclSocketAddress* src_addrs, int nranks, uint64_t magic); // NOTE(review): very generic name for a non-static function — consider a prefix to avoid link-time clashes
+void setMagic(struct LhState* state, uint64_t magic);
+int setFirstRank(struct LhState* state, const union ncclSocketAddress* firstRankNcclAddr, uint32_t rank, uint32_t nranks);
+int setLastRank(struct LhState* state, const union ncclSocketAddress* lastRankNcclAddr, uint32_t rank, uint32_t nranks);
+int setNewRank(struct LhState* state, const union ncclSocketAddress* newRankNcclAddr, uint32_t rank);
+void updateLastRankAddr(struct LhState* state);
+void updateVersion(struct LhState* state);
+
+void getMagic(const struct LhState* state, uint64_t* magic); // result written through the out-pointer
+void getVersion(const struct LhState* state, uint64_t* version); // result written through the out-pointer
+int queryNextRankAddrNew(const struct LhState* state, union ncclSocketAddress* nextAddr);
+int queryNextRankAddrLast(const struct LhState* state, union ncclSocketAddress* nextAddr);
+
+void printLhState(const struct LhState* state);
+
+#endif // LIGHTHOUSE_H_
\ No newline at end of file
diff --git a/src/include/scale.h b/src/include/scale.h
new file mode 100644
index 0000000..e6665b6
--- /dev/null
+++ b/src/include/scale.h
@@ -0,0 +1,55 @@
+#ifndef NCCL_SCALE_H_
+#define NCCL_SCALE_H_
+
+#include "core.h"
+#include <cstddef>
+#include "nccl.h"
+#include "socket.h"
+#include "bootstrap.h"
+
+#define ADDR_LIST_LEN (512) // NOTE(review): not used in this header; presumably the capacity of an address list in scale.cc/lighthouse.cc — confirm
+
+#define TIMER_INIT_TOTAL 0 // TIMER_INIT_* are indices into the timers[TIMERS_INIT_COUNT] arrays taken by the functions below
+#define TIMER_INIT_KERNELS 1
+#define TIMER_INIT_BOOTSTRAP 2
+#define TIMER_INIT_ALLGATHER 3
+#define TIMER_INIT_TOPO 4
+#define TIMER_INIT_GRAPHS 5
+#define TIMER_INIT_CONNECT 6
+#define TIMER_INIT_ALLOC 7
+#define TIMERS_INIT_COUNT 8 // number of timer slots (one more than the highest index above)
+
+typedef ncclComm_t ncclCommIncomplete_t; // plain alias of ncclComm_t: the "incomplete" distinction is not enforced by the type system
+
+struct ncclNewRankInfoInternal {
+  ncclCommIncomplete_t comm;
+};
+
+struct ncclCommTransUniqueIdInfo {
+  ncclCommIncomplete_t comm;
+  ncclUniqueId *uniqueId; // pointer only; ownership is not visible in this header
+};
+
+// in `init.cc`
+ncclResult_t ncclInit();
+// in `init.cc`
+ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config);
+// in `init.cc`
+int64_t ncclParamSetStackSize();
+// in `init.cc`
+ncclResult_t commAlloc(struct ncclComm *comm, struct ncclComm *parent, int ndev, int rank);
+// in `init.cc`
+ncclResult_t commAllocNew(struct ncclComm *comm, struct ncclComm *parent, int ndev, int rank);
+// in `init.cc`
+uint64_t hashUniqueId(ncclUniqueId const &id);
+// in `init.cc` (applies to the next two declarations)
+ncclResult_t initTransportsNewRank(struct ncclComm* comm, const struct ncclCommTrans* peerComm);
+ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]);
+// in `init.cc`
+ncclResult_t updateTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]);
+// in `reinit.cc` — NOTE(review): no reinit.cc appears in this patch's file list; confirm where this is defined
+ncclResult_t devCommResetup(ncclComm_t comm);
+// in `init.cc`
+ncclResult_t devCommSetup(ncclComm_t comm);
+
+#endif // NCCL_SCALE_H_
\ No newline at end of file
diff --git a/src/include/serialize.h b/src/include/serialize.h
new file mode 100644
index 0000000..e56e2ea
--- /dev/null
+++ b/src/include/serialize.h
@@ -0,0 +1,507 @@
+#ifndef NCCL_SERIALIZE_H_
+#define NCCL_SERIALIZE_H_
+
+#include "bootstrap.h"
+#include "scale.h"
+#include "transport.h"
+#include "graph/topo.h"  // 确保 ncclTopoGraph 定义在此头文件中
+// SERIAL_CONVERT(DST, SRC): reinterpret_cast of SRC to the declared type of DST (obtained with decltype).
+// SERIAL_ASSIGN(DST, SRC): stores that converted value into DST; the expression yields DST, so it can be passed straight on.
+#define SERIAL_CONVERT(DST, SRC) reinterpret_cast<decltype(DST)>(SRC)
+#define SERIAL_ASSIGN(DST, SRC) ((DST) = SERIAL_CONVERT(DST, SRC))
+
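+// // 1. Reference copy (commented out) of ncclCommTrans. Key change: adds the cpuArch, cpuVendor and nNodes fields. NOTE: the serializer below also writes `commHash`, which is not listed here.
+// struct ncclCommTrans {
+//   struct ncclTopoRanks* peerTopo;      // length nRanks (per-rank topology ordering info)
+//   struct ncclPeerInfo* peerInfo;       // length nRanks+1 (per-rank peer communication info)
+//   int* nodesFirstRank;                 // length nRanks (first rank of each node)
+//   int* nodesTreePatterns;              // length nRanks (per-node tree communication pattern)
+//   int* ringPrev;                       // length nRanks*MAXCHANNELS (ring predecessor)
+//   int* ringNext;                       // length nRanks*MAXCHANNELS (ring successor)
+//   int* peerRings;                      // length nRanks*MAXCHANNELS (inter-rank ring mapping)
+//   void* bootstrap;                     // points to a bootstrapState (communication bootstrap state)
+//   int nRanks;                          // total number of GPUs in the communicator (basis for the dynamic array lengths)
+//   // Three added basic fields (CPU architecture, CPU vendor, node count; all int)
+//   int cpuArch;                         // CPU architecture id (e.g. x86_64=62, ARM=123)
+//   int cpuVendor;                       // CPU vendor id (e.g. Intel=1, AMD=2, ARM=3)
+//   int nNodes;                          // total number of nodes in the communication cluster
+//   int cudaDev;                         // CUDA device number associated with the current device
+//   int* rankToNode;                     // length nRanks (rank-to-node mapping table)
+//   // Previously added fixed-size topology graph array (no pointer members)
+//   struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+// };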
+
+// Original bootstrapState struct definition (kept as a comment, for reference only)
+// struct bootstrapState {
+//   struct bootstrapRing_t ring;
+//   struct bootstrapListen_t listen;
+//   ncclNet_t *net;
+//   uint64_t *peerProxyAddressesUDS;
+//   union ncclSocketAddress *peerProxyAddresses;
+//   union ncclSocketAddress *peerP2pAddresses;
+//   struct unexConn *unexpectedConnections;
+//   int cudaDev;
+//   int rank;
+//   int nranks;
+//   uint64_t magic;
+//   volatile uint32_t *abortFlag;
+// };
+
+// Base serializer for plain, pointer-free structs: byte-copies *info into buffer and returns sizeof(info_t).
+template <typename info_t>
+inline size_t ncclInfoSerializeBase(char* buffer, const info_t* info) {
+  memcpy(buffer, info, sizeof(info_t));
+  return sizeof(info_t);
+}
+
+// Generic serializer: forwards to ncclInfoSerializeBase by default; types that own out-of-line data need a specialization.
+template <typename info_t>
+inline size_t ncclInfoSerialize(char *buffer, const info_t *info) {
+  return ncclInfoSerializeBase(buffer, info);
+}
+
+// Base deserializer: only reports sizeof(info_t); `info` is neither read nor modified. Complex types specialize the parsing.
+template <typename info_t>
+inline size_t ncclInfoDeserializeBase(info_t *info) {
+  return sizeof(info_t);
+}
+
+// Generic in-place deserializer: forwards to ncclInfoDeserializeBase by default.
+template <typename info_t>
+inline size_t ncclInfoDeserialize(info_t *info) {
+  // Returns the number of bytes the record occupies; the specializations in this header also re-point embedded pointers.
+  return ncclInfoDeserializeBase(info);
+}
+
+// Serialized-size query. Declaration only (no generic definition): every supported type must provide a specialization.
+template <typename info_t>
+inline size_t ncclInfoSerializeSize(const info_t* info);
+
+
+// ------------------------------
+// bootstrapState: serialized size. All three per-rank arrays are always counted, so this is an upper bound:
+// the bootstrapState serializer skips arrays whose pointer is null. NOTE(review): confirm callers only need a sufficient size.
+template <>
+inline size_t ncclInfoSerializeSize(const struct bootstrapState *info) {
+  size_t offset = 0;
+  // Layout: [bootstrapState][uint64_t x nranks][ncclSocketAddress x nranks][ncclSocketAddress x nranks]
+  offset += sizeof(struct bootstrapState);  // fixed-size struct image
+  offset += sizeof(uint64_t) * info->nranks;  // peerProxyAddressesUDS array
+  offset += sizeof(union ncclSocketAddress) * info->nranks;  // peerProxyAddresses array
+  offset += sizeof(union ncclSocketAddress) * info->nranks;  // peerP2pAddresses array
+  return offset;
+}
+
+// Serializes `n` consecutive elements of `arr` into `buffer` by calling ncclInfoSerialize on each element.
+// Returns the total number of bytes written (the sum of the per-element return values).
+template <typename T>
+inline size_t ncclArraySerialize(char *buffer, const T *arr, size_t n) {
+  size_t offset = 0;
+  for (size_t i = 0; i < n; i++) {
+    offset += ncclInfoSerialize(buffer + offset, arr + i);
+  }
+  return offset;
+}
+
+template <typename T>  // In-place walk over `n` serialized elements starting at `arr`; returns the total bytes covered.
+inline size_t ncclArrayDeserialize(T *arr, size_t n) {
+  size_t offset = 0;
+  char *buffer = (char *)arr;  // byte view: an element's deserializer may report more than sizeof(T)
+  for (size_t i = 0; i < n; i++) {
+    offset += ncclInfoDeserialize((T *)(buffer + offset));  // advance by the size the element reports
+  }
+  return offset;
+}
+// bootstrapState serializer. Writes the raw struct image followed by each non-null per-rank array, in the
+// order peerProxyAddressesUDS, peerProxyAddresses, peerP2pAddresses. The struct image keeps the sender's
+// pointer values; the matching deserializer only tests the three array pointers against nullptr and then
+// re-points them. Returns the number of bytes written.
+// `buffer` must provide at least ncclInfoSerializeSize(info) bytes.
+template <>
+inline size_t ncclInfoSerialize(char *buffer, const struct bootstrapState *info) {
+  size_t offset = 0;
+  // 1. Fixed-size part: byte copy of the whole struct.
+  offset += ncclInfoSerializeBase(buffer + offset, info);
+  // 2. Variable-size part: a null array is skipped entirely (no bytes are reserved for it).
+  if (info->peerProxyAddressesUDS != nullptr) {
+    const size_t size = sizeof(uint64_t) * info->nranks;
+    memcpy(buffer + offset, info->peerProxyAddressesUDS, size);
+    offset += size;
+  }
+  if (info->peerProxyAddresses != nullptr) {
+    const size_t size = sizeof(union ncclSocketAddress) * info->nranks;
+    memcpy(buffer + offset, info->peerProxyAddresses, size);
+    offset += size;
+  }
+  // NOTE(review): any other pointer members of the struct image are copied verbatim and are not fixed up
+  // by the deserializer in this header — confirm they are never dereferenced on the receiving side.
+  if (info->peerP2pAddresses != nullptr) {
+    const size_t size = sizeof(union ncclSocketAddress) * info->nranks;
+    memcpy(buffer + offset, info->peerP2pAddresses, size);
+    offset += size;
+  }
+  return offset;
+}
+
+// template <>
+// inline size_t ncclInfoSerialize(char *buffer, const struct bootstrapState *info) {
+//   size_t offset = 0;
+//   offset += ncclInfoSerializeBase(buffer + offset, info);
+//   // //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[2], sizeof(ncclSocketAddress));
+//   //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[1], sizeof(ncclSocketAddress));
+//   //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[0], sizeof(ncclSocketAddress));
+  
+//   offset += ncclArraySerialize(buffer + offset, info->peerProxyAddressesUDS, info->nranks);
+//   offset += ncclArraySerialize(buffer + offset, info->peerProxyAddresses, info->nranks);
+//   size_t offset1 = offset;
+//   offset += ncclArraySerialize(buffer + offset, info->peerP2pAddresses, info->nranks);
+//     //printBinaryData("ncclInfoSerialize bootstrapState buffer:", buffer + offset1, sizeof(ncclSocketAddress));
+
+//   return offset;
+// }
+
+
+// ------------------------------
+// ncclComm: byte size of the image written by ncclInfoSerialize(char*, const ncclComm*).
+// ------------------------------
+// Accounted layout, in the serializer's write order:
+//   ncclCommTrans header            sizeof(ncclCommTrans)
+//   peerRings                       nRanks * MAXCHANNELS * sizeof(int)
+//   peerTopo                        nRanks * sizeof(ncclTopoRanks)
+//   nodesFirstRank                  nRanks * sizeof(int)
+//   nodesTreePatterns               nRanks * sizeof(int)
+//   ringPrev                        nRanks * MAXCHANNELS * sizeof(int)
+//   ringNext                        nRanks * MAXCHANNELS * sizeof(int)
+//   rankToNode                      nRanks * sizeof(int)
+//   peerInfo                        (nRanks + 1) * sizeof(ncclPeerInfo)
+//   bootstrapState image            ncclInfoSerializeSize(bootstrap), only if info->bootstrap != nullptr
+//
+// Notes:
+//  - Array slots are counted whether or not the source pointer is null, because the serializer
+//    advances past a slot even when it has nothing to copy into it.
+//  - The bootstrap term is an upper bound: the bootstrapState size counts all three per-rank arrays,
+//    while its serializer omits arrays whose pointer is null.
+//  - With nRanks == 0 only the header is counted (see below).
+//
+// Returns the number of bytes a buffer must provide for the serializer.
+template <>
+inline size_t ncclInfoSerializeSize(const ncclComm* info) {
+  size_t total = 0;
+  total += sizeof(ncclCommTrans);
+
+  // This must mirror the serializer and the deserializer, which both stop right after the header
+  // when nRanks == 0. Counting peerInfo or the bootstrap image in that case would make the size
+  // disagree with the bytes actually written, and the serializer's assert(offset == totalSize)
+  // on its early-return path could not hold.
+  if (info->nRanks == 0) {
+    return total;
+  }
+
+  // Per-rank arrays (guard kept so that a non-positive nRanks adds nothing here).
+  if (info->nRanks > 0) {
+    // peerRings
+    total += info->nRanks * MAXCHANNELS * sizeof(int);
+    // peerTopo
+    total += info->nRanks * sizeof(ncclTopoRanks);
+    // nodesFirstRank
+    total += info->nRanks * sizeof(int);
+    // nodesTreePatterns
+    total += info->nRanks * sizeof(int);
+    // ringPrev
+    total += info->nRanks * MAXCHANNELS * sizeof(int);
+    // ringNext
+    total += info->nRanks * MAXCHANNELS * sizeof(int);
+    // rankToNode
+    total += info->nRanks * sizeof(int);
+  }
+
+  // peerInfo holds one entry more than nRanks.
+  total += (info->nRanks + 1) * sizeof(ncclPeerInfo);
+
+  // bootstrapState image (reuses the bootstrapState size specialization).
+  if (info->bootstrap != nullptr) {
+    total += ncclInfoSerializeSize(static_cast<const bootstrapState*>(info->bootstrap));
+  }
+  return total;
+}
+
+// ncclComm serializer: packs the communicator state described by ncclCommTrans into `buffer`.
+// Wire layout (no padding between parts):
+//   [ncclCommTrans header][peerRings][peerTopo][nodesFirstRank][nodesTreePatterns]
+//   [ringPrev][ringNext][rankToNode][peerInfo, nRanks+1 entries][bootstrapState image + its arrays]
+// The header stores the sender's raw pointer values; the deserializer uses them only as null/non-null
+// flags before re-pointing them into the received buffer. Every array slot except the bootstrap image is
+// reserved even when its source pointer is null (those bytes are then left unwritten).
+// `buffer` must provide at least ncclInfoSerializeSize(info) bytes. Returns the number of bytes consumed.
+template <>
+inline size_t ncclInfoSerialize(char* buffer, const ncclComm* info) {
+  size_t offset = 0;
+  const size_t totalSize = ncclInfoSerializeSize(info);  // expected size; only consulted by the assert on the nRanks == 0 path
+  ncclCommTrans *commTrans = (ncclCommTrans *)buffer;
+  offset += sizeof(ncclCommTrans);
+
+  // 1. Scalar header fields. nRanks matters most: every array length on the receiving side derives from it.
+  memcpy(&commTrans->nRanks, &info->nRanks, sizeof(info->nRanks));
+  memcpy(&commTrans->cpuArch, &info->cpuArch, sizeof(info->cpuArch));
+  memcpy(&commTrans->cpuVendor, &info->cpuVendor, sizeof(info->cpuVendor));
+  memcpy(&commTrans->nNodes, &info->nNodes, sizeof(info->nNodes));
+  memcpy(&commTrans->cudaDev, &info->cudaDev, sizeof(info->cudaDev));
+  // graphs is copied as one flat block (per the original note: a fixed-size array without pointer members).
+  memcpy(&commTrans->graphs, info->graphs, sizeof(info->graphs));
+  memcpy(&commTrans->commHash, &info->commHash, sizeof(info->commHash));
+  // Pointer fields: sender-side addresses, kept only as presence markers for the deserializer.
+  commTrans->peerTopo = info->peerTopo;
+  commTrans->peerInfo = info->peerInfo;
+  commTrans->peerRings = info->peerRings;
+  commTrans->bootstrap = info->bootstrap;
+  commTrans->rankToNode = info->rankToNode;
+  commTrans->nodesFirstRank = info->nodesFirstRank;
+  commTrans->nodesTreePatterns = info->nodesTreePatterns;
+  commTrans->ringPrev = info->ringPrev;
+  commTrans->ringNext = info->ringNext;
+
+  // Header-only image when the communicator has no ranks.
+  if (info->nRanks == 0) {
+    assert(offset == totalSize);
+    return offset;
+  }
+
+  // 2. Variable-length arrays, each in its own slot of nRanks-dependent size.
+  // peerRings: nRanks * MAXCHANNELS ints
+  size_t size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->peerRings != nullptr) {
+    memcpy(buffer + offset, info->peerRings, size);
+  }
+  offset += size;
+  // peerTopo: nRanks ncclTopoRanks
+  size = info->nRanks * sizeof(ncclTopoRanks);
+  if (info->peerTopo != nullptr) {
+    memcpy(buffer + offset, info->peerTopo, size);
+  }
+  offset += size;
+  // nodesFirstRank: nRanks ints
+  size = info->nRanks * sizeof(int);
+  if (info->nodesFirstRank != nullptr) {
+    memcpy(buffer + offset, info->nodesFirstRank, size);
+  }
+  offset += size;
+  // nodesTreePatterns: nRanks ints
+  size = info->nRanks * sizeof(int);
+  if (info->nodesTreePatterns != nullptr) {
+    memcpy(buffer + offset, info->nodesTreePatterns, size);
+  }
+  offset += size;
+  // ringPrev: nRanks * MAXCHANNELS ints
+  size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->ringPrev != nullptr) {
+    memcpy(buffer + offset, info->ringPrev, size);
+  }
+  offset += size;
+  // ringNext: nRanks * MAXCHANNELS ints
+  size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->ringNext != nullptr) {
+    memcpy(buffer + offset, info->ringNext, size);
+  }
+  offset += size;
+  // rankToNode: nRanks ints
+  size = info->nRanks * sizeof(int);
+  if (info->rankToNode != nullptr) {
+    memcpy(buffer + offset, info->rankToNode, size);
+  }
+  offset += size;
+  // 3. peerInfo: nRanks + 1 ncclPeerInfo entries
+  size = (info->nRanks + 1) * sizeof(ncclPeerInfo);
+  if (info->peerInfo != nullptr) {
+    memcpy(buffer + offset, info->peerInfo, size);
+  }
+  offset += size;
+
+  // 4. bootstrapState image; its specialization appends the per-rank arrays it owns.
+  if (info->bootstrap != nullptr) {
+    offset += ncclInfoSerialize(buffer + offset, static_cast<const bootstrapState*>(info->bootstrap));
+  }
+
+  // The final size check is disabled — presumably because totalSize can exceed offset: the bootstrapState
+  // size counts all three per-rank arrays while its serializer skips arrays whose pointer is null.
+  return offset;
+}
+
+
+// ------------------------------
+// In-place deserializers: nothing is copied. Embedded pointers are re-pointed into the received buffer,
+// so that buffer must stay alive for as long as the resulting structures are used.
+// ------------------------------
+// bootstrapState: `info` addresses a raw bootstrapState image that is immediately followed by the per-rank
+// arrays the serializer emitted. A pointer field that is non-null in the image (the sender's address)
+// marks its array as present and is overwritten with the array's location in this buffer.
+// Returns the number of bytes the image plus its arrays occupy.
+// NOTE(review): no buffer length is passed in, so info->nranks and the pointer flags are trusted as
+// received; any other pointer members of the image are left holding sender-side values.
+template <>
+inline size_t ncclInfoDeserialize(struct bootstrapState *info) {
+  size_t offset = 0;
+  char *buffer = (char *)info;
+
+  // 1. Fixed-size part: used in place, only skipped over.
+  offset += ncclInfoDeserializeBase(info);
+
+  // 2. Per-rank arrays, in serialization order; info->nranks comes from the received struct image.
+  // peerProxyAddressesUDS: nranks x uint64_t
+  if (info->peerProxyAddressesUDS != nullptr) {
+    info->peerProxyAddressesUDS = reinterpret_cast<decltype(info->peerProxyAddressesUDS)>(buffer + offset);
+    offset += sizeof(uint64_t) * info->nranks;
+  }
+  // peerProxyAddresses: nranks x ncclSocketAddress
+  if (info->peerProxyAddresses != nullptr) {
+    info->peerProxyAddresses = reinterpret_cast<decltype(info->peerProxyAddresses)>(buffer + offset);
+    offset += sizeof(union ncclSocketAddress) * info->nranks;
+  }
+  // peerP2pAddresses: nranks x ncclSocketAddress
+  if (info->peerP2pAddresses != nullptr) {
+    info->peerP2pAddresses = reinterpret_cast<decltype(info->peerP2pAddresses)>(buffer + offset);
+    offset += sizeof(union ncclSocketAddress) * info->nranks;
+  }
+
+  return offset;
+}
+
+// template <>
+// inline size_t ncclInfoDeserialize(struct bootstrapState *info) {
+//   size_t offset = 0;
+//   char *buffer = (char *)info;
+//   offset += ncclInfoDeserializeBase(info);
+//   offset += ncclArrayDeserialize(SERIAL_ASSIGN(info->peerProxyAddressesUDS, buffer + offset), info->nranks);
+//   offset += ncclArrayDeserialize(SERIAL_ASSIGN(info->peerProxyAddresses, buffer + offset), info->nranks);
+//   //printBinaryData("ncclInfoDeserialize bootstrapState peerP2pAddresses hujiao:", buffer + offset, sizeof(ncclSocketAddress));
+//   ////printBinaryData("ncclInfoDeserialize bootstrapState peerP2pAddresses hujiao2:", &info->peerP2pAddresses[0], sizeof(ncclSocketAddress));
+//   offset += ncclArrayDeserialize(SERIAL_ASSIGN(info->peerP2pAddresses, buffer + offset), info->nranks);
+//   //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[2], sizeof(ncclSocketAddress));
+//   //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[1], sizeof(ncclSocketAddress));
+//   //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[0], sizeof(ncclSocketAddress));
+//   return offset;
+// }
+
+// ncclCommTrans in-place deserializer.
+//
+// `info` addresses the start of a buffer produced by ncclInfoSerialize(char*, const ncclComm*):
+//   [ncclCommTrans header][peerRings][peerTopo][nodesFirstRank][nodesTreePatterns]
+//   [ringPrev][ringNext][rankToNode][peerInfo, nRanks+1 entries][bootstrapState image + its arrays]
+//
+// Each header pointer that arrived non-null (it still holds the sender's address) is re-pointed at its
+// array inside the buffer; a pointer that arrived null stays null, but its slot is still skipped. The
+// bootstrap image is the exception: it is only present when info->bootstrap arrived non-null.
+// Nothing is copied, so the buffer must outlive `info`. Returns the number of bytes consumed.
+// NOTE(review): no buffer length is available here, so nRanks and the pointer flags are trusted as
+// received. Offsets are not padded, so peerInfo and the bootstrap image may land on addresses that are
+// not suitably aligned for their types (depends on nRanks and the element sizes) — confirm.
+template <>
+inline size_t ncclInfoDeserialize(struct ncclCommTrans *info) {
+  size_t offset = 0;
+  char *buffer = (char *)info;  // base address of the received image; all offsets below are relative to it
+  offset += sizeof(ncclCommTrans);
+  INFO(NCCL_INIT,"ncclInfoDeserialize info->nRanks: %d", info->nRanks);
+
+  // Header-only image: the serializer writes nothing after the header when there are no ranks.
+  if (info->nRanks == 0) {
+    return offset;
+  }
+
+  // Per-rank arrays, in serialization order.
+  // peerRings: nRanks * MAXCHANNELS ints
+  size_t size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->peerRings != nullptr) {
+    info->peerRings = reinterpret_cast<decltype(info->peerRings)>(buffer + offset);
+  }
+  offset += size;
+
+  // peerTopo: nRanks ncclTopoRanks
+  size = info->nRanks * sizeof(struct ncclTopoRanks);
+  if (info->peerTopo != nullptr) {
+    info->peerTopo = reinterpret_cast<decltype(info->peerTopo)>(buffer + offset);
+  }
+  offset += size;
+  // offset is a size_t, so it is printed with %zu (it was %ld, which does not match the argument type).
+  INFO(NCCL_INIT,"ncclInfoDeserialize offset %zu", offset);
+
+  // nodesFirstRank: nRanks ints
+  // (the pointer value is logged before and after it is re-pointed)
+  size = info->nRanks * sizeof(int);
+  INFO(NCCL_INIT,"info->nodesFirstRank %p", info->nodesFirstRank);
+  if (info->nodesFirstRank != nullptr) {
+    INFO(NCCL_INIT,"ncclInfoDeserialize info->nodesFirstRank: %p", info->nodesFirstRank);
+    info->nodesFirstRank = reinterpret_cast<decltype(info->nodesFirstRank)>(buffer + offset);
+    INFO(NCCL_INIT,"ncclInfoDeserialize info->nodesFirstRank: %p", info->nodesFirstRank);
+  }
+  offset += size;
+
+  // nodesTreePatterns: nRanks ints
+  size = info->nRanks * sizeof(int);
+  if (info->nodesTreePatterns != nullptr) {
+    info->nodesTreePatterns = reinterpret_cast<decltype(info->nodesTreePatterns)>(buffer + offset);
+    INFO(NCCL_INIT,"ncclInfoDeserialize info->nodesTreePatterns: %p", info->nodesTreePatterns);
+  }
+  offset += size;
+
+  // ringPrev: nRanks * MAXCHANNELS ints
+  size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->ringPrev != nullptr) {
+    info->ringPrev = reinterpret_cast<decltype(info->ringPrev)>(buffer + offset);
+  }
+  offset += size;
+
+  // ringNext: nRanks * MAXCHANNELS ints
+  size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->ringNext != nullptr) {
+    info->ringNext = reinterpret_cast<decltype(info->ringNext)>(buffer + offset);
+  }
+  offset += size;
+
+  // rankToNode: nRanks ints
+  size = info->nRanks * sizeof(int);
+  if (info->rankToNode != nullptr) {
+    info->rankToNode = reinterpret_cast<decltype(info->rankToNode)>(buffer + offset);
+    INFO(NCCL_INIT,"ncclInfoDeserialize info->rankToNode: %p", info->rankToNode);
+  }
+  offset += size;
+
+  // peerInfo: nRanks + 1 ncclPeerInfo entries
+  size = (info->nRanks + 1) * sizeof(ncclPeerInfo);
+  if (info->peerInfo != nullptr) {
+    info->peerInfo = reinterpret_cast<decltype(info->peerInfo)>(buffer + offset);
+  }
+  offset += size;
+
+  // bootstrapState image: SERIAL_ASSIGN first re-points info->bootstrap at the image inside the
+  // buffer, then the bootstrapState deserializer fixes up that image's own per-rank array pointers
+  // and reports how many bytes the image and its arrays occupy.
+  if (info->bootstrap != nullptr) {
+    offset += ncclInfoDeserialize((bootstrapState*)SERIAL_ASSIGN(info->bootstrap, buffer + offset));
+  }
+
+  return offset;
+}
+
+
+// Original ncclCommTransUniqueIdInfo serialize/deserialize logic (kept commented out; the user confirmed it does not need to be enabled)
+// template <>
+// inline size_t ncclInfoSerialize(char *buffer, const ncclCommTransUniqueIdInfo *info) {
+//   size_t offset = 0;
+//   offset += ncclInfoSerializeBase(buffer + offset, info);
+//   offset += ncclInfoSerialize(buffer + offset, info->commTrans);
+//   offset += ncclInfoSerialize(buffer + offset, info->uniqueId);
+//   return offset;
+// }
+
+// template <>
+// inline size_t ncclInfoDeserialize(struct ncclCommTransUniqueIdInfo *info) {
+//   size_t offset = 0;
+//   char *buffer = (char *)info;
+//   offset += ncclInfoDeserializeBase(info);
+//   offset += ncclInfoDeserialize(SERIAL_ASSIGN(info->commTrans, buffer + offset));
+//   offset += ncclInfoDeserialize(SERIAL_ASSIGN(info->uniqueId, buffer + offset));
+//   return offset;
+// }
+
+
+#endif // NCCL_SERIALIZE_H_
\ No newline at end of file
diff --git a/src/init.cc b/src/init.cc
index af784c0..ab0a2ba 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1,8 +1,8 @@
 /*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
 
 #include "nccl.h"
 #include "channel.h"
@@ -31,6 +31,10 @@
 #include "param.h"
 #include "nvtx_payload_schemas.h"
 #include "utils.h"
+#include "net.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
 
 #define STR2(v) #v
 #define STR(v) STR2(v)
@@ -41,6 +45,21 @@
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
 #endif
 
+// struct ncclCommTrans {
+//   struct ncclTopoRanks* peerTopo;// length nRanks
+//   struct ncclPeerInfo* peerInfo;
+//   int* nodesFirstRank;// length nRanks
+//   int* nodesTreePatterns;// length nRanks
+//   int* ringPrev;// length nRanks*MAXCHANNELS
+//   int* ringNext;// length nRanks*MAXCHANNELS
+//   int* peerRings;// length nRanks*MAXCHANNELS
+//   void* bootstrap;
+//   int nRanks;  // number of GPUs in communicator
+//   int cudaDev;
+//   int* rankToNode;
+//   struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+// };
+
 const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
 const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree", "PAT" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
@@ -83,7 +102,7 @@ static void initOnceFunc() {
 exit:;
 }
 
-static ncclResult_t ncclInit() {
+ncclResult_t ncclInit() {
   pthread_once(&initOnceControl, initOnceFunc);
   return initResult;
 }
@@ -187,8 +206,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
   NCCLCHECK(ncclRasCommFini(comm));
 
   /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will
-   * free all intra-process communicators; therefore, we only need to focus on local
-   * resource cleanup in commFree(). */
+  * free all intra-process communicators; therefore, we only need to focus on local
+  * resource cleanup in commFree(). */
   if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
     PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join");
     if (comm->proxyState->threadUDS) {
@@ -326,7 +345,7 @@ exit:
   return ret;
 }
 
-static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
+ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
   if (ndev < 1) {
     WARN("invalid device count (%d) requested", ndev);
     return ncclInvalidArgument;
@@ -433,7 +452,227 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
   return ncclSuccess;
 }
 
-static ncclResult_t devCommSetup(ncclComm_t comm) {
+// Initializes the host-side fields of `comm` (same signature as commAlloc()) without setting up shared
+// resources: validates (ndev, rank), constructs the memory stacks/pools and intrusive queues,
+// initializes the network (ncclNetInit), records the current CUDA device, bus id, NVML index and
+// compute capability, allocates connectSend/connectRecv, marks every channel as uninitialized,
+// fills topParentRanks with the identity mapping when it is unset, and creates the CUDA memory pool.
+//   comm   : communicator storage to initialize.
+//   parent : optional; only consulted to reject a resource-sharing parent that uses a different net.
+//   ndev   : number of ranks (must be >= 1); rank: this rank (0 <= rank < ndev).
+// Returns ncclSuccess, ncclInvalidArgument / ncclInvalidUsage, or an error propagated by NCCLCHECK/CUDACHECK.
+ncclResult_t commAllocNew(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
+  if (ndev < 1) {
+    WARN("invalid device count (%d) requested", ndev);
+    return ncclInvalidArgument;
+  }
+  if (rank >= ndev || rank < 0) {
+    WARN("rank %d exceeds ndev=%d", rank, ndev);
+    return ncclInvalidArgument;
+  }
+
+  ncclMemoryStackConstruct(&comm->memPermanent);
+  ncclMemoryStackConstruct(&comm->memScoped);
+  comm->destructorHead = nullptr;
+  comm->rank = rank;
+  comm->nRanks = ndev;
+
+  NCCLCHECK(ncclNetInit(comm));
+  INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
+
+  if (parent && parent->shareResources) {
+    if (parent->ncclNet != comm->ncclNet) {
+      WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name);
+      return ncclInvalidUsage;
+    }
+  }
+  // Try to create a CUDA object right away. If there is something wrong with
+  // the device we're on (failure cause #1) , better know it early.
+  CUDACHECK(cudaGetDevice(&comm->cudaDev));
+
+  NCCLCHECK(ncclCudaContextTrack(&comm->context));
+
+  NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
+  nvmlDevice_t nvmlDev;
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  NCCLCHECK(int64ToBusId(comm->busId, busId));
+  NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
+  NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&comm->nvmlDev));
+
+  comm->compCap = ncclCudaCompCap();
+  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
+
+  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
+  comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
+
+  memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix));
+
+  ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
+  ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
+
+  for (int i = 0; i < ncclGroupTaskTypeNum; i++) {
+    comm->groupNext[i] = reinterpret_cast<struct ncclComm*>(0x1);
+  }
+  comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
+
+  static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
+  static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
+  NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
+
+  // Mark channels as non initialized.
+  for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1;
+
+  // The shared-resources setup is disabled (commented out) in this variant, so comm->sharedRes is
+  // not assigned here. The disabled code did the following:
+  //   - parent == NULL || !parent->shareResources: calloc an ncclSharedResources (owner = comm,
+  //     tpNRanks = nRanks, refCount = 1), calloc tpRankToLocalRank, construct deviceStream/hostStream,
+  //     create launchEvent/scratchEvent (cudaEventDisableTiming), then comm->sharedRes = sharedRes;
+  //   - otherwise: comm->sharedRes = parent->sharedRes plus an atomic refCount increment.
+  //   (Its own note: most sharedRes attributes are assigned later in initTransportsRank().)
+  // NOTE(review): callers must provide comm->sharedRes by other means before it is used — confirm.
+
+  if (comm->topParentRanks == NULL) {
+    NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks));
+    for (int i = 0; i < comm->nRanks; ++i)
+      comm->topParentRanks[i] = i;
+  }
+
+  ncclIntruQueueMpscConstruct(&comm->callbackQueue);
+  ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
+
+  comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
+
+  do {
+    cudaMemPoolProps props = {};
+    props.allocType = cudaMemAllocationTypePinned;
+    props.handleTypes = cudaMemHandleTypeNone;
+    props.location.type = cudaMemLocationTypeDevice;
+    props.location.id = comm->cudaDev;
+    CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props));
+    uint64_t releaseThreshold = ~uint64_t(0);
+    CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold));
+  } while (0);
+
+  ncclIntruQueueConstruct(&comm->eventCallbackQueue);
+
+  return ncclSuccess;
+}
+
+// ncclResult_t commAllocNew(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
+//   INFO(NCCL_INIT,"a11111111111");
+//   if (ndev < 1) {
+//     WARN("invalid device count (%d) requested", ndev);
+//     return ncclInvalidArgument;
+//   }
+//   if (rank >= ndev || rank < 0) {
+//     WARN("rank %d exceeds ndev=%d", rank, ndev);
+//     return ncclInvalidArgument;
+//   }
+
+//   ncclMemoryStackConstruct(&comm->memPermanent);
+//   ncclMemoryStackConstruct(&comm->memScoped);
+//   comm->destructorHead = nullptr;
+//   comm->rank = rank;
+//   comm->nRanks = ndev;
+//   INFO(NCCL_INIT,"a11111111111");
+//   NCCLCHECK(ncclNetPluginLoad(comm));
+//   NCCLCHECK(ncclNetInit(comm));
+//   NCCLCHECK(ncclProfilerPluginInit(comm));
+//   INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
+
+//   if (parent && parent->config.splitShare) {
+//     if (parent->ncclNet != comm->ncclNet) {
+//       WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name);
+//       return ncclInvalidUsage;
+//     }
+//   }
+//   // Try to create a CUDA object right away. If there is something wrong with
+//   // the device we're on (failure cause #1) , better know it early.
+//   CUDACHECK(cudaGetDevice(&comm->cudaDev));
+
+//   NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
+//   INFO(NCCL_INIT,"a222222222");
+//   nvmlDevice_t nvmlDev;
+//   char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+//   NCCLCHECK(int64ToBusId(comm->busId, busId));
+//   NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
+//   NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&comm->nvmlDev));
+
+//   comm->compCap = ncclCudaCompCap();
+//   TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
+
+//   comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
+//   comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
+
+//   comm->collNetSupport = 0;
+//   memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix));
+//   INFO(NCCL_INIT,"a333333333");
+//   ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
+//   ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
+
+//   comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
+//   comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
+
+//   static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
+//   static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
+//   NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
+//   NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
+
+//   // Mark channels as non initialized.
+//   for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1;
+//   //hjx
+//   if (parent == NULL || !parent->config.splitShare) {
+//     struct ncclSharedResources* sharedRes = NULL;
+//     NCCLCHECK(ncclCalloc(&sharedRes, 1));
+//     /* most of attributes are assigned later in initTransportsRank(). */
+//     sharedRes->owner = comm;
+//     sharedRes->tpNRanks = comm->nRanks;
+//     NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks));
+//     NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream));
+//     NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream));
+//     comm->sharedRes = sharedRes;
+//     sharedRes->refCount = 1;
+//   } else {
+//     comm->sharedRes = parent->sharedRes;
+//     ncclAtomicRefCountIncrement(&parent->sharedRes->refCount);
+//   }
+//   INFO(NCCL_INIT,"a444444444");
+
+//   // if (comm->topParentRanks == NULL) {
+//   //   NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks));
+//   //   for (int i = 0; i < comm->nRanks; ++i)
+//   //     comm->topParentRanks[i] = i;
+//   // }
+
+//   ncclIntruQueueMpscConstruct(&comm->callbackQueue);
+//   ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
+
+//   comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
+//   INFO(NCCL_INIT,"a5555555555");
+//   do {
+//     cudaMemPoolProps props = {};
+//     props.allocType = cudaMemAllocationTypePinned;
+//     props.handleTypes = cudaMemHandleTypeNone;
+//     props.location.type = cudaMemLocationTypeDevice;
+//     props.location.id = comm->cudaDev;
+//     CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props));
+//     uint64_t releaseThreshold = ~uint64_t(0);
+//     CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold));
+//   } while (0);
+//   INFO(NCCL_INIT,"a666666666");
+//   ncclIntruQueueConstruct(&comm->eventCallbackQueue);
+  
+
+//   //  setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator
+//   comm->intraComm0 = comm;
+//   comm->intraRank = 0;
+//   comm->intraRanks = 1;
+
+//   return ncclSuccess;
+// }
+
+ncclResult_t devCommSetup(ncclComm_t comm) {
   ncclResult_t ret = ncclSuccess;
   int nRanks = comm->nRanks;
   struct ncclDevCommAndChannels tmpCommAndChans;
@@ -588,9 +827,9 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
       }
       if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
       INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x",
-           info->busId,
-           ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
-           info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
+          info->busId,
+          ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
+          info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
     }
   }
 
@@ -615,6 +854,25 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
   return ncclSuccess;
 }
 
+ncclResult_t setupChannelNew(struct ncclComm* comm, int channelId, int rank, int nRanks, int* ringRanks) {
+  // Initialise channel `channelId`, then store its ring order rotated so that `rank` comes first.
+  INFO(NCCL_INIT, "rank %d nRanks %d", rank, nRanks);
+  NCCLCHECK(initChannelNew(comm, channelId));
+  struct ncclRing* ring = &comm->channels[channelId].ring;
+  // Locate rank zero and this rank inside ringRanks (both positions default to 0 when not found).
+  int zeroPos = 0, selfPos = 0;
+  for (int pos = 0; pos < nRanks; ++pos) {
+    if (ringRanks[pos] == 0) zeroPos = pos;
+    if (ringRanks[pos] == rank) selfPos = pos;
+  }
+  // ring->index is our ring-distance from rank zero; userRanks is ringRanks rotated to start at rank.
+  ring->index = (selfPos - zeroPos + nRanks) % nRanks;
+  for (int k = 0; k < nRanks; ++k) {
+    ring->userRanks[k] = ringRanks[(selfPos + k) % nRanks];
+  }
+  return ncclSuccess;
+}
+
 #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
 #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
 #define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
@@ -670,10 +928,1294 @@ NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
 #define TIMER_INIT_ALLOC 7
 #define TIMERS_INIT_COUNT 8
 
-static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
-  // We use 2 AllGathers
-  // 1. { peerInfo, comm, compCap}
-  // 2. { nChannels, graphInfo, topoRanks }
+ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
+  // Two bootstrap AllGathers drive init: #1 {peerInfo, comm, compCap}, #2 {nChannels, graphInfo, topoRanks}.
+  // Around them: topology/graph search, node and local-rank maps, p2p schedule, channel setup.
+  // Every failure must go through fail/exit so scratch arrays are freed and CPU affinity is restored.
+  ncclResult_t ret = ncclSuccess;
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  int nNodes = 1;
+  cpu_set_t affinitySave;
+  struct ncclTopoGraph* ringGraph = &comm->graphs[NCCL_ALGO_RING];
+  struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE];
+  struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
+  struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
+  struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
+  struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
+
+  struct graphInfo {
+    int pattern;
+    int nChannels;
+    int sameChannels;
+    float bwIntra;
+    float bwInter;
+    int typeIntra;
+    int typeInter;
+    int crossNic;
+  };
+
+  struct allGatherInfo {
+    struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
+    struct ncclTopoRanks topoRanks;
+    int cpuArch;
+    int cpuVendor;
+    int localRanks;
+  };
+
+  int nChannelsOrig;
+  struct allGatherInfo *allGather3Data = NULL;
+  struct ncclTopoRanks** allTopoRanks = NULL;
+  int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
+  int *rings = NULL;
+  int* nvbPeers = NULL;
+  struct ncclProxyConnector proxyConn;
+  int* pxnPeers = NULL;
+  int *topParentLocalRanks = NULL;
+  int p2pLevel = -1;
+
+  timers[TIMER_INIT_ALLGATHER] = clockNano();
+  // AllGather1 - begin
+  NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
+  NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
+  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
+  __atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE);
+
+  comm->cuMemSupport = 1;
+  for (int i = 0; i < nranks; i++) {
+    if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
+      WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
+          i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
+      ret = ncclInvalidUsage;
+      goto fail;
+    }
+    if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
+    if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
+    if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+      ret = ncclInvalidUsage;
+      goto fail;
+    }
+  }
+  // AllGather1 - end
+  timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
+
+  // Check for MNNVL support
+  NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail);
+  if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) {
+    NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
+  }
+
+  do {
+    // Compute intra-process ranks
+    int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+    for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
+    for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
+
+    comm->nvlsRegSupport = 1;
+    for (int i = 0; i < nranks; i++) {
+      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+          && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+        // Rank is in same process
+        if (intraProcRanks == 0) intraProcRank0 = i;
+        if (i == rank) intraProcRank = intraProcRanks;
+        intraProcRanks++;
+        if (intraProcRank0 == rank && rank != i) {
+          comm->peerInfo[i].comm->intraNext = comm->intraNext;
+          comm->intraNext = comm->peerInfo[i].comm;
+        }
+      }
+
+      if (comm->nvlsRegSupport) {
+        for (int j = i + 1; j < nranks; j++) {
+          if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash &&
+            comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) {
+            comm->nvlsRegSupport = 0;
+            break;
+          }
+        }
+      }
+    }
+
+    // Buffer Registration is not supported with MNNVL
+    if (comm->MNNVL) comm->nvlsRegSupport = 0;
+
+    TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+        rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+    if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+      WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+          rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+          intraProcRank, intraProcRanks, intraProcRank0);
+      ret = ncclInternalError;
+      goto fail;
+    }
+    struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
+    assert(intraProcRank==0 ? comm==comm0 : true);
+    comm->intraComm0 = comm0;
+    comm->intraRank = intraProcRank;
+    comm->intraRanks = intraProcRanks;
+    comm->intraBarrierPhase = 0;
+    comm->intraBarrierCounter = 0;
+    comm->intraBarrierGate = 0;
+  } while(0);
+
+  timers[TIMER_INIT_TOPO] = clockNano();
+
+  // Dump XML if requested by user
+  const char* dumpXmlFile;
+  dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
+  if (dumpXmlFile) {
+    NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
+  }
+
+  // Topo detection / System graph creation
+  NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
+  // Compute paths between GPUs and NICs
+  NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
+  // Remove inaccessible GPUs and unused NICs
+  NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail);
+  // Recompute paths after trimming
+  NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
+  // Init search
+  NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail);
+  // Decide on comm's CPU architecture.
+  NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail);
+  // Print final topology
+  NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
+  timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO];
+
+  // Set affinity to a CPU local to our GPU, so that all memory we allocate
+  // on the host is local. The saved mask is restored at the exit label.
+  NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail);
+  if (CPU_COUNT(&comm->cpuAffinity)) {
+    sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+    sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  }
+
+  // Determine local CollNet support
+  if (!collNetSupport(comm)) {
+    comm->config.collnetEnable = 0;
+  }
+
+  // Determine local Nvls support (must not early-return: affinity was changed above)
+  NCCLCHECKGOTO(ncclNvlsInit(comm), ret, fail);
+
+  timers[TIMER_INIT_GRAPHS] = clockNano();
+  // Get rings and trees
+  memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
+  ringGraph->id = 0;
+  ringGraph->pattern = NCCL_TOPO_PATTERN_RING;
+  ringGraph->minChannels = 1;
+  ringGraph->maxChannels = MAXCHANNELS/2;
+  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail);
+
+  memset(treeGraph, 0, sizeof(struct ncclTopoGraph));
+  treeGraph->id = 1;
+  treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
+  treeGraph->minChannels = ringGraph->nChannels;
+  treeGraph->maxChannels = ringGraph->nChannels;
+  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail);
+
+  memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph));
+  collNetChainGraph->id = 2;
+  collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE;
+  collNetChainGraph->collNet = 1;
+  collNetChainGraph->minChannels = ringGraph->nChannels;
+  collNetChainGraph->maxChannels = ringGraph->nChannels;
+
+  memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
+  collNetDirectGraph->id = 4;
+  collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
+  collNetDirectGraph->collNet = 1;
+  collNetDirectGraph->minChannels = 1;
+  collNetDirectGraph->maxChannels = MAXCHANNELS;
+  if (comm->config.collnetEnable) {
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail);
+  }
+
+  memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph));
+  nvlsGraph->id = 3;
+  nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS;
+  nvlsGraph->minChannels = 1;
+  nvlsGraph->maxChannels = MAXCHANNELS;
+  if (comm->nvlsSupport) {
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
+  }
+  timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];
+
+  // Initialize num P2P LL buffers for this communicator
+  comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
+
+  if (comm->rank == ncclParamGraphDumpFileRank()) {
+    struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph };
+    NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
+  }
+
+  // Because timers[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather,
+  // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer.
+  timers[TIMER_INIT_CONNECT] = clockNano();
+  // AllGather3 - begin
+  NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
+
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+    allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
+    allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
+    allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
+    allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
+    allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
+    allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
+    allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
+    allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
+  }
+
+  allGather3Data[rank].cpuArch = comm->cpuArch;
+  allGather3Data[rank].cpuVendor = comm->cpuVendor;
+
+  comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+  NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
+
+  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
+
+  // Determine nNodes, firstRanks, ...
+  NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodesFirstRank, comm->nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodesTreePatterns, comm->nRanks), ret, fail);
+  INFO(NCCL_INIT,"nodes llllyyyy%d", comm->nNodes);
+  for (int r=0; r<nranks; r++) {
+    int node;
+    int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
+    for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
+    if (node == comm->nNodes) {
+      comm->nNodes++;
+      nodesFirstRank[node] = firstRank;
+      comm->nodesFirstRank[node] = nodesFirstRank[node];
+      // Record tree pattern of each node as they can be different depending on sm arch
+      nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
+      comm->nodesTreePatterns[node] = nodesTreePatterns[node];
+    }
+    comm->rankToNode[r] = node;
+
+    if (comm->cpuArch != allGather3Data[r].cpuArch &&
+        comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
+      comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
+    }
+    if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
+        comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
+      comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
+    }
+  }
+
+  // Alert the user to the presence of mixed CPUs. In the past this has caused
+  // locks in some collective routines. This may help debug issues in the future.
+  if (rank==0) {
+    if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
+      INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
+    }
+    if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
+      INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
+    }
+  }
+
+  // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
+  for (int r=0; r<comm->nRanks; r++) {
+    int node = comm->rankToNode[r];
+    comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
+    comm->nodeRanks[node].localRanks++;
+  }
+  // Allocate ranks arrays for each node
+  for (int n=0; n<comm->nNodes; n++) {
+    NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
+    comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
+    comm->nodeRanks[n].localRanks = 0;
+  }
+  // And fill the ranks arrays
+  for (int r=0; r<comm->nRanks; r++) {
+    int node = comm->rankToNode[r];
+    comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
+  }
+  comm->node = comm->rankToNode[rank];
+  comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
+  comm->localRank = comm->rankToLocalRank[rank];
+  comm->localRanks = comm->nodeRanks[comm->node].localRanks;
+
+  TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
+        rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+  if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
+    WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
+        rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+        comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+    ret = ncclInternalError;
+    goto fail;
+  }
+
+  INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
+      comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
+
+  nChannelsOrig = comm->nChannels;
+  NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->peerTopo, comm->nRanks), ret, fail);
+  for (int i=0; i<nranks; i++) {
+    allTopoRanks[i] = &allGather3Data[i].topoRanks;
+    comm->peerTopo[i] = allGather3Data[i].topoRanks;
+    // Make sure we align all ranks so that the tuning is consistent across ranks
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
+      graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
+      graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
+      graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
+      graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
+      graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
+      graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
+    }
+    comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
+  }
+  if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0;
+  if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
+
+  comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+  if (comm->nChannels < nChannelsOrig) {
+    // We started duplicating channels during Preset(), so we need to move the
+    // duplicated channels since we have removed some.
+    for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
+  }
+
+  // Determine CollNet support after all-gather now that we know nNodes and each node localRanks
+  if (comm->config.collnetEnable == 1) {
+    int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+    if (comm->nNodes < collNetNodeThreshold) {
+      INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
+      comm->config.collnetEnable = 0;
+    }
+  }
+  NCCLCHECKGOTO(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink), ret, fail);
+  comm->isOneRPN = (comm->maxLocalRanks == 1);
+
+  NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
+  // AllGather3 - end
+  timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT];
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
+
+  char line[1024];
+  line[0]='\0';
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclTree* tree = &comm->channels[c].tree;
+    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
+        c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
+    INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
+  }
+  line[1023] = '\0';
+  INFO(NCCL_INIT, "Trees%s", line);
+
+  NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
+
+  // Compute nChannels per peer for p2p
+  NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
+
+  /* until now, all info of comm should be known. We can initialize shared resources and
+  * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
+  * all proxy operations. */
+  if (comm->sharedRes->owner == comm) {
+    comm->sharedRes->tpNLocalRanks = comm->localRanks;
+    comm->sharedRes->magic = comm->magic;
+    comm->sharedRes->tpNChannels = comm->nChannels;
+    comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
+    memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
+  }
+  NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
+  for (int i = 0; i < comm->localRanks; ++i) {
+    int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
+    topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank];
+  }
+  comm->topParentLocalRanks = topParentLocalRanks;
+
+  // Profiler plugin context has to be initialized before proxy thread
+  NCCLCHECKGOTO(ncclProfilerPluginInit(comm), ret, fail);
+
+  NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
+  // Launch proxy service thread, after this, the proxy calls can be used.
+  if (parent && parent->shareResources) {
+    comm->proxyState = parent->sharedRes->proxyState;
+    ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
+  } else {
+    NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+  }
+  NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
+
+  timers[TIMER_INIT_CONNECT] = clockNano();
+  do { // Build p2p schedule
+    int node = comm->node;
+    int nNodes = comm->nNodes;
+    int nRanks = comm->nRanks;
+    int local = comm->localRank;
+    int nLocals = comm->maxLocalRanks;
+    struct ncclNodeRanks* nodeRanks = comm->nodeRanks;
+    bool flat = false;
+    for (int node = 0; node < nNodes; node++) {
+      if (nodeRanks[node].localRanks != nLocals) {
+        flat = true;
+        nNodes = 1; node = 0;
+        nLocals = nRanks; local = rank;
+        break;
+      }
+    }
+    int nNodesPow2 = pow2Up(nNodes);
+    int nLocalsPow2 = pow2Up(nLocals);
+    comm->p2pSchedule = ncclMemoryStackAlloc<ncclComm::P2pSchedulePair>(&comm->memPermanent, nRanks);
+    comm->planner.peers = ncclMemoryStackAlloc<ncclKernelPlanner::Peer>(&comm->memPermanent, nRanks);
+    uint32_t nodeRound = 0;
+    uint32_t nodeDelta = 0;
+    int round = 0;
+    // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N.
+    // Since that formula only produces valid permutations when N is a pow of 2,
+    // we let N = pow2Up(n) and filter out results greater-eq to n.
+    // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8
+    do {
+      if (nodeDelta < nNodes) { // Filter nonsensical node deltas
+        int sendNode = (node + nodeDelta) % nNodes;
+        int recvNode = (node - nodeDelta + nNodes) % nNodes;
+        uint32_t localRound = 0;
+        uint32_t localDelta = 0;
+        do {
+          if (localDelta < nLocals) { // Filter nonsensical node-local deltas
+            int sendLocal = (local + localDelta) % nLocals;
+            int recvLocal = (local - localDelta + nLocals) % nLocals;
+            comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal];
+            comm->p2pSchedule[round].recvRank = flat ? recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal];
+            round += 1;
+          }
+          localRound += 1;
+          localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update
+        } while (localRound != nLocalsPow2);
+      }
+      nodeRound += 1;
+      nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update
+    } while (nodeRound != nNodesPow2);
+
+    if (round != nRanks) {
+      WARN("P2p schedule creation has bugs.");
+      ret = ncclInternalError;
+      goto fail;
+    }
+  } while (0);
+
+  // NOTE(review): runtime connect is forced on; it was comm->cuMemSupport && ncclParamRuntimeConnect() before.
+  comm->runtimeConn = 1;
+  if (comm->runtimeConn) {
+    for (int c=0; c<comm->nChannels; c++) {
+      NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+    }
+    // Attempt to setup NVLS, may silently fail and disable NVLS
+    NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+    // Check if we can setup CollNet
+    if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs);
+  } else {
+    for (int c=0; c<comm->nChannels; c++) {
+      NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+    }
+    NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
+
+    // Connect Trees
+    NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
+
+    // Connect PAT only for communicators with 1 GPU per node
+    if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
+
+    // Attempt to setup NVLS, may silently fail and disable NVLS
+    NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+    NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
+
+    // And NVLS trees if needed
+    NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
+
+    // Check if we can setup CollNet
+    if (comm->config.collnetEnable) {
+      ncclCollNetSetup(comm, parent, graphs);
+      NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
+      if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
+        NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
+      }
+    }
+
+    // Connect to local net proxy
+    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
+    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+
+    // Then to remote ones when using PXN
+    if (ncclPxnDisable(comm) == 0) {
+      int nranks;
+      NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
+      for (int r=0; r<nranks; r++) {
+        NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
+        NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+      }
+    }
+
+    if (ncclParamNvbPreconnect()) {
+      // Connect p2p when using NVB path
+      int nvbNpeers;
+      NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
+      for (int r=0; r<nvbNpeers; r++) {
+        int peer = nvbPeers[r];
+        int sendRound=0, recvRound=0;
+        while (comm->p2pSchedule[sendRound].sendRank != peer) sendRound++;
+        while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++;
+        uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound);
+        uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound);
+        for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+          int channelId;
+          channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c);
+          if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
+            comm->connectSend[peer] |= (1UL<<channelId);
+          }
+          channelId = ncclP2pChannelForPart(comm->p2pnChannels, recvBase, c);
+          if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
+            comm->connectRecv[peer] |= (1UL<<channelId);
+          }
+        }
+      }
+
+      NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
+    }
+  }
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
+
+  // Compute time models for algorithm and protocol combinations
+  NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
+
+  INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+
+  if (comm->intraRank == 0) { // Load ncclParamLaunchMode
+    const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
+    enum ncclLaunchMode mode, modeOld;
+    if (str && strcasecmp(str, "GROUP") == 0) {
+      mode = ncclLaunchModeGroup;
+    } else {
+      mode = ncclLaunchModeParallel;
+    }
+    // In theory we could be racing with other communicators not associated with
+    // this one if the user is connecting to multiple ncclUniqueId's concurrently.
+    modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
+    if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
+      INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
+    }
+  }
+
+  comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
+  comm->baseStride = 0;
+
+  // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
+  // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
+  NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
+
+  timers[TIMER_INIT_CONNECT] = clockNano() -  timers[TIMER_INIT_CONNECT];
+  /* Local intra-node barrier */
+  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
+
+  // We should have allocated all buffers, collective fifos, ... we can
+  // restore the affinity.
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+
+exit:
+  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
+  * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
+  * properly cleaned up. */
+  if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
+  free(allTopoRanks);
+  free(nodesTreePatterns);
+  free(nodesFirstRank);
+  free(allGather3Data);
+  free(rings);
+  free(nvbPeers);
+  free(pxnPeers);
+  return ret;
+fail:
+  goto exit;
+}
+
+// Rebuilds the topology, graphs, channels and p2p schedule of an existing communicator.
+//
+// The visible steps are:
+//  1. AllGather1: grow comm->peerInfo to nranks+1 entries, exchange it over the bootstrap
+//     network, then reject mismatched NCCL versions and duplicate GPUs.
+//  2. Run the MNNVL check when enabled and work out the intra-process ranks.
+//  3. Detect the system topology, compute paths and pin this thread to comm->cpuAffinity.
+//  4. Search the ring, tree, CollNet (chain and direct) and NVLS graphs.
+//  5. AllGather3: exchange graph info and topo ranks, derive the node and local-rank tables
+//     and align the graph parameters across ranks.
+//  6. Call ncclTopoPostset, compute buffer sizes and p2p channel counts, and fill the shared
+//     resource bookkeeping.
+//  7. Build the p2p schedule, set up channels, NVLS and CollNet, tune the model, set up the
+//     device communicator and wait on the intra-node barrier.
+//
+// This routine does not call fillInfo for the local rank, does not trim the topology and does
+// not create a proxy: those calls are commented out below. comm->runtimeConn is forced to 1,
+// so the eager ring/tree/PAT/proxy connection branch is currently unreachable.
+//
+// comm    Communicator to update in place.
+// parent  Forwarded to ncclTopoPostset, ncclNvlsSetup and ncclCollNetSetup.
+// timers  The TIMER_INIT_ALLGATHER, TIMER_INIT_TOPO, TIMER_INIT_GRAPHS and TIMER_INIT_CONNECT
+//         entries are overwritten with the elapsed time of the matching phase, measured with
+//         clockNano().
+//
+// Returns ncclSuccess, or the first error reported by a callee; ncclInvalidUsage for a version
+// mismatch or a duplicate GPU; ncclInternalError when the intra-process ranks, the local ranks
+// or the p2p schedule cannot be determined.
+//
+// Every fallible call must jump to the fail label (NCCLCHECKGOTO) rather than return directly:
+// the exit label frees the temporary buffers and restores the CPU affinity that was saved
+// before the thread was pinned. The proxy ops pool is not unlinked by this routine (see the
+// disabled ncclProxyShmUnlink call at the exit label).
+ncclResult_t updateTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
+  // We use 2 AllGathers
+  // 1. { peerInfo, comm, compCap}
+  // 2. { nChannels, graphInfo, topoRanks }
+  ncclResult_t ret = ncclSuccess;
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  int nNodes = 1;
+  cpu_set_t affinitySave;
+  bool affinitySaved = false; // True once affinitySave holds a mask that must be restored at exit.
+  struct ncclTopoGraph* ringGraph = &comm->graphs[NCCL_ALGO_RING];
+  struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE];
+  struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
+  struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
+  struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
+  struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
+
+  struct graphInfo {
+    int pattern;
+    int nChannels;
+    int sameChannels;
+    float bwIntra;
+    float bwInter;
+    int typeIntra;
+    int typeInter;
+    int crossNic;
+  };
+
+  struct allGatherInfo {
+    struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
+    struct ncclTopoRanks topoRanks;
+    int cpuArch;
+    int cpuVendor;
+    int localRanks;
+  };
+
+  int nChannelsOrig;
+  struct allGatherInfo *allGather3Data = NULL;
+  struct ncclTopoRanks** allTopoRanks = NULL;
+  int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
+  int *rings = NULL;
+  int* nvbPeers = NULL;
+  struct ncclProxyConnector proxyConn;
+  int* pxnPeers = NULL;
+  int *topParentLocalRanks = NULL;
+  int p2pLevel = -1;
+  comm->initAlgoChannels[NCCL_ALGO_RING] = false;
+
+  timers[TIMER_INIT_ALLGATHER] = clockNano();
+  // AllGather1 - begin
+  NCCLCHECKGOTO(ncclRealloc(&comm->peerInfo,nranks, nranks+1), ret, fail); // Extra rank to represent CollNet root
+  // NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
+
+  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
+  __atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE);
+
+  comm->cuMemSupport = 1;
+  for (int i = 0; i < nranks; i++) {
+    if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
+      WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
+          i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
+      ret = ncclInvalidUsage;
+      goto fail;
+    }
+    if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
+    if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
+    if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+      ret = ncclInvalidUsage;
+      goto fail;
+    }
+  }
+  // AllGather1 - end
+  timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
+
+  // Check for MNNVL support
+  NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail);
+  if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) {
+    NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
+  }
+
+  do {
+    // Compute intra-process ranks
+    int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+    for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
+    for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
+
+    comm->nvlsRegSupport = 1;
+    for (int i = 0; i < nranks; i++) {
+      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+          && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+        // Rank is in same process
+        if (intraProcRanks == 0) intraProcRank0 = i;
+        if (i == rank) intraProcRank = intraProcRanks;
+        intraProcRanks++;
+        if (intraProcRank0 == rank && rank != i) {
+          comm->peerInfo[i].comm->intraNext = comm->intraNext;
+          comm->intraNext = comm->peerInfo[i].comm;
+        }
+      }
+
+      if (comm->nvlsRegSupport) {
+        for (int j = i + 1; j < nranks; j++) {
+          if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash &&
+            comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) {
+            comm->nvlsRegSupport = 0;
+            break;
+          }
+        }
+      }
+    }
+
+    // Buffer Registration is not supported with MNNVL
+    if (comm->MNNVL) comm->nvlsRegSupport = 0;
+
+    TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+        rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+    if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+      WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+          rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+          intraProcRank, intraProcRanks, intraProcRank0);
+      ret = ncclInternalError;
+      goto fail;
+    }
+    struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
+    assert(intraProcRank==0 ? comm==comm0 : true);
+    comm->intraComm0 = comm0;
+    comm->intraRank = intraProcRank;
+    comm->intraRanks = intraProcRanks;
+    comm->intraBarrierPhase = 0;
+    comm->intraBarrierCounter = 0;
+    comm->intraBarrierGate = 0;
+  } while(0);
+
+  timers[TIMER_INIT_TOPO] = clockNano();
+  // Dump XML if requested by user
+  const char* dumpXmlFile;
+  dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
+  if (dumpXmlFile) {
+    NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
+  }
+  // NOTE(review): the previous topParentRanks pointer is dropped here without being freed, so
+  // the check below is always true; confirm the caller released the old array.
+  comm->topParentRanks = NULL;
+  if (comm->topParentRanks == NULL) {
+    NCCLCHECKGOTO(ncclCalloc(&comm->topParentRanks, comm->nRanks), ret, fail);
+    for (int i = 0; i < comm->nRanks; ++i)
+      comm->topParentRanks[i] = i;
+  }
+
+  // Topo detection / System graph creation
+  NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
+  // Compute paths between GPUs and NICs
+  NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
+  // Remove inaccessible GPUs and unused NICs
+  // NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail);
+  // Recompute paths (trimming above is disabled, so this is a second identical call)
+  NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
+  // Init search
+  NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail);
+  // Decide on comm's CPU architecture.
+  NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail);
+  // Print final topology
+  NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
+  timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO];
+
+  // Set Affinity to a CPU local to our GPU, so that all memory we allocate
+  // on the host is local.
+  NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail);
+  if (CPU_COUNT(&comm->cpuAffinity)) {
+    affinitySaved = (sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave) == 0);
+    sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  }
+
+  // Determine local CollNet support
+  if (!collNetSupport(comm)) {
+    comm->config.collnetEnable = 0;
+  }
+
+  // Determine local Nvls support
+  NCCLCHECKGOTO(ncclNvlsInit(comm), ret, fail);
+
+  timers[TIMER_INIT_GRAPHS] = clockNano();
+  // Get rings and trees
+  memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
+  ringGraph->id = 0;
+  ringGraph->pattern = NCCL_TOPO_PATTERN_RING;
+  ringGraph->minChannels = 1;
+  ringGraph->maxChannels = MAXCHANNELS/2;
+  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail);
+
+  memset(treeGraph, 0, sizeof(struct ncclTopoGraph));
+  treeGraph->id = 1;
+  treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
+  treeGraph->minChannels = ringGraph->nChannels;
+  treeGraph->maxChannels = ringGraph->nChannels;
+  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail);
+
+  memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph));
+  collNetChainGraph->id = 2;
+  collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE;
+  collNetChainGraph->collNet = 1;
+  collNetChainGraph->minChannels = ringGraph->nChannels;
+  collNetChainGraph->maxChannels = ringGraph->nChannels;
+
+  memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
+  collNetDirectGraph->id = 4;
+  collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
+  collNetDirectGraph->collNet = 1;
+  collNetDirectGraph->minChannels = 1;
+  collNetDirectGraph->maxChannels = MAXCHANNELS;
+  if (comm->config.collnetEnable) {
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail);
+  }
+
+  memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph));
+  nvlsGraph->id = 3;
+  nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS;
+  nvlsGraph->minChannels = 1;
+  nvlsGraph->maxChannels = MAXCHANNELS;
+  if (comm->nvlsSupport) {
+    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
+    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
+  }
+  timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];
+
+  // Initialize num P2P LL buffers for this communicator
+  comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
+
+  if (comm->rank == ncclParamGraphDumpFileRank()) {
+    struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph };
+    NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
+  }
+
+  // Because timers[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather,
+  // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer.
+  timers[TIMER_INIT_CONNECT] = clockNano();
+  // AllGather3 - begin
+  NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
+
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+    allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
+    allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
+    allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
+    allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
+    allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
+    allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
+    allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
+    allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
+  }
+
+  allGather3Data[rank].cpuArch = comm->cpuArch;
+  allGather3Data[rank].cpuVendor = comm->cpuVendor;
+
+  comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+  NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
+
+  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
+
+  // Determine nNodes, firstRanks, ...
+  NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodesFirstRank, comm->nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodesTreePatterns, comm->nRanks), ret, fail);
+  // Logs the node count held before it is reset and recomputed just below.
+  INFO(NCCL_INIT,"nNODES llllyyyy%d", comm->nNodes);
+  comm->nNodes = 0;
+  for (int r=0; r<nranks; r++) {
+    int node;
+    int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
+    for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
+    if (node == comm->nNodes) {
+      comm->nNodes++;
+      nodesFirstRank[node] = firstRank;
+      comm->nodesFirstRank[node] = nodesFirstRank[node];
+      // Record tree pattern of each node as they can be different depending on sm arch
+      nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
+      comm->nodesTreePatterns[node] = nodesTreePatterns[node];
+    }
+    comm->rankToNode[r] = node;
+
+    if (comm->cpuArch != allGather3Data[r].cpuArch &&
+        comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
+      comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
+    }
+    if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
+        comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
+      comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
+    }
+  }
+
+  // Alert the user to the presence of mixed CPUs. In the past this has caused
+  // locks in some collective routines. This may help debug issues in the future.
+  if (rank==0) {
+    if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
+      INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
+    }
+    if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
+      INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
+    }
+  }
+
+  // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
+  for (int r=0; r<comm->nRanks; r++) {
+    int node = comm->rankToNode[r];
+    comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
+    comm->nodeRanks[node].localRanks++;
+  }
+  // Allocate ranks arrays for each node
+  for (int n=0; n<comm->nNodes; n++) {
+    NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
+    comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
+    comm->nodeRanks[n].localRanks = 0;
+  }
+  // And fill the ranks arrays
+  for (int r=0; r<comm->nRanks; r++) {
+    int node = comm->rankToNode[r];
+    comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
+  }
+  comm->node = comm->rankToNode[rank];
+  comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
+  comm->localRank = comm->rankToLocalRank[rank];
+  comm->localRanks = comm->nodeRanks[comm->node].localRanks;
+
+  TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
+        rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+  if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
+    WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
+        rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+        comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+    ret = ncclInternalError;
+    goto fail;
+  }
+
+  INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
+      comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
+
+  nChannelsOrig = comm->nChannels;
+  NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->peerTopo, comm->nRanks), ret, fail);
+  for (int i=0; i<nranks; i++) {
+    allTopoRanks[i] = &allGather3Data[i].topoRanks;
+    comm->peerTopo[i] = allGather3Data[i].topoRanks;
+    // Make sure we align all ranks so that the tuning is consistent across ranks
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
+      graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
+      graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
+      graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
+      graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
+      graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
+      graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
+    }
+    comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
+  }
+  if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0;
+  if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
+
+  comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+  if (comm->nChannels < nChannelsOrig) {
+    // We started duplicating channels during Preset(), so we need to move the
+    // duplicated channels since we have removed some.
+    for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
+  }
+
+  // Determine CollNet support after all-gather now that we know nNodes and each node localRanks
+  if (comm->config.collnetEnable == 1) {
+    int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+    if (comm->nNodes < collNetNodeThreshold) {
+      INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
+      comm->config.collnetEnable = 0;
+    }
+  }
+  NCCLCHECKGOTO(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink), ret, fail);
+  comm->isOneRPN = (comm->maxLocalRanks == 1);
+
+  NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
+  // AllGather3 - end
+  timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT];
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
+
+  char line[1024];
+  line[0]='\0';
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclTree* tree = &comm->channels[c].tree;
+    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
+        c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
+    INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
+  }
+  line[1023] = '\0';
+  INFO(NCCL_INIT, "Trees%s", line);
+
+  NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
+
+  // Compute nChannels per peer for p2p
+  NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
+
+  /* until now, all info of comm should be known. We can initialize shared resources and
+  * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
+  * all proxy operations. */
+  if (comm->sharedRes->owner == comm) {
+    comm->sharedRes->tpNLocalRanks = comm->localRanks;
+    comm->sharedRes->magic = comm->magic;
+    comm->sharedRes->tpNChannels = comm->nChannels;
+    comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
+    // NOTE(review): assumes tpRankToLocalRank has room for comm->nRanks entries; its allocation
+    // is not visible here, so confirm it is resized if the rank count can grow.
+    memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
+  }
+  NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
+  for (int i = 0; i < comm->localRanks; ++i) {
+    int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
+    topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank];
+  }
+  comm->topParentLocalRanks = topParentLocalRanks;
+
+  // Profiler plugin context has to be initialized before proxy thread
+  NCCLCHECKGOTO(ncclProfilerPluginInit(comm), ret, fail);
+
+  NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
+  // Launch proxy service thread, after this, the proxy calls can be used.
+  // if (parent && parent->shareResources) {
+  //   comm->proxyState = parent->sharedRes->proxyState;
+  //   ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
+  // } else {
+  //   NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+  // }
+  NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
+
+  timers[TIMER_INIT_CONNECT] = clockNano();
+  do { // Build p2p schedule
+    int node = comm->node;
+    int nNodes = comm->nNodes;
+    int nRanks = comm->nRanks;
+    int local = comm->localRank;
+    int nLocals = comm->maxLocalRanks;
+    struct ncclNodeRanks* nodeRanks = comm->nodeRanks;
+    bool flat = false;
+    for (int node = 0; node < nNodes; node++) {
+      if (nodeRanks[node].localRanks != nLocals) {
+        flat = true;
+        nNodes = 1; node = 0;
+        nLocals = nRanks; local = rank;
+        break;
+      }
+    }
+    int nNodesPow2 = pow2Up(nNodes);
+    int nLocalsPow2 = pow2Up(nLocals);
+    comm->p2pSchedule = ncclMemoryStackAlloc<ncclComm::P2pSchedulePair>(&comm->memPermanent, nRanks);
+    comm->planner.peers = ncclMemoryStackAlloc<ncclKernelPlanner::Peer>(&comm->memPermanent, nRanks);
+    uint32_t nodeRound = 0;
+    uint32_t nodeDelta = 0;
+    int round = 0;
+    // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N.
+    // Since that formula only produces valid permutations when N is a pow of 2,
+    // we let N = pow2Up(n) and filter out results greater-eq to n.
+    // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8
+    do {
+      if (nodeDelta < nNodes) { // Filter nonsensical node deltas
+        int sendNode = (node + nodeDelta) % nNodes;
+        int recvNode = (node - nodeDelta + nNodes) % nNodes;
+        uint32_t localRound = 0;
+        uint32_t localDelta = 0;
+        do {
+          if (localDelta < nLocals) { // Filter nonsensical node-local deltas
+            int sendLocal = (local + localDelta) % nLocals;
+            int recvLocal = (local - localDelta + nLocals) % nLocals;
+            comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal];
+            comm->p2pSchedule[round].recvRank = flat ? recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal];
+            round += 1;
+          }
+          localRound += 1;
+          localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update
+        } while (localRound != nLocalsPow2);
+      }
+      nodeRound += 1;
+      nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update
+    } while (nodeRound != nNodesPow2);
+
+    if (round != nRanks) {
+      WARN("P2p schedule creation has bugs.");
+      ret = ncclInternalError;
+      goto fail;
+    }
+  } while (0);
+
+  // Runtime connect is forced on; the cuMemSupport && ncclParamRuntimeConnect() test is disabled.
+  comm->runtimeConn = 1;
+  if (comm->runtimeConn) {
+    // for (int c=0; c<comm->nChannels; c++) {
+    //   NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+    // }
+    for (int c = 0; c < comm->nChannels; c++) {
+      // (&comm->channels[c])->peers = NULL;
+      NCCLCHECKGOTO(setupChannelNew(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+    }
+    // Attempt to setup NVLS, may silently fail and disable NVLS
+    NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+    // Check if we can setup CollNet
+    if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs);
+  } else {
+    for (int c=0; c<comm->nChannels; c++) {
+      NCCLCHECKGOTO(setupChannelNew(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+    }
+    NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
+
+    // Connect Trees
+    NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
+
+    // Connect PAT only for communicators with 1 GPU per node
+    if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
+
+    // Attempt to setup NVLS, may silently fail and disable NVLS
+    NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+    NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
+
+    // And NVLS trees if needed
+    NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
+
+    // Check if we can setup CollNet
+    if (comm->config.collnetEnable) {
+      ncclCollNetSetup(comm, parent, graphs);
+      NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
+      if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
+        NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
+      }
+    }
+
+    // Connect to local net proxy
+    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
+    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+
+    // Then to remote ones when using PXN
+    if (ncclPxnDisable(comm) == 0) {
+      int nranks;
+      NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
+      for (int r=0; r<nranks; r++) {
+        NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
+        NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+      }
+    }
+
+    if (ncclParamNvbPreconnect()) {
+      // Connect p2p when using NVB path
+      int nvbNpeers;
+      NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
+      for (int r=0; r<nvbNpeers; r++) {
+        int peer = nvbPeers[r];
+        int sendRound=0, recvRound=0;
+        while (comm->p2pSchedule[sendRound].sendRank != peer) sendRound++;
+        while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++;
+        uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound);
+        uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound);
+        for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+          int channelId;
+          channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c);
+          if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
+            comm->connectSend[peer] |= (1UL<<channelId);
+          }
+          channelId = ncclP2pChannelForPart(comm->p2pnChannels, recvBase, c);
+          if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
+            comm->connectRecv[peer] |= (1UL<<channelId);
+          }
+        }
+      }
+
+      NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
+    }
+  }
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
+
+  // Compute time models for algorithm and protocol combinations
+  NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
+
+  INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+
+  if (comm->intraRank == 0) { // Load ncclParamLaunchMode
+    const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
+    enum ncclLaunchMode mode, modeOld;
+    if (str && strcasecmp(str, "GROUP") == 0) {
+      mode = ncclLaunchModeGroup;
+    } else {
+      mode = ncclLaunchModeParallel;
+    }
+    // In theory we could be racing with other communicators not associated with
+    // this one if the user is connecting to multiple ncclUniqueId's concurrently.
+    modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
+    if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
+      INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
+    }
+  }
+
+  comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
+  comm->baseStride = 0;
+
+  // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
+  // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
+  NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
+
+  timers[TIMER_INIT_CONNECT] = clockNano() -  timers[TIMER_INIT_CONNECT];
+  /* Local intra-node barrier */
+  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
+
+  // We should have allocated all buffers, collective fifos, ... we can
+  // restore the affinity.
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+
+exit:
+  if (affinitySaved) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  // /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
+  // * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
+  // * properly cleaned up. */
+  // if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
+  free(allTopoRanks);
+  free(nodesTreePatterns);
+  free(nodesFirstRank);
+  free(allGather3Data);
+  free(rings);
+  free(nvbPeers);
+  free(pxnPeers);
+  return ret;
+fail:
+  goto exit;
+}
+
+ncclResult_t initTransportsNewRank(struct ncclComm* comm, const struct ncclCommTrans* peerComm) {
   ncclResult_t ret = ncclSuccess;
   int rank = comm->rank;
   int nranks = comm->nRanks;
@@ -702,32 +2244,36 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     struct ncclTopoRanks topoRanks;
     int cpuArch;
     int cpuVendor;
-    int localRanks;
   };
 
   int nChannelsOrig;
-  struct allGatherInfo *allGather3Data = NULL;
-  struct ncclTopoRanks** allTopoRanks = NULL;
+  // struct allGatherInfo *allGather3Data = NULL;
+  struct ncclTopoRanks **allTopoRanks = NULL;
   int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
   int *rings = NULL;
-  int* nvbPeers = NULL;
-  struct ncclProxyConnector proxyConn;
-  int* pxnPeers = NULL;
+  // int *nvbPeers = NULL;
+  // struct ncclProxyConnector proxyConn;
+  // int *pxnPeers = NULL;
   int *topParentLocalRanks = NULL;
+  bool *nodeVis = nullptr, *firstRankVis = nullptr;
   int p2pLevel = -1;
 
-  timers[TIMER_INIT_ALLGATHER] = clockNano();
-  // AllGather1 - begin
-  NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
-  NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
-  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
-  __atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE);
-
+  // Original AllGather1 - begin
+  INFO(NCCL_INIT, "all rank %d in comm %p", nranks, comm);
+  INFO(NCCL_INIT,"initTransportsNewRank peerComm->nodesTreePatterns: %p",peerComm->nodesTreePatterns);
+  NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks + 1), ret, fail); // Extra rank to represent CollNet root
+  memcpy(comm->peerInfo, peerComm->peerInfo, (nranks - 1) * sizeof(*comm->peerInfo));
+  NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo + rank, comm->commHash), ret, fail);
+  INFO(NCCL_INIT,"peerComm->nodesTreePatterns111: %p",peerComm->nodesTreePatterns);
+  //memcpy(comm->peerInfo + nranks, peerComm->peerInfo + nranks - 1, sizeof(*comm->peerInfo));
+  //NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
+  comm->commHash = peerComm->commHash;
   comm->cuMemSupport = 1;
+  // comm->nRanks = peerComm->nRanks+1;
   for (int i = 0; i < nranks; i++) {
     if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
       WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
-           i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
+          i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
       ret = ncclInvalidUsage;
       goto fail;
     }
@@ -739,10 +2285,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       goto fail;
     }
   }
-  // AllGather1 - end
-  timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
+  // Original AllGather1 - end
 
-  // Check for MNNVL support
+  // MNNVL support
   NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail);
   if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) {
     NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
@@ -799,19 +2344,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     comm->intraBarrierPhase = 0;
     comm->intraBarrierCounter = 0;
     comm->intraBarrierGate = 0;
-  } while(0);
-
-  timers[TIMER_INIT_TOPO] = clockNano();
-
-  // Dump XML if requested by user
-  const char* dumpXmlFile;
-  dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
-  if (dumpXmlFile) {
-    NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
-  }
-
+  } while (0);//next win
   // Topo detection / System graph creation
+
+  //NCCLCHECKGOTO(ncclTopoGetSystem(peerComm, comm, &comm->topo), ret, fail);
+  //NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
   NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
+  //NCCLCHECKGOTO(ncclTopoGetSystemForNew(comm, &comm->topo), ret, fail);
   // Compute paths between GPUs and NICs
   NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
   // Remove inaccessible GPUs and unused NICs
@@ -824,7 +2363,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail);
   // Print final topology
   NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
-  timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO];
 
   // Set Affinity to a CPU local the our GPU, so that all memory we allocate
   // on the host is local.
@@ -838,11 +2376,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   if (!collNetSupport(comm)) {
     comm->config.collnetEnable = 0;
   }
+      INFO(NCCL_INIT,"peerComm->nodesTreePatterns222: %p",peerComm->nodesTreePatterns);
 
   // Determine local Nvls support
   NCCLCHECK(ncclNvlsInit(comm));
 
-  timers[TIMER_INIT_GRAPHS] = clockNano();
   // Get rings and trees
   memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
   ringGraph->id = 0;
@@ -889,7 +2427,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
     NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
   }
-  timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];
 
   // Initialize num P2P LL buffers for this communicator
   comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
@@ -899,66 +2436,96 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
   }
 
-  // Because timers[[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather,
-  // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer.
-  timers[TIMER_INIT_CONNECT] = clockNano();
-  // AllGather3 - begin
-  NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
-
-  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-    allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
-    allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
-    allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
-    allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
-    allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
-    allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
-    allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
-    allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
-  }
-
-  allGather3Data[rank].cpuArch = comm->cpuArch;
-  allGather3Data[rank].cpuVendor = comm->cpuVendor;
+  // Original AllGather3 - begin
 
+  struct ncclTopoRanks myTopoRanks;
+  struct ncclTopoRanks *peerTopoRanks;
   comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
-  NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
-
-  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &myTopoRanks), ret, fail);
 
-  // Determine nNodes, firstRanks, ...
+  // Determine nNodes, nodesFirstRank, rankToNode, cpuArch, cpuVendor
+  // TODO: determine nodesTreePatterns
   NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
   NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodesFirstRank, nranks), ret, fail);
   NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
-  for (int r=0; r<nranks; r++) {
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodesTreePatterns, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->peerTopo, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&nodeVis, comm->nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&firstRankVis, comm->nRanks), ret, fail);
+  if (comm->cpuArch != peerComm->cpuArch &&
+      comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
+    // If peerComm specifies a cpuArch, all ranks must share the same cpuArch;
+    // otherwise, the communicator has a mixed cpuArch.
+    comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
+  }
+  if (comm->cpuVendor != peerComm->cpuVendor &&
+      comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
+    // Same logic as for cpuArch above.
+    comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
+  }
+      INFO(NCCL_INIT,"peerComm->nodesTreePatterns333: %p",peerComm->nodesTreePatterns);
+  // comm->nNodes = peerComm->nNodes;
+  // for (int r = 0; r < nranks; r++) {
+  //   if (r == nranks - 1) {
+  //     int firstRank = myTopoRanks.ringRecv[0];
+  //     if (!firstRankVis[rank]) {
+  //       nodesFirstRank[comm->nNodes++] = firstRank;
+  //     }
+  //   } else {
+  //     int node = peerComm->rankToNode[r];
+  //     comm->rankToNode[r] = node;
+  //     if (!nodeVis[node]) {
+  //       nodesFirstRank[node] = r;
+  //       nodeVis[node] = true;
+  //       firstRankVis[rank] = true;
+  //     }
+  //   }
+  // }
+  INFO(NCCL_INIT,"nNodes:%d,comm->nNodes:%d",nNodes,comm->nNodes);
+  comm->nNodes = peerComm->nNodes;
+  memcpy(nodesFirstRank, peerComm->nodesFirstRank, peerComm->nRanks * sizeof(int));//postset关键数据结构
+  INFO(NCCL_INIT,"peerComm->nodesTreePatterns444: %p",peerComm->nodesTreePatterns);
+  memcpy(comm->nodesFirstRank, nodesFirstRank, peerComm->nRanks * sizeof(int));//postset关键数据结构
+  INFO(NCCL_INIT,"peerComm->nodesTreePatterns666: %p",peerComm->nodesTreePatterns);
+  memcpy(nodesTreePatterns, peerComm->nodesTreePatterns, peerComm->nRanks * sizeof(int));//postset关键数据结构
+  memcpy(comm->nodesTreePatterns, nodesTreePatterns, peerComm->nRanks * sizeof(int));//postset关键数据结构
+  for (int r = 0; r < nranks; r++) {
     int node;
-    int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
-    for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
-    if (node == comm->nNodes) {
-      comm->nNodes++;
-      nodesFirstRank[node] = firstRank;
-      // Record tree pattern of each node as they can be different depending on sm arch
-      nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
+    // The new rank (e.g. rank 5) needs to be handled separately
+    if (r == comm->rank) {
+        // The new rank's topoRanks.ringRecv[0] should be itself or be determined by the topology
+        int firstRank = myTopoRanks.ringRecv[0];
+        
+        // Look for an existing node that already has this firstRank
+        for (node = 0; node < comm->nNodes && comm->nodesFirstRank[node] != firstRank; node++);
+        
+        // If this is a new node
+        if (node == comm->nNodes) {
+            // Add the new node's information
+            comm->nNodes++;
+            nodesFirstRank[node] = firstRank;
+            comm->nodesFirstRank[node] = firstRank;
+            INFO(NCCL_INIT,"nodesFirstRank node %d firstRank %d",node,firstRank);
+            nodesTreePatterns[node] = comm->graphs[NCCL_ALGO_TREE].pattern;
+            comm->nodesTreePatterns[node] = nodesTreePatterns[node];
+        }
+    } else {
+        // For existing ranks (ranks 0-4 coming from peerComm), use peerComm's mapping directly
+        node = peerComm->rankToNode[r];//就是复制24节点,一模一样//很关键的数据结构
     }
+    
     comm->rankToNode[r] = node;
-
-    if (comm->cpuArch != allGather3Data[r].cpuArch &&
-        comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
-      comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
-    }
-    if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
-        comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
-      comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
-    }
   }
+  //nodesTreePatterns[0] = 1;
 
   // Alert the user to the presence of mixed CPUs. In the past this has caused
   // locks in some collective routines. This may help debug issues in the future.
-  if (rank==0) {
-    if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
-      INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
-    }
-    if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
-      INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
-    }
+  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
+    INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
+  }
+  if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
+    INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
   }
 
   // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
@@ -989,35 +2556,82 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
         rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
   if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
     WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
-         rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
-         comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+        rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+        comm->localRank, comm->localRanks, comm->localRankToRank[0]);
     ret = ncclInternalError;
     goto fail;
   }
 
   INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
-       comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
+      comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
 
   nChannelsOrig = comm->nChannels;
   NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
-  for (int i=0; i<nranks; i++) {
-    allTopoRanks[i] = &allGather3Data[i].topoRanks;
-    // Make sure we align all ranks so that the tuning is consistent across ranks
-    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
-      graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
-      graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
-      graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
-      graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
-      graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
-      graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
-    }
-    comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
+  NCCLCHECKGOTO(ncclCalloc(&peerTopoRanks, comm->nRanks-1),ret,fail);
+  for (int i=0; i<comm->nRanks-1; i++) {
+   peerTopoRanks[i] = peerComm->peerTopo[i];
+   //peerTopoRanks[i] = myTopoRanks;
+   allTopoRanks[i] = &(peerTopoRanks[i]);//也是这个函数结束释放,所以不用拷贝
+   comm->peerTopo[i] = peerTopoRanks[i];
+  }
+
+  // for()
+  allTopoRanks[comm->nRanks-1] = &myTopoRanks;
+  comm->peerTopo[comm->nRanks-1] = myTopoRanks;
+  // for(int i=0; i<comm->nRanks; i++) {
+  //   comm->peerTopo[i] = *allTopoRanks[i];
+  // }
+  for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
+    INFO(NCCL_INIT,"  before Algorithm %d: Pattern=%d, nChannels=%d, sameChannels=%d, "
+      "bwIntra=%.2f, bwInter=%.2f, typeIntra=%d, typeInter=%d, crossNic=%d\n",
+      a, 
+      graphs[a]->pattern, 
+      graphs[a]->nChannels, 
+      graphs[a]->sameChannels, 
+      graphs[a]->bwIntra, 
+      graphs[a]->bwInter, 
+      graphs[a]->typeIntra, 
+      graphs[a]->typeInter, 
+      graphs[a]->crossNic);
+    graphs[a]->nChannels = std::max(graphs[a]->nChannels, peerComm->graphs[a].nChannels); // only available in single node case
+    graphs[a]->sameChannels = std::max(graphs[a]->sameChannels, peerComm->graphs[a].sameChannels); // only available in single node case
+    graphs[a]->bwIntra = std::min(graphs[a]->bwIntra, peerComm->graphs[a].bwIntra);
+    graphs[a]->bwInter = std::min(graphs[a]->bwInter, peerComm->graphs[a].bwInter);
+    graphs[a]->typeIntra = std::max(graphs[a]->typeIntra, peerComm->graphs[a].typeIntra);
+    graphs[a]->typeInter = std::max(graphs[a]->typeInter, peerComm->graphs[a].typeInter);
+    graphs[a]->crossNic = std::max(graphs[a]->crossNic, peerComm->graphs[a].crossNic);
+    INFO(NCCL_INIT,"  Algorithm %d: Pattern=%d, nChannels=%d, sameChannels=%d, "
+      "bwIntra=%.2f, bwInter=%.2f, typeIntra=%d, typeInter=%d, crossNic=%d\n",
+      a, 
+      graphs[a]->pattern, 
+      graphs[a]->nChannels, 
+      graphs[a]->sameChannels, 
+      graphs[a]->bwIntra, 
+      graphs[a]->bwInter, 
+      graphs[a]->typeIntra, 
+      graphs[a]->typeInter, 
+      graphs[a]->crossNic);
   }
   if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0;
   if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
 
+  // Set allTopoRanks for single node case
+
+  comm->nChannels = 1;
+  // for (int r = 0; r < nranks; r++) {
+  //   NCCLCHECKGOTO(ncclCalloc(&allTopoRanks[r], 1), ret, fail);
+  //   for (int c = 0; c < comm->nChannels; c++) {
+  //       allTopoRanks[r]->ringRecv[c] = 0;
+  //       allTopoRanks[r]->ringSend[c] = nranks - 1;
+  //       allTopoRanks[r]->ringPrev[c] = r - 1;
+  //       allTopoRanks[r]->ringNext[c] = r + 1 < nranks ? r + 1 : -1;
+  //       allTopoRanks[r]->treeToChild0[c] = 1;
+  //       allTopoRanks[r]->treeToChild1[c] = 1;
+  //   }
+  // }
+
   comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+  comm->nChannels = 1;
   if (comm->nChannels < nChannelsOrig) {
     // We started duplicating channels during Preset(), so we need to move the
     // duplicated channels since we have removed some.
@@ -1031,14 +2645,26 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
       comm->config.collnetEnable = 0;
     }
+    // // As long as there is more than 1 rank on any node, we need to disable collnet reg
   }
   NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink));
   comm->isOneRPN = (comm->maxLocalRanks == 1);
 
-  NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
-  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
-  // AllGather3 - end
-  timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT];
+  NCCLCHECKGOTO(ncclCalloc(&rings, nranks* MAXCHANNELS), ret, fail);
+  //comm->nChannels = 4;
+  INFO(NCCL_INIT, "111Connecting rings %d", comm->nChannels);
+  //struct ncclChannel* channel1 = comm->channels + 0;
+  INFO(NCCL_INIT, "end Connecting channel Id  prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
+  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, nullptr), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->peerRings, comm->nRanks * MAXCHANNELS), ret, fail);
+  for (int r = 0; r < nranks; r++) {
+    for (int c = 0; c < MAXCHANNELS; c++) {
+        int src_idx = r * MAXCHANNELS + c;
+        int dst_idx = r * MAXCHANNELS + c;
+        comm->peerRings[dst_idx] = rings[src_idx];
+    }
+  }
+  // Original AllGather3 - end
 
   TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
 
@@ -1048,7 +2674,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     struct ncclTree* tree = &comm->channels[c].tree;
     snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
         c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
-    INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
+    INFO(NCCL_GRAPH, "Ring1 %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
   }
   line[1023] = '\0';
   INFO(NCCL_INIT, "Trees%s", line);
@@ -1057,10 +2683,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
 
   // Compute nChannels per peer for p2p
   NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
-
+  INFO(NCCL_INIT, "end Connecting channel Id  prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
   /* until now, all info of comm should be known. We can initialize shared resources and
-   * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
-   * all proxy operations. */
+  * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
+  * all proxy operations. */
   if (comm->sharedRes->owner == comm) {
     comm->sharedRes->tpNLocalRanks = comm->localRanks;
     comm->sharedRes->magic = comm->magic;
@@ -1068,6 +2694,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
     memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
   }
+  // INFO(NCCL_INIT, "end Connecting channel Id  prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
   NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
   for (int i = 0; i < comm->localRanks; ++i) {
     int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
@@ -1075,20 +2702,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   }
   comm->topParentLocalRanks = topParentLocalRanks;
 
-  // Profiler plugin context has to be initialized before proxy thread
-  NCCLCHECK(ncclProfilerPluginInit(comm));
-
   NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
   // Launch proxy service thread, after this, the proxy calls can be used.
-  if (parent && parent->shareResources) {
-    comm->proxyState = parent->sharedRes->proxyState;
-    ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
-  } else {
-    NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
-  }
+  NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
   NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
-
-  timers[TIMER_INIT_CONNECT] = clockNano();
+  INFO(NCCL_INIT, "end Connecting channel Id  prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
   do { // Build p2p schedule
     int node = comm->node;
     int nNodes = comm->nNodes;
@@ -1145,136 +2763,48 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     }
   } while (0);
 
-  comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect();
-  if (comm->runtimeConn) {
-    for (int c=0; c<comm->nChannels; c++) {
-      NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
-    }
-    // Attempt to setup NVLS, may silently fail and disable NVLS
-    NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
-    // Check if we can setup CollNet
-    if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs);
-  } else {
-    for (int c=0; c<comm->nChannels; c++) {
-      NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
-    }
-    NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
-
-    // Connect Trees
-    NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
-
-    // Connect PAT only for communicators with 1 GPU per node
-    if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
-
-    // Attempt to setup NVLS, may silently fail and disable NVLS
-    NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
-    NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
-
-    // And NVLS trees if needed
-    NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
-
-    // Check if we can setup CollNet
-    if (comm->config.collnetEnable) {
-      ncclCollNetSetup(comm, parent, graphs);
-      NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
-      if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
-        NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
-      }
-    }
-
-    // Connect to local net proxy
-    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
-    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
-
-    // Then to remote ones when using PXN
-    if (ncclPxnDisable(comm) == 0) {
-      int nranks;
-      NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
-      for (int r=0; r<nranks; r++) {
-        NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
-        NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
-      }
-    }
-
-    if (ncclParamNvbPreconnect()) {
-      // Connect p2p when using NVB path
-      int nvbNpeers;
-      NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
-      for (int r=0; r<nvbNpeers; r++) {
-        int peer = nvbPeers[r];
-        int sendRound=0, recvRound=0;
-        while (comm->p2pSchedule[sendRound].sendRank != peer) sendRound++;
-        while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++;
-        uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound);
-        uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound);
-        for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-          int channelId;
-          channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c);
-          if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
-            comm->connectSend[peer] |= (1UL<<channelId);
-          }
-          channelId = ncclP2pChannelForPart(comm->p2pnChannels, recvBase, c);
-          if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
-            comm->connectRecv[peer] |= (1UL<<channelId);
-          }
-        }
-      }
-
-      NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
-    }
-  }
-
-  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
-
-  // Compute time models for algorithm and protocol combinations
-  NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
-
-  INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
-
-  if (comm->intraRank == 0) { // Load ncclParamLaunchMode
-    const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
-    enum ncclLaunchMode mode, modeOld;
-    if (str && strcasecmp(str, "GROUP") == 0) {
-      mode = ncclLaunchModeGroup;
-    } else {
-      mode = ncclLaunchModeParallel;
-    }
-    // In theory we could be racing with other communicators not associated with
-    // this one if the user is connecting to multiple ncclUniqueId's concurrently.
-    modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
-    if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
-      INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
-    }
+  //comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect();
+  comm->runtimeConn =  1;
+  for (int c = 0; c < comm->nChannels; c++) {
+    //NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings + c * nranks), ret, fail);
+    //NCCLCHECKGOTO(setupChannel(peerComm, comm, c, rank, nranks, rings + c * nranks), ret, fail);
+    NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+  }
+  // // Setup NVLS
+  // NCCLCHECKGOTO(ncclNvlsSetup(comm, NULL), ret, fail);
+  // // Check if we can setup CollNet
+  // if (comm->collNetSupport > 0) ncclCollNetSetup(comm, NULL, graphs);
+  // Attempt to setup NVLS, may silently fail and disable NVLS
+  NCCLCHECKGOTO(ncclNvlsSetup(comm, NULL), ret, fail);
+  // Check if we can setup CollNet
+  if (comm->config.collnetEnable) ncclCollNetSetup(comm, NULL, graphs);
+  INFO(NCCL_INIT, "end Connecting channel Id  prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
+  // for (int c = 0; c < comm->nChannels; c++) {
+  //   struct ncclChannel* channel = comm->channels + c;
+  //   NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
+  // }
+
+  for (int i = 1; i < comm->nRanks; i++) {
+    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
+    int sendPeer = (comm->rank + i) % comm->nRanks;
+    uint64_t recvMask = comm->connectRecv[recvPeer];
+    uint64_t sendMask = comm->connectSend[sendPeer];
+    //INFO(NCCL_INIT,"十六进制(小写): 0x%" PRIx64 "\n", recvMask);
+    INFO(NCCL_INIT, "first i %d:两个十六进制值: 0x%" PRIx64 " 0x%" PRIx64 "\n", i, recvMask, sendMask);
   }
 
-  comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
-  comm->baseStride = 0;
-
-  // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
-  // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
-  NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
-
-  timers[TIMER_INIT_CONNECT] = clockNano() -  timers[TIMER_INIT_CONNECT];
-  /* Local intra-node barrier */
-  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
-
-  // We should have allocated all buffers, collective fifos, ... we can
-  // restore the affinity.
-  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
-
 exit:
   if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
   /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
-   * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
-   * properly cleaned up. */
-  if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
-  free(allTopoRanks);
-  free(nodesTreePatterns);
-  free(nodesFirstRank);
-  free(allGather3Data);
-  free(rings);
-  free(nvbPeers);
-  free(pxnPeers);
+  * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
+  * properly cleaned up. */
+  if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
+  if (allTopoRanks) free(allTopoRanks);
+  if (nodesTreePatterns) free(nodesTreePatterns);
+  if (nodesFirstRank) free(nodesFirstRank);
+  if (rings) free(rings);
+  if (nodeVis) free(nodeVis);
+  if (firstRankVis) free(firstRankVis);
   return ret;
 fail:
   goto exit;
@@ -1426,7 +2956,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     eatHash(hacc, &job->color);
     comm->commHash = digestHash(hacc);
     INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName,
-         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
+        comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
     timers[TIMER_INIT_BOOTSTRAP] = clockNano();
     NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
     timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
@@ -1439,7 +2969,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     // obtain a unique hash using the first commId
     comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
     INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
-         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
+        comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
     timers[TIMER_INIT_BOOTSTRAP] = clockNano();
     NCCLCHECKGOTO(bootstrapInit(job->nId, (struct ncclBootstrapHandle*)job->commId, comm), res, fail);
     timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
@@ -1462,23 +2992,23 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
     TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
     INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d - Init COMPLETE", job->funcName,
-         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
+        comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
   } else {
     // the name for the replay tool is ncclCommInitRank for all the variations
     TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
     INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", job->funcName,
-         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
+        comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
   }
   sum_timers = 0.0;
   for (int it = 1; it < TIMERS_INIT_COUNT; ++it)
     sum_timers += (timers[it] / 1e9);
   INFO(NCCL_INIT | NCCL_PROFILE,
-       "Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
-       "connections %.2f, rest %.2f)",
-       job->funcName, comm->rank, comm->nRanks,
-       timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
-       timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
-       timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
+      "Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
+      "connections %.2f, rest %.2f)",
+      job->funcName, comm->rank, comm->nRanks,
+      timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
+      timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
+      timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
 exit:
   if (job->newcomm) {
     /* assign it to user pointer. */
@@ -1626,7 +3156,7 @@ static ncclResult_t copyCommConfig(ncclComm_t childComm, ncclComm_t parnet) {
   return ncclSuccess;
 }
 
-static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
+ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
   ncclResult_t ret = ncclSuccess;
   /* config must not be NULL in this function */
   ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER;
@@ -1817,6 +3347,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
   // copying into allocated memory guarantees that the memory is properly aligned for any objects, removing that issue
   NCCLCHECKGOTO(ncclCalloc(&job->commId, nId), res, fail);
   memcpy(job->commId, commId, nId * NCCL_UNIQUE_ID_BYTES);
+  
 
   commIdEnv = ncclGetEnv("NCCL_COMM_ID");
   if (commIdEnv && myrank == 0) {
@@ -1859,7 +3390,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
 
   NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema,
     NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
-
+  printf("ncclCommInitRank magic %" PRIu64 "\n", (*newcomm)->magic);
   return ncclSuccess;
 }
 
@@ -2121,7 +3652,7 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) {
       ncclComm_t nextIntraComm = intracomm0;
 
       /* this is  the last call to ncclCommDestroy/Abort, we need to make sure all comms
-       * in the process have been finalized before we free local resources. */
+      * in the process have been finalized before we free local resources. */
       while (nextIntraComm) {
         curIntraComm = nextIntraComm;
         curRank = curIntraComm->rank;
@@ -2215,7 +3746,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
   NCCLCHECK(setCommAbortFlags(comm,1));
   comm->destroyFlag = 1;
   /* init thread must be joined before we destroy the comm,
-   * and we should ignore the init error here. */
+  * and we should ignore the init error here. */
   (void)ncclCommEnsureReady(comm);
 
   // once the comm is ready, we can access ranks etc
@@ -2352,12 +3883,17 @@ ncclResult_t  ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int exclude
     NCCLCHECKGOTO(setCommAbortFlags(comm, 0), res, exit);
   }
   NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/true, shrinkFlags, /*color=*/0, /*key=*/comm->rank, excludeRanksList, excludeRanksCount, config, __func__), res, exit);
+  // INFO(NCCL_INIT,"ncclCommShrink magic %d", (*newcomm)->magic);
+  // printf("ncclCommShrink point %p\n", (*newcomm));
+  // printf("ncclCommShrink magic %" PRIu64 "\n", (*newcomm)->magic);
 
   if (*newcomm) NVTX3_RANGE_ADD_PAYLOAD(CommShrink, NcclNvtxParamsCommShrinkSchema, NVTX3_PAYLOAD(comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, excludeRanksCount));
 
 exit:
   (void)ncclGroupErrCheck(res);
   NCCLCHECK(ncclGroupEndInternal());
+  printf("ncclCommShrink magic %" PRIu64 "\n", (*newcomm)->magic);
+  // printf("ncclCommShrink point %p\n", (*newcomm));
   return res;
 }
 
@@ -2367,7 +3903,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
 
   ncclResult_t res = ncclSuccess;
   NCCLCHECK(ncclGroupStartInternal());
-  NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/false, /*shrink mode=*/NCCL_SHRINK_DEFAULT, color, key, NULL, 0, config, __func__), res, exit);
+  NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/false, /*shrink mode=*/NCCL_SHRINK_DEFAULT, color, key, NULL, 0, config, __func__), res, exit);// 
 
   if (*newcomm)
     NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key));
@@ -2394,8 +3930,8 @@ const char* ncclGetErrorString(ncclResult_t code) {
 }
 
 /* Returns a human-readable message of the last error that occurred.
- * comm is currently unused and can be set to NULL
- */
+* comm is currently unused and can be set to NULL
+*/
 NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
 const char* ncclGetLastError(ncclComm_t comm) {
   return ncclLastError;
@@ -2436,7 +3972,6 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
 
   NCCLCHECK(CommCheck(comm, "CommCuDevice", "comm"));
   NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
-
   NCCLCHECK(ncclCommEnsureReady(comm));
 
   *devid = comm->cudaDev;
diff --git a/src/lighthouse.cc b/src/lighthouse.cc
new file mode 100644
index 0000000..49e82db
--- /dev/null
+++ b/src/lighthouse.cc
@@ -0,0 +1,339 @@
+#include "lighthouse.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define LH_STATE_OK     0  // operation succeeded
+#define LH_STATE_EMPTY  1  // loadLhState(): end-of-file on the first read, state was zeroed
+#define LH_STATE_ERROR  -1  // operation failed
+
+struct LhTxn {  // handle for one open, locked state file (created by txnBegin())
+    FILE* fp;  // stdio stream opened by txnBegin()
+    int fd;  // fileno(fp); used for the fcntl lock and for fsync
+    int writable;  // the `write` flag passed to txnBegin(); txnSave() refuses when it is 0
+};
+
+struct LhRank {  // one rank entry of the stored state
+    uint32_t rank_id;  // rank number
+    union ncclSocketAddress addr;  // socket address stored for that rank
+};
+
+struct LhState {  // record written by saveLhState() and read back by loadLhState()
+    uint64_t version;  // set to 1 by initialize(), incremented by updateVersion()
+    uint32_t nranks;  // rank count
+    uint64_t magic;  // caller-supplied value, set by initialize()/setMagic()
+    // initialize() fills first_rank and last_rank and leaves new_rank zeroed.
+    struct LhRank first_rank;
+    struct LhRank last_rank;
+    struct LhRank new_rank;  // copied into last_rank by updateLastRankAddr()
+};
+
+// Request an fcntl() record lock of `type` on fd. F_SETLKW waits until the lock
+// is granted. Returns fcntl()'s result.
+static int lockFd(int fd, short type) {
+    struct flock request;
+    memset(&request, 0, sizeof(request));  // l_start and l_len stay 0
+    request.l_whence = SEEK_SET;
+    request.l_type = type;
+
+    return fcntl(fd, F_SETLKW, &request);
+}
+
+// Release the fcntl() record lock on fd using F_UNLCK over the same range that
+// lockFd() requests. Returns fcntl()'s result.
+static int unlockFd(int fd) {
+    struct flock request;
+    memset(&request, 0, sizeof(request));  // l_start and l_len stay 0
+    request.l_whence = SEEK_SET;
+    request.l_type = F_UNLCK;
+
+    return fcntl(fd, F_SETLK, &request);
+}
+
+// Write every LhState field to fp at the stream's current position, in the order
+// version, nranks, magic, first_rank, last_rank, new_rank.
+// Returns LH_STATE_OK, or LH_STATE_ERROR after a stderr message on a NULL stream or failed write.
+static int saveLhState(FILE* fp, const struct LhState* state) {
+    if (fp == NULL) {
+        fprintf(stderr, "Invalid file pointer for saving lighthouse state\n");
+        return LH_STATE_ERROR;
+    }
+    if (fwrite(&state->version, sizeof(state->version), 1, fp) != 1) {
+        fprintf(stderr, "Failed to write lighthouse state version\n");
+        return LH_STATE_ERROR;
+    }
+    if (fwrite(&state->nranks, sizeof(state->nranks), 1, fp) != 1) {
+        fprintf(stderr, "Failed to write lighthouse state nranks\n");
+        return LH_STATE_ERROR;
+    }
+    if (fwrite(&state->magic, sizeof(state->magic), 1, fp) != 1) {
+        fprintf(stderr, "Failed to write lighthouse state magic\n");
+        return LH_STATE_ERROR;
+    }
+    if (fwrite(&state->first_rank, sizeof(state->first_rank), 1, fp) != 1) {
+        fprintf(stderr, "Failed to write lighthouse state first_rank\n");
+        return LH_STATE_ERROR;
+    }
+    if (fwrite(&state->last_rank, sizeof(state->last_rank), 1, fp) != 1) {
+        fprintf(stderr, "Failed to write lighthouse state last_rank\n");
+        return LH_STATE_ERROR;
+    }
+    if (fwrite(&state->new_rank, sizeof(state->new_rank), 1, fp) != 1) {
+        fprintf(stderr, "Failed to write lighthouse state new_rank\n");
+        return LH_STATE_ERROR;
+    }
+    return LH_STATE_OK;
+}
+
+// Read the fields written by saveLhState() back from fp, in the same order.
+// Returns LH_STATE_EMPTY (with *state zeroed) when the first read hits end-of-file,
+// LH_STATE_OK when every field was read, and LH_STATE_ERROR otherwise.
+static int loadLhState(FILE* fp, struct LhState* state) {
+    if (fp == NULL) {
+        fprintf(stderr, "Invalid file pointer for loading lighthouse state\n");
+        return LH_STATE_ERROR;
+    }
+
+    if (fread(&state->version, sizeof(state->version), 1, fp) != 1) {
+        if (!feof(fp)) {
+            fprintf(stderr, "Failed to read lighthouse state version\n");
+            return LH_STATE_ERROR;
+        }
+        memset(state, 0, sizeof(*state));  // no version could be read before EOF
+        return LH_STATE_EMPTY;
+    }
+
+    if (fread(&state->nranks, sizeof(state->nranks), 1, fp) != 1) {
+        fprintf(stderr, "Failed to read lighthouse state nranks\n");
+        return LH_STATE_ERROR;
+    }
+    if (fread(&state->magic, sizeof(state->magic), 1, fp) != 1) {
+        fprintf(stderr, "Failed to read lighthouse state magic\n");
+        return LH_STATE_ERROR;
+    }
+    if (fread(&state->first_rank, sizeof(state->first_rank), 1, fp) != 1) {
+        fprintf(stderr, "Failed to read lighthouse state first_rank\n");
+        return LH_STATE_ERROR;
+    }
+    if (fread(&state->last_rank, sizeof(state->last_rank), 1, fp) != 1) {
+        fprintf(stderr, "Failed to read lighthouse state last_rank\n");
+        return LH_STATE_ERROR;
+    }
+    if (fread(&state->new_rank, sizeof(state->new_rank), 1, fp) != 1) {
+        fprintf(stderr, "Failed to read lighthouse state new_rank\n");
+        return LH_STATE_ERROR;
+    }
+    return LH_STATE_OK;
+}
+
+// Poll the state file at `path` every 100 ms until its stored version equals
+// `expected_version`. Returns LH_STATE_OK on a match, LH_STATE_ERROR once
+// `timeout_ms` (when >= 0) has been waited, or the failing txnBegin()/txnLoad()
+// result. A negative `timeout_ms` keeps polling with no time limit.
+int txnWaitForVersion(const char* path, uint64_t expected_version, int timeout_ms) {
+    const int sleep_interval_us = 100 * 1000;
+    int waited_ms = 0;
+
+    while (1) {
+        struct LhTxn* lhTxn = NULL;
+        struct LhState* lhState = NULL;
+        int ret = txnBegin(path, 0, &lhTxn);
+        if (ret != LH_STATE_OK) {
+            fprintf(stderr, "lighthouse: txnBegin failed\n");
+            return ret;
+        }
+        ret = txnLoad(lhTxn, &lhState);
+        txnEnd(lhTxn);
+        free(lhTxn);  // txnEnd() only closes and zeroes the handle; release it every iteration
+        if (ret != LH_STATE_OK) {
+            fprintf(stderr, "lighthouse: txnLoad failed\n");
+            return ret;
+        }
+        uint64_t version = lhState->version;
+        free(lhState);
+        if (version == expected_version) return LH_STATE_OK;
+
+        if (timeout_ms >= 0 && waited_ms >= timeout_ms) {
+            fprintf(stderr, "lighthouse: timeout waiting for version %llu\n",
+                    (unsigned long long)expected_version);
+            return LH_STATE_ERROR;
+        }
+
+        usleep(sleep_interval_us);
+        waited_ms += sleep_interval_us / 1000;
+    }
+}
+
+// Open the state file at `path` and lock it: exclusive (F_WRLCK) when `write` is
+// nonzero, shared (F_RDLCK) otherwise. Write mode creates a missing file.
+// On success a malloc'ed handle is stored in *out; on failure nothing is leaked.
+int txnBegin(const char* path, int write, struct LhTxn** out) {
+    struct LhTxn* txn = (struct LhTxn*)malloc(sizeof(*txn));
+    if (!txn) return LH_STATE_ERROR;
+    txn->writable = write;
+    txn->fp = NULL;
+    if (write) {
+        // O_CREAT without O_TRUNC: unlike an "r+b" -> "w+b" fallback, losing a
+        // creation race can never truncate state another process already wrote.
+        int fd = open(path, O_RDWR | O_CREAT, 0666);
+        if (fd >= 0 && (txn->fp = fdopen(fd, "r+b")) == NULL) close(fd);
+    } else {
+        txn->fp = fopen(path, "rb");
+    }
+    if (txn->fp && lockFd(fileno(txn->fp), write ? F_WRLCK : F_RDLCK) >= 0) {
+        txn->fd = fileno(txn->fp);
+        *out = txn;
+        return LH_STATE_OK;
+    }
+    if (txn->fp) fclose(txn->fp);
+    free(txn);
+    return LH_STATE_ERROR;
+}
+
+// Read the stored state from the start of the file into a newly malloc'ed
+// LhState returned through *out (the caller frees it). An empty file yields an
+// all-zero state and LH_STATE_OK. On LH_STATE_ERROR *out is left untouched.
+int txnLoad(struct LhTxn* txn, struct LhState** out) {
+    rewind(txn->fp);
+    struct LhState* state = (struct LhState*)malloc(sizeof(struct LhState));
+    if (!state) return LH_STATE_ERROR;
+
+    if (loadLhState(txn->fp, state) == LH_STATE_ERROR) {
+        free(state);  // do not leak the buffer when the read fails
+        return LH_STATE_ERROR;
+    }
+    *out = state;
+    return LH_STATE_OK;
+}
+
+// Rewrite the file from offset 0 with `state`, then flush and fsync it.
+// Returns LH_STATE_ERROR with errno = EPERM for a read-only transaction, and
+// LH_STATE_ERROR when the write, flush or sync fails; LH_STATE_OK otherwise.
+int txnSave(struct LhTxn* txn, const struct LhState* state) {
+    if (!txn->writable) {
+        errno = EPERM;
+        return LH_STATE_ERROR;
+    }
+
+    rewind(txn->fp);
+    if (saveLhState(txn->fp, state) < 0) return LH_STATE_ERROR;
+
+    // A failed flush or sync means the new state may not be in the file; report it.
+    if (fflush(txn->fp) != 0 || fsync(txn->fd) != 0) return LH_STATE_ERROR;
+    return LH_STATE_OK;
+}
+
+// Unlock and close the transaction's file (if one is open), then zero *txn.
+// The LhTxn allocation itself is not released here.
+// Returns LH_STATE_ERROR only when the unlock fails; fclose()'s result is ignored.
+int txnEnd(struct LhTxn* txn) {
+    int status = LH_STATE_OK;
+    if (txn->fp != NULL) {
+        if (unlockFd(txn->fd) < 0) status = LH_STATE_ERROR;
+        fclose(txn->fp);
+    }
+
+    memset(txn, 0, sizeof(*txn));
+    return status;
+}
+
+// Zero `state`, then set version 1, `nranks`, `magic`, and the entries for rank 0
+// and rank nranks-1 from `src_addrs`. Fails (state left zeroed) on NULL or empty input.
+int initialize(struct LhState* state, const union ncclSocketAddress* src_addrs, int nranks, uint64_t magic) {
+    memset(state, 0, sizeof(struct LhState));
+    // Without this guard src_addrs[nranks - 1] would be read out of bounds.
+    if (src_addrs == NULL || nranks < 1) return LH_STATE_ERROR;
+    state->version = 1;
+    state->nranks = nranks;
+    state->magic = magic;
+    state->first_rank.rank_id = 0;
+    memcpy(&state->first_rank.addr, &src_addrs[0], sizeof(union ncclSocketAddress));
+    state->last_rank.rank_id = nranks - 1;
+    memcpy(&state->last_rank.addr, &src_addrs[nranks - 1], sizeof(union ncclSocketAddress));
+    return LH_STATE_OK;
+}
+
+// Replace the magic value held in `state`.
+void setMagic(struct LhState* state, uint64_t magic) {
+    state->magic = magic;
+}
+
+// Store `rank` and its address as the first-rank entry and overwrite the rank count.
+int setFirstRank(struct LhState* state, const union ncclSocketAddress* firstRankNcclAddr, uint32_t rank, uint32_t nranks) {
+    state->nranks = nranks;
+    state->first_rank.rank_id = rank;
+    memcpy(&state->first_rank.addr, firstRankNcclAddr, sizeof(state->first_rank.addr));
+    return LH_STATE_OK;
+}
+
+// Store `rank` and its address as the last-rank entry and overwrite the rank count.
+int setLastRank(struct LhState* state, const union ncclSocketAddress* lastRankNcclAddr, uint32_t rank, uint32_t nranks) {
+    state->nranks = nranks;
+    state->last_rank.rank_id = rank;
+    memcpy(&state->last_rank.addr, lastRankNcclAddr, sizeof(state->last_rank.addr));
+    return LH_STATE_OK;
+}
+
+// Store `rank` and its address as the new-rank entry; the rank count is not touched.
+int setNewRank(struct LhState* state, const union ncclSocketAddress* newRankNcclAddr, uint32_t rank) {
+    memcpy(&state->new_rank.addr, newRankNcclAddr, sizeof(state->new_rank.addr));
+    state->new_rank.rank_id = rank;
+    return LH_STATE_OK;
+}
+
+// Copy the whole new-rank entry (rank id and address) over the last-rank entry.
+void updateLastRankAddr(struct LhState* state) {
+    state->last_rank = state->new_rank;
+}
+
+// Advance the stored version by one.
+void updateVersion(struct LhState* state) {
+    state->version += 1;
+}
+
+// Copy the stored magic value into *magic.
+void getMagic(const struct LhState* state, uint64_t* magic) {
+    *magic = state->magic;
+}
+
+// Copy the stored version into *version.
+void getVersion(const struct LhState* state, uint64_t* version) {
+    *version = state->version;
+}
+
+// Copy the first-rank address into *nextAddr (presumably the new rank's successor; confirm with callers).
+int queryNextRankAddrNew(const struct LhState* state, union ncclSocketAddress* nextAddr) {
+    memcpy(nextAddr, &state->first_rank.addr, sizeof(*nextAddr));
+    return LH_STATE_OK;
+}
+
+// Copy the new-rank address into *nextAddr (presumably the last rank's successor; confirm with callers).
+int queryNextRankAddrLast(const struct LhState* state, union ncclSocketAddress* nextAddr) {
+    memcpy(nextAddr, &state->new_rank.addr, sizeof(*nextAddr));
+    return LH_STATE_OK;
+}
+
+// Dump `state` to stdout; addresses are always read through the IPv4 (sin) view of the union.
+void printLhState(const struct LhState* state) {
+    printf("Lighthouse State:\n");
+    printf("  Version: %llu\n", (unsigned long long)state->version);  // cast matches the specifier on every ABI
+    printf("  Nranks: %u\n", state->nranks);
+    printf("  Magic: %llu\n", (unsigned long long)state->magic);
+    printf("  First Rank ID: %u\n", state->first_rank.rank_id);
+    printf("    Address Family: %u\n", state->first_rank.addr.sin.sin_family);
+    printf("    Port: %u\n", ntohs(state->first_rank.addr.sin.sin_port));
+    printf("    Address: %x\n", state->first_rank.addr.sin.sin_addr.s_addr);
+    printf("  Last Rank ID: %u\n", state->last_rank.rank_id);
+    printf("    Address Family: %u\n", state->last_rank.addr.sin.sin_family);
+    printf("    Port: %u\n", ntohs(state->last_rank.addr.sin.sin_port));
+    printf("    Address: %x\n", state->last_rank.addr.sin.sin_addr.s_addr);
+    printf("  New Rank ID: %u\n", state->new_rank.rank_id);
+    printf("    Address Family: %u\n", state->new_rank.addr.sin.sin_family);
+    printf("    Port: %u\n", ntohs(state->new_rank.addr.sin.sin_port));
+    printf("    Address: %x\n", state->new_rank.addr.sin.sin_addr.s_addr);
+}
\ No newline at end of file
diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc
index eb9cd10..f802eaa 100644
--- a/src/misc/shmutils.cc
+++ b/src/misc/shmutils.cc
@@ -101,17 +101,17 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void
     goto fail;
   }
 
-  if (create) {
-    *(int*)(hptr + shmSize) = refcount;
-  } else {
-    int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize));
-    if (remref == 0) {
-      /* the last peer has completed attachment, it should unlink the shm mem file. */
-      if (unlink(shmPath) != 0) {
-        INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno);
-      }
-    }
-  }
+  // if (create) {
+  //   *(int*)(hptr + shmSize) = refcount;
+  // } else {
+  //   int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize));
+  //   if (remref == 0) {
+  //     /* the last peer has completed attachment, it should unlink the shm mem file. */
+  //     if (unlink(shmPath) != 0) {
+  //       INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno);
+  //     }
+  //   }
+  // }
 
   if (devShmPtr) {
     CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
index d066d28..395e746 100644
--- a/src/misc/socket.cc
+++ b/src/misc/socket.cc
@@ -4,981 +4,985 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "socket.h"
-#include "utils.h"
-#include <stdlib.h>
-
-#include <unistd.h>
-#include <ifaddrs.h>
-#include <net/if.h>
-#include "param.h"
-#include <time.h>
-
-NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34);
-NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100);
-static void msleep(unsigned int time_msec) {
-  const long c_1e6 = 1e6;
-  struct timespec tv = (struct timespec){
-      .tv_sec = time_msec / 1000,
-      .tv_nsec = (time_msec % 1000) * c_1e6,
-  };
-  nanosleep(&tv, NULL);
-}
-
-static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
-  int bytes = 0;
-  *closed = 0;
-  char* data = (char*)ptr;
-  char line[SOCKET_NAME_MAXLEN+1];
-  do {
-    if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
-    if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
-    if (op == NCCL_SOCKET_RECV && bytes == 0) {
-      *closed = 1;
-      return ncclSuccess;
-    }
-    if (bytes == -1) {
-      if ((op == NCCL_SOCKET_SEND && errno == EPIPE) || (op == NCCL_SOCKET_RECV && errno == ECONNRESET)) {
-        *closed = 1;
-        return ncclSuccess;
-      }
-      if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
-        WARN("socketProgressOpt: Call to %s %s failed : %s", (op == NCCL_SOCKET_RECV ? "recv from" : "send to"),
-             ncclSocketToString(&sock->addr, line), strerror(errno));
-        return ncclRemoteError;
-      } else {
-        bytes = 0;
-      }
-    }
-    (*offset) += bytes;
-    if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) {
-      INFO(NCCL_NET, "socketProgressOpt: abort called");
-      return ncclInternalError;
-    }
-  } while (sock->asyncFlag == 0 && bytes > 0 && (*offset) < size);
-  return ncclSuccess;
-}
-
-static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* pclosed = NULL) {
-  int closed;
-  NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
-  if (closed) {
-    if (pclosed) {
-      *pclosed = closed;
-      return ncclSuccess;
-    } else {
-      char line[SOCKET_NAME_MAXLEN+1];
-      WARN("socketProgress: Connection closed by remote peer %s",
-           ncclSocketToString(&sock->addr, line, /*numericHostForm*/0));
-      return ncclRemoteError;
-    }
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
-  while (*offset < size)
-    NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
-  return ncclSuccess;
-}
-
-/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
- *
- * Output: "IPv4/IPv6 address<port>"
- */
-const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
-  const struct sockaddr *saddr;
-  char host[NI_MAXHOST], service[NI_MAXSERV];
-  int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
-  if (buf == NULL || addr == NULL) goto fail;
-  saddr = &addr->sa;
-  if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) goto fail;
-  /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
-   * (When not set, this will still happen in case the node's name cannot be determined.)
-   */
-  if (getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag)) goto fail;
-  sprintf(buf, "%s<%s>", host, service);
-  return buf;
-fail:
-  if (buf)
-    buf[0] = '\0';
-  return buf;
-}
-
-static uint16_t socketToPort(union ncclSocketAddress *addr) {
-  struct sockaddr *saddr = &addr->sa;
-  return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
-}
-
-/* Allow the user to force the IPv4/IPv6 interface selection */
-static int envSocketFamily(void) {
-  int family = -1; // Family selection is not forced, will use first one found
-  const char* env = ncclGetEnv("NCCL_SOCKET_FAMILY");
-  if (env == NULL)
-    return family;
-
-  INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
-
-  if (strcmp(env, "AF_INET") == 0)
-    family = AF_INET;  // IPv4
-  else if (strcmp(env, "AF_INET6") == 0)
-    family = AF_INET6; // IPv6
-  return family;
-}
-
-static ncclResult_t findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family,
-                                   int maxIfNameSize, int maxIfs, int* found) {
-#ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
-#endif
-  struct netIf userIfs[MAX_IFS];
-  bool searchNot = prefixList && prefixList[0] == '^';
-  if (searchNot) prefixList++;
-  bool searchExact = prefixList && prefixList[0] == '=';
-  if (searchExact) prefixList++;
-  int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
-
-  *found = 0;
-  struct ifaddrs *interfaces, *interface;
-  SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
-  for (interface = interfaces; interface && *found < maxIfs; interface = interface->ifa_next) {
-    if (interface->ifa_addr == NULL) continue;
-
-    /* We only support IPv4 & IPv6 */
-    int family = interface->ifa_addr->sa_family;
-    if (family != AF_INET && family != AF_INET6)
-      continue;
-
-    TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line));
-
-    /* Allow the caller to force the socket family type */
-    if (sock_family != -1 && family != sock_family)
-      continue;
-
-    /* We also need to skip IPv6 loopback interfaces */
-    if (family == AF_INET6) {
-      struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
-      if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
-    }
-
-    // check against user specified interfaces
-    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
-      continue;
-    }
-
-    // Check that this interface has not already been saved
-    // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
-    bool duplicate = false;
-    for (int i = 0; i < *found; i++) {
-      if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
-    }
-
-    if (!duplicate) {
-      // Store the interface name
-      strncpy(names + (*found)*maxIfNameSize, interface->ifa_name, maxIfNameSize);
-      // Store the IP address
-      int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-      memset(addrs + *found, '\0', sizeof(*addrs));
-      memcpy(addrs + *found, interface->ifa_addr, salen);
-      (*found)++;
-    }
-  }
-
-  freeifaddrs(interfaces);
-  return ncclSuccess;
-}
-
-static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) {
-  /* Check family first */
-  int family = local_if.ifa_addr->sa_family;
-  if (family != remote->sa.sa_family) {
-    return false;
-  }
-
-  if (family == AF_INET) {
-    struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
-    struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
-    struct sockaddr_in& remote_addr = remote->sin;
-    struct in_addr local_subnet, remote_subnet;
-    local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
-    remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
-    return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
-  } else if (family == AF_INET6) {
-    struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
-    struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
-    struct sockaddr_in6& remote_addr = remote->sin6;
-    struct in6_addr& local_in6 = local_addr->sin6_addr;
-    struct in6_addr& mask_in6 = mask->sin6_addr;
-    struct in6_addr& remote_in6 = remote_addr.sin6_addr;
-    bool same = true;
-    int len = 16;  //IPv6 address is 16 unsigned char
-    for (int c = 0; c < len; c++) {  //Network byte order is big-endian
-      char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
-      char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
-      if (c1 ^ c2) {
-        same = false;
-        break;
-      }
-    }
-    // At last, we need to compare scope id
-    // Two Link-type addresses can have the same subnet address even though they are not in the same scope
-    // For Global type, this field is 0, so a comparison wouldn't matter
-    same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
-    return same;
-  } else {
-    INFO(NCCL_NET, "Net : Unsupported address family type");
-    return false;
-  }
-}
-
-ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
-                                          union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found) {
-#ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
-  char line_a[SOCKET_NAME_MAXLEN+1];
-#endif
-  *found = 0;
-  struct ifaddrs *interfaces, *interface;
-  SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
-  for (interface = interfaces; interface && !*found; interface = interface->ifa_next) {
-    if (interface->ifa_addr == NULL) continue;
-
-    /* We only support IPv4 & IPv6 */
-    int family = interface->ifa_addr->sa_family;
-    if (family != AF_INET && family != AF_INET6)
-      continue;
-
-    // check against user specified interfaces
-    if (!matchSubnet(*interface, remoteAddr)) {
-      continue;
-    }
-
-    // Store the local IP address
-    int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-    memcpy(localAddr, interface->ifa_addr, salen);
-
-    // Store the interface name
-    strncpy(ifName, interface->ifa_name, ifNameMaxSize);
-
-    TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s",
-          interface->ifa_name, ncclSocketToString(localAddr, line), ncclSocketToString(remoteAddr, line_a));
-    *found = 1;
-  }
-
-  freeifaddrs(interfaces);
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
-  if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
-    WARN("Net : string is null");
-    return ncclInvalidArgument;
-  }
-
-  bool ipv6 = ip_port_pair[0] == '[';
-  /* Construct the sockaddress structure */
-  if (!ipv6) {
-    struct netIf ni;
-    // parse <ip_or_hostname>:<port> string, expect one pair
-    if (parseStringList(ip_port_pair, &ni, 1) != 1) {
-      WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
-      return ncclInvalidArgument;
-    }
-
-    struct addrinfo hints, *p;
-    int rv;
-    memset(&hints, 0, sizeof(hints));
-    hints.ai_family = AF_UNSPEC;
-    hints.ai_socktype = SOCK_STREAM;
-
-    if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
-      WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
-      return ncclInvalidArgument;
-    }
-
-    // use the first
-    if (p->ai_family == AF_INET) {
-      struct sockaddr_in& sin = ua->sin;
-      memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
-      sin.sin_family = AF_INET;                        // IPv4
-      //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr));  // IP address
-      sin.sin_port = htons(ni.port);                   // port
-    } else if (p->ai_family == AF_INET6) {
-      struct sockaddr_in6& sin6 = ua->sin6;
-      memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
-      sin6.sin6_family = AF_INET6;                     // IPv6
-      sin6.sin6_port = htons(ni.port);                 // port
-      sin6.sin6_flowinfo = 0;                          // needed by IPv6, but possibly obsolete
-      sin6.sin6_scope_id = 0;                          // should be global scope, set to 0
-    } else {
-      WARN("Net : unsupported IP family");
-      freeaddrinfo(p);
-      return ncclInvalidArgument;
-    }
-
-    freeaddrinfo(p); // all done with this structure
-
-  } else {
-    int i, j = -1, len = strlen(ip_port_pair);
-    for (i = 1; i < len; i++) {
-      if (ip_port_pair[i] == '%') j = i;
-      if (ip_port_pair[i] == ']') break;
-    }
-    if (i == len) {
-      WARN("Net : No valid [IPv6]:port pair found");
-      return ncclInvalidArgument;
-    }
-    bool global_scope = (j == -1 ? true : false);     // If no % found, global scope; otherwise, link scope
-
-    char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
-    memset(ip_str, '\0', sizeof(ip_str));
-    memset(port_str, '\0', sizeof(port_str));
-    memset(if_name, '\0', sizeof(if_name));
-    strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
-    strncpy(port_str, ip_port_pair+i+2, len-i-1);
-    int port = atoi(port_str);
-    if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
-
-    struct sockaddr_in6& sin6 = ua->sin6;
-    sin6.sin6_family = AF_INET6;                       // IPv6
-    inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr));    // IP address
-    sin6.sin6_port = htons(port);                      // port
-    sin6.sin6_flowinfo = 0;                            // needed by IPv6, but possibly obsolete
-    sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name);  // 0 if global scope; intf index if link scope
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
-                                int* nIfs) {
-  static int shownIfName = 0;
-  // Allow user to force the INET socket family selection
-  int sock_family = envSocketFamily();
-  // User specified interface
-  const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME");
-  *nIfs = 0;
-  if (env && strlen(env) > 1) {
-    INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
-    // Specified by user : find or fail
-    if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
-    NCCLCHECK(findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
-  } else {
-    // Try to automatically pick the right one
-    // Start with IB
-    NCCLCHECK(findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
-    // else see if we can get some hint from COMM ID
-    if (*nIfs == 0) {
-      const char* commId = ncclGetEnv("NCCL_COMM_ID");
-      if (commId && strlen(commId) > 1) {
-        INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
-        // Try to find interface that is in the same subnet as the IP in comm id
-        union ncclSocketAddress idAddr;
-        NCCLCHECK(ncclSocketGetAddrFromString(&idAddr, commId));
-        NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs));
-      }
-    }
-    // Then look for anything else (but not docker or lo)
-    if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
-    // Finally look for docker, then lo.
-    if (*nIfs == 0) NCCLCHECK(findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
-    if (*nIfs == 0) NCCLCHECK(findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
-  if (sock == NULL) {
-    WARN("ncclSocketListen: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  if (sock->fd == -1) {
-    WARN("ncclSocketListen: file descriptor is -1");
-    return ncclInvalidArgument;
-  }
-
-  if (socketToPort(&sock->addr)) {
-    // Port is forced by env. Make sure we get the port.
-    int opt = 1;
-    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
-#if defined(SO_REUSEPORT)
-    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
-#endif
-  }
-
-  // addr port should be 0 (Any port)
-  SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind");
-
-  /* Get the assigned Port */
-  socklen_t size = sock->salen;
-  SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname");
-
-#ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
-  TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line));
-#endif
-
-  /* Put the socket in listen mode
-   * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
-   */
-  SYSCHECK(listen(sock->fd, 16384), "listen");
-  sock->state = ncclSocketStateReady;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) {
-  if (sock == NULL) {
-    WARN("ncclSocketGetAddr: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  if (sock->state != ncclSocketStateReady) return ncclInternalError;
-  memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress));
-  return ncclSuccess;
-}
-
-static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
-  socklen_t socklen = sizeof(union ncclSocketAddress);
-  sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
-  if (sock->fd != -1) {
-    sock->state = ncclSocketStateAccepted;
-  } else if (errno == ENETDOWN || errno == EPROTO || errno == ENOPROTOOPT || errno == EHOSTDOWN ||
-             errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH ||
-             errno == EINTR) {
-    /* per accept's man page, for linux sockets, the following errors might be already pending errors
-     * and should be considered as EAGAIN. To avoid infinite loop in case of errors, we use the retry count*/
-    if (++sock->errorRetries == ncclParamRetryCnt()) {
-      WARN("socketTryAccept: exceeded error retry count after %d attempts, %s", sock->errorRetries, strerror(errno));
-      return ncclSystemError;
-    }
-    INFO(NCCL_NET|NCCL_INIT, "Call to accept returned %s, retrying", strerror(errno));
-  } else if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
-    WARN("socketTryAccept: Accept failed: %s", strerror(errno));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1);
-NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1);
-
-static ncclResult_t socketSetFlags(struct ncclSocket* sock) {
-  const int one = 1;
-  /* Set socket as non-blocking if async or if we need to be able to abort */
-  if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
-    int flags;
-    SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl");
-    SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
-  }
-  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt TCP NODELAY");
-  // setsockopt should not fail even if the sizes are too large, do not change the default if unset by the user (=-1)
-  int rcvBuf = ncclParamSocketMaxRecvBuff(), sndBuf = ncclParamSocketMaxSendBuff();
-  if (sndBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF");
-  if (rcvBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF");
-  return ncclSuccess;
-}
-
-static void socketResetAccept(struct ncclSocket* sock) {
-  char line[SOCKET_NAME_MAXLEN+1];
-  INFO(NCCL_NET|NCCL_INIT, "socketFinalizeAccept: didn't receive a valid magic from %s",
-       ncclSocketToString(&sock->addr, line));
-  // Ignore spurious connection and accept again
-  (void)close(sock->fd);
-  sock->fd = -1;
-  sock->state = ncclSocketStateAccepting;
-  sock->finalizeCounter = 0;
-}
-
-static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
-  uint64_t magic;
-  enum ncclSocketType type;
-  int received;
-  char line[SOCKET_NAME_MAXLEN+1];
-  // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do)
-  NCCLCHECK(socketSetFlags(sock));
-
-  if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) {
-    if (sock->asyncFlag == 0) {
-      received = 0;
-      if (socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received) != ncclSuccess) {
-        socketResetAccept(sock);
-        return ncclSuccess;
-      }
-    } else {
-      int closed = 0;
-      received = sock->finalizeCounter;
-      NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received, &closed));
-      sock->finalizeCounter = received;
-      if (received < sizeof(magic)) {
-        if (closed) {
-          socketResetAccept(sock);
-        }
-        return ncclSuccess;
-      }
-      memcpy(&magic, sock->finalizeBuffer, sizeof(magic));
-    }
-    if (magic != sock->magic) {
-      socketResetAccept(sock);
-      return ncclSuccess;
-    }
-  }
-  if (sock->asyncFlag == 0) {
-    received = 0;
-    NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
-  } else {
-    received = sock->finalizeCounter - sizeof(magic);
-    NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(type), &received));
-    sock->finalizeCounter = received + sizeof(magic);
-    if (received < sizeof(type)) return ncclSuccess;
-    memcpy(&type, sock->finalizeBuffer, sizeof(type));
-  }
-  if (type != sock->type) {
-    WARN("socketFinalizeAccept from %s: wrong type %d != %d", ncclSocketToString(&sock->addr, line), type, sock->type);
-    sock->state = ncclSocketStateError;
-    close(sock->fd);
-    sock->fd = -1;
-    return ncclInternalError;
-  } else {
-    sock->state = ncclSocketStateReady;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t socketResetFd(struct ncclSocket* sock) {
-  ncclResult_t ret = ncclSuccess;
-  int fd = -1;
-  SYSCHECKGOTO(fd = socket(sock->addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, cleanup);
-  // if sock->fd is valid, close it and reuse its number
-  if (sock->fd != -1) {
-    SYSCHECKGOTO(dup2(fd, sock->fd), "dup2", ret, cleanup);
-    SYSCHECKGOTO(close(fd), "close", ret, cleanup);
-  } else {
-    sock->fd = fd;
-  }
-  NCCLCHECKGOTO(socketSetFlags(sock), ret, exit);
-exit:
-  return ret;
-cleanup:
-  // cleanup fd, leave sock->fd untouched
-  if (fd != -1) {
-    (void)close(fd);
-  }
-  goto exit;
-}
-
-static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) {
-  char line[SOCKET_NAME_MAXLEN+1];
-  if (errCode == 0) {
-    sock->state = ncclSocketStateConnected;
-  } else if (errCode == EINPROGRESS) {
-    sock->state = ncclSocketStateConnectPolling;
-  } else if (errCode == EINTR || errCode == EWOULDBLOCK || errCode == EAGAIN || errCode == ETIMEDOUT ||
-             errCode == EHOSTUNREACH || errCode == ECONNREFUSED) {
-    if (sock->customRetry == 0) {
-      if (sock->errorRetries++ == ncclParamRetryCnt()) {
-        sock->state = ncclSocketStateError;
-        WARN("%s: connect to %s returned %s, exceeded error retry count after %d attempts",
-             funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), sock->errorRetries);
-        return ncclRemoteError;
-      }
-      unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut();
-      INFO(NCCL_NET|NCCL_INIT, "%s: connect to %s returned %s, retrying (%d/%ld) after sleep for %u msec",
-           funcName, ncclSocketToString(&sock->addr, line), strerror(errCode),
-           sock->errorRetries, ncclParamRetryCnt(), sleepTime);
-      msleep(sleepTime);
-    }
-    NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */
-    sock->state = ncclSocketStateConnecting;
-  } else {
-    sock->state = ncclSocketStateError;
-    WARN("%s: connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
-  /* blocking/non-blocking connect() is determined by asyncFlag. */
-  int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
-  return socketConnectCheck(sock, (ret == -1) ? errno : 0, __func__);
-}
-
-static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
-  struct pollfd pfd;
-  int timeout = 1, ret;
-  socklen_t rlen = sizeof(int);
-  char line[SOCKET_NAME_MAXLEN+1];
-
-  memset(&pfd, 0, sizeof(struct pollfd));
-  pfd.fd = sock->fd;
-  pfd.events = POLLOUT;
-  ret = poll(&pfd, 1, timeout);
-
-  if (ret == 0 || (ret < 0 && errno == EINTR)) {
-    return ncclSuccess;
-  } else if (ret < 0) {
-    WARN("socketPollConnect to %s failed with error %s", ncclSocketToString(&sock->addr, line), strerror(errno));
-    return ncclSystemError;
-  }
-
-  /* check socket status */
-  SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
-  return socketConnectCheck(sock, ret, __func__);
-}
-
-ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
-  if (sock == NULL) {
-    WARN("ncclSocketPollConnect: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  NCCLCHECK(socketPollConnect(sock));
-  return ncclSuccess;
-}
-
-static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) {
-  int sent;
-  if (sock->asyncFlag == 0) {
-    sent = 0;
-    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
-    sent = 0;
-    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
-  } else {
-    if (sock->finalizeCounter < sizeof(sock->magic)) {
-      sent = sock->finalizeCounter;
-      NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
-      sock->finalizeCounter = sent;
-      if (sent < sizeof(sock->magic)) return ncclSuccess;
-    }
-    sent = sock->finalizeCounter - sizeof(sock->magic);
-    NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
-    sock->finalizeCounter = sent + sizeof(sock->magic);
-    if (sent < sizeof(sock->type)) return ncclSuccess;
-  }
-  sock->state = ncclSocketStateReady;
-  return ncclSuccess;
-}
-
-static ncclResult_t socketProgressState(struct ncclSocket* sock) {
-  if (sock->state == ncclSocketStateAccepting) {
-    NCCLCHECK(socketTryAccept(sock));
-  }
-  if (sock->state == ncclSocketStateAccepted) {
-    NCCLCHECK(socketFinalizeAccept(sock));
-  }
-  if (sock->state == ncclSocketStateConnecting) {
-    NCCLCHECK(socketStartConnect(sock));
-  }
-  if (sock->state == ncclSocketStateConnectPolling) {
-    NCCLCHECK(socketPollConnect(sock));
-  }
-  if (sock->state == ncclSocketStateConnected) {
-    NCCLCHECK(socketFinalizeConnect(sock));
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
-  if (sock == NULL) {
-    *running = 0;
-    return ncclSuccess;
-  }
-  if (sock->state == ncclSocketStateError || sock->state == ncclSocketStateClosed) {
-    WARN("ncclSocketReady: unexpected socket state %d", sock->state);
-    return ncclRemoteError;
-  }
-  *running = (sock->state == ncclSocketStateReady) ? 1 : 0;
-  if (*running == 0) {
-    NCCLCHECK(socketProgressState(sock));
-    *running = (sock->state == ncclSocketStateReady) ? 1 : 0;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
-#ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
-#endif
-
-  if (sock == NULL) {
-    WARN("ncclSocketConnect: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  if (sock->fd == -1) {
-    WARN("ncclSocketConnect: file descriptor is -1");
-    return ncclInvalidArgument;
-  }
-
-  if (sock->state != ncclSocketStateInitialized) {
-    WARN("ncclSocketConnect: wrong socket state %d", sock->state);
-    if (sock->state == ncclSocketStateError) return ncclRemoteError;
-    return ncclInternalError;
-  }
-  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
-
-  sock->state = ncclSocketStateConnecting;
-  sock->finalizeCounter = 0;
-  do {
-    NCCLCHECK(socketProgressState(sock));
-  } while (sock->asyncFlag == 0 &&
-      (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) &&
-      (sock->state == ncclSocketStateConnecting ||
-       sock->state == ncclSocketStateConnectPolling ||
-       sock->state == ncclSocketStateConnected));
-
-  if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
-
-  switch (sock->state) {
-    case ncclSocketStateConnecting:
-    case ncclSocketStateConnectPolling:
-    case ncclSocketStateConnected:
-    case ncclSocketStateReady:
-      return ncclSuccess;
-    case ncclSocketStateError:
-      return ncclSystemError;
-    default:
-      WARN("ncclSocketConnect: wrong socket state %d", sock->state);
-      return ncclInternalError;
-  }
-}
-
-ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) {
-  ncclResult_t ret = ncclSuccess;
-
-  if (listenSock == NULL || sock == NULL) {
-    WARN("ncclSocketAccept: pass NULL socket");
-    ret = ncclInvalidArgument;
-    goto exit;
-  }
-  if (listenSock->state != ncclSocketStateReady) {
-    WARN("ncclSocketAccept: wrong socket state %d", listenSock->state);
-    if (listenSock->state == ncclSocketStateError)
-      ret = ncclSystemError;
-    else
-      ret = ncclInternalError;
-    goto exit;
-  }
-
-  if (sock->acceptFd == -1) {
-    memcpy(sock, listenSock, sizeof(struct ncclSocket));
-    sock->acceptFd = listenSock->fd;
-    sock->state = ncclSocketStateAccepting;
-    sock->finalizeCounter = 0;
-  }
-
-  do {
-    NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
-  } while (sock->asyncFlag == 0 &&
-      (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) &&
-      (sock->state == ncclSocketStateAccepting ||
-       sock->state == ncclSocketStateAccepted));
-
-  if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
-
-  switch (sock->state) {
-    case ncclSocketStateAccepting:
-    case ncclSocketStateAccepted:
-    case ncclSocketStateReady:
-      ret = ncclSuccess;
-      break;
-    case ncclSocketStateError:
-      ret = ncclSystemError;
-      break;
-    default:
-      WARN("ncclSocketAccept: wrong socket state %d", sock->state);
-      ret = ncclInternalError;
-      break;
-  }
-
-exit:
-  return ret;
-}
-
-ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry) {
-  ncclResult_t ret = ncclSuccess;
-
-  if (sock == NULL) goto exit;
-  sock->errorRetries = 0;
-  sock->abortFlag = abortFlag;
-  sock->asyncFlag = asyncFlag;
-  sock->state = ncclSocketStateInitialized;
-  sock->magic = magic;
-  sock->type = type;
-  sock->fd = -1;
-  sock->acceptFd = -1;
-  sock->customRetry = customRetry;
-
-  if (addr) {
-    /* IPv4/IPv6 support */
-    int family;
-    memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
-    family = sock->addr.sa.sa_family;
-    if (family != AF_INET && family != AF_INET6) {
-      char line[SOCKET_NAME_MAXLEN+1];
-      WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
-          ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
-      ret = ncclInternalError;
-      goto exit;
-    }
-    sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-    // in case of error, we close the fd before returning as it's unclear if the caller has to use ncclSocketClose for cleanup
-    NCCLCHECKGOTO(socketResetFd(sock), ret, fail);
-  } else {
-    memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
-  }
-exit:
-  return ret;
-fail:
-  if (sock->fd != -1) {
-    close(sock->fd);
-    sock->fd = -1;
-  }
-  goto exit;
-}
-
-ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed) {
-  if (sock == NULL) {
-    WARN("ncclSocketProgress: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  NCCLCHECK(socketProgress(op, sock, ptr, size, offset, closed));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
-  if (sock == NULL) {
-    WARN("ncclSocketWait: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  NCCLCHECK(socketWait(op, sock, ptr, size, offset));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
-  int offset = 0;
-  if (sock == NULL) {
-    WARN("ncclSocketSend: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  if (sock->state != ncclSocketStateReady) {
-    WARN("ncclSocketSend: socket state (%d) is not ready", sock->state);
-    return ncclInternalError;
-  }
-  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
-  int offset = 0;
-  if (sock == NULL) {
-    WARN("ncclSocketRecv: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  if (sock->state != ncclSocketStateReady && sock->state != ncclSocketStateTerminating) {
-    WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
-    return ncclInternalError;
-  }
-  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize) {
-  int sendOffset = 0, recvOffset = 0;
-  if (sendSock == NULL || recvSock == NULL) {
-    WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock);
-    return ncclInternalError;
-  }
-  if (sendSock->state != ncclSocketStateReady ||
-      (recvSock->state != ncclSocketStateReady && recvSock->state != ncclSocketStateTerminating)) {
-    WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state);
-    return ncclInternalError;
-  }
-  while (sendOffset < sendSize || recvOffset < recvSize) {
-    if (sendOffset < sendSize) NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sendSock, sendPtr, sendSize, &sendOffset));
-    if (recvOffset < recvSize) NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, recvSock, recvPtr, recvSize, &recvOffset));
-  }
-  return ncclSuccess;
-}
-
-
-// Receive or detect connection closed
-ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
-  int offset = 0;
-  if (sock == NULL) {
-    WARN("ncclSocketTryRecv: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  *closed = 0;
-  // Block until connection closes or nbytes received
-  if (blocking) {
-    while (offset < size) {
-      NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
-      if (*closed) return ncclSuccess;
-    }
-  } else {
-    NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
-    if (*closed) return ncclSuccess;
-
-    // If any bytes were received, block waiting for the rest
-    if (offset > 0) {
-      while (offset < size) {
-        NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
-        if (*closed) return ncclSuccess;
-      }
-    // No bytes were received, return ncclInProgress
-    } else {
-      return ncclInProgress;
-    }
-  }
-  return ncclSuccess;
-}
-
-// Make it possible to close just one part of a socket.
-ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
-  if (sock != NULL) {
-    if (sock->fd >= 0) {
-      SYSCHECK(shutdown(sock->fd, how), "shutdown");
-    }
-    sock->state = ncclSocketStateTerminating;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) {
-  if (sock != NULL) {
-    if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) {
-      if (wait) {
-        char data;
-        int closed = 0;
-        do {
-          int offset = 0;
-          if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break;
-        } while (closed == 0);
-      }
-      /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected
-       * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
-       * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
-       * connection close here. */
-      (void)shutdown(sock->fd, SHUT_RDWR);
-      (void)close(sock->fd);
-    }
-    sock->state = ncclSocketStateClosed;
-    sock->fd = -1;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) {
-  if (sock == NULL) {
-    WARN("ncclSocketGetFd: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  if (fd) *fd = sock->fd;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) {
-  if (sock == NULL) {
-    WARN("ncclSocketGetFd: pass NULL socket");
-    return ncclInvalidArgument;
-  }
-  sock->fd = fd;
-  return ncclSuccess;
-}
+ #include "socket.h"
+ #include "utils.h"
+ #include <stdlib.h>
+ 
+ #include <unistd.h>
+ #include <ifaddrs.h>
+ #include <net/if.h>
+ #include "param.h"
+ #include <time.h>
+ 
+ NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34);
+ NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100);
+ static void msleep(unsigned int time_msec) {  // Sleep for time_msec milliseconds using nanosleep().
+   const long c_1e6 = 1e6;  // nanoseconds per millisecond
+   struct timespec tv = (struct timespec){
+       .tv_sec = time_msec / 1000,  // whole seconds
+       .tv_nsec = (time_msec % 1000) * c_1e6,  // remaining milliseconds, converted to nanoseconds
+   };
+   nanosleep(&tv, NULL);  // return value ignored and no remainder requested: an interrupted sleep is not resumed
+ }
+ 
+ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {  // Advance a send/recv of ptr[*offset..size); *closed reports a closed peer.
+   int bytes = 0;  // bytes moved by the most recent recv()/send() call
+   *closed = 0;  // set to 1 only on the "peer closed" paths below
+   char* data = (char*)ptr;
+   char line[SOCKET_NAME_MAXLEN+1];  // scratch buffer for the peer address in the WARN below
+   do {
+     if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+     if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
+     if (op == NCCL_SOCKET_RECV && bytes == 0) {  // recv() returning 0 is reported as a closed connection
+       *closed = 1;
+       return ncclSuccess;
+     }
+     if (bytes == -1) {
+       if ((op == NCCL_SOCKET_SEND && errno == EPIPE) || (op == NCCL_SOCKET_RECV && errno == ECONNRESET)) {  // also reported as closed, not as an error
+         *closed = 1;
+         return ncclSuccess;
+       }
+       if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {  // any other errno is a hard failure
+         WARN("socketProgressOpt: Call to %s %s failed : %s", (op == NCCL_SOCKET_RECV ? "recv from" : "send to"),
+              ncclSocketToString(&sock->addr, line), strerror(errno));
+         return ncclRemoteError;
+       } else {
+         bytes = 0;  // EINTR/EWOULDBLOCK/EAGAIN: no progress on this call, *offset stays put
+       }
+     }
+     (*offset) += bytes;
+     if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) {  // abort is checked after every call, even a successful one
+       INFO(NCCL_NET, "socketProgressOpt: abort called");
+       return ncclInternalError;
+     }
+   } while (sock->asyncFlag == 0 && bytes > 0 && (*offset) < size);  // loop only for non-async sockets that just made progress
+   return ncclSuccess;  // success does not imply completion: callers compare *offset with size
+ }
+ 
+ static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* pclosed = NULL) {  // Non-blocking socketProgressOpt(); a closed peer is an error unless pclosed is given.
+   int closed;  // always written by socketProgressOpt() before it is read here
+   NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
+   if (closed) {
+     if (pclosed) {  // caller asked to be told about a closed peer: report it and succeed
+       *pclosed = closed;
+       return ncclSuccess;
+     } else {  // caller cannot be told: a closed peer becomes ncclRemoteError
+       char line[SOCKET_NAME_MAXLEN+1];
+       WARN("socketProgress: Connection closed by remote peer %s",
+            ncclSocketToString(&sock->addr, line, /*numericHostForm*/0));
+       return ncclRemoteError;
+     }
+   }
+   return ncclSuccess;  // note: *pclosed is not written when the peer is still open
+ }
+ 
+ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {  // Repeat socketProgress() until *offset reaches size.
+   while (*offset < size)  // busy loop: each call is non-blocking and may make no progress
+     NCCLCHECK(socketProgress(op, sock, ptr, size, offset));  // no pclosed passed, so a closed peer returns ncclRemoteError
+   return ncclSuccess;
+ }
+ 
+ /* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
+  * into buf and return buf. On a NULL addr, a non-IPv4/IPv6 family or a getnameinfo() failure, buf is set to "".
+  * Output: "host<port>"; host is forced to numeric form when numericHostForm is non-zero. A NULL buf is returned as NULL.
+  */
+ const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
+   const struct sockaddr *saddr;
+   char host[NI_MAXHOST], service[NI_MAXSERV];
+   int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);  // the port is always numeric
+   if (buf == NULL || addr == NULL) goto fail;
+   saddr = &addr->sa;
+   if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) goto fail;
+   /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
+    * (When not set, this will still happen in case the node's name cannot be determined.)
+    */
+   if (getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag)) goto fail;
+   sprintf(buf, "%s<%s>", host, service);  // unbounded write: presumably buf holds SOCKET_NAME_MAXLEN+1 bytes as at the call sites in this file - TODO confirm
+   return buf;
+ fail:
+   if (buf)
+     buf[0] = '\0';
+   return buf;
+ }
+ 
+ static uint16_t socketToPort(union ncclSocketAddress *addr) {  // Return the port stored in addr, in host byte order.
+   struct sockaddr *saddr = &addr->sa;
+   return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);  // every family other than AF_INET is read as IPv6
+ }
+ 
+ /* Allow the user to force the IPv4/IPv6 interface selection via NCCL_SOCKET_FAMILY; returns AF_INET, AF_INET6 or -1 */
+ static int envSocketFamily(void) {
+   int family = -1; // Family selection is not forced, will use first one found
+   const char* env = ncclGetEnv("NCCL_SOCKET_FAMILY");
+   if (env == NULL)
+     return family;
+ 
+   INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
+ 
+   if (strcmp(env, "AF_INET") == 0)  // exact, case-sensitive match
+     family = AF_INET;  // IPv4
+   else if (strcmp(env, "AF_INET6") == 0)
+     family = AF_INET6; // IPv6
+   return family;  // any other value leaves family at -1 (not forced), without a warning
+ }
+ 
+ static ncclResult_t findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family,
+                                    int maxIfNameSize, int maxIfs, int* found) {  // Collect up to maxIfs IPv4/IPv6 interfaces selected by prefixList.
+ #ifdef ENABLE_TRACE
+   char line[SOCKET_NAME_MAXLEN+1];
+ #endif
+   struct netIf userIfs[MAX_IFS];
+   bool searchNot = prefixList && prefixList[0] == '^';  // leading '^' inverts the match
+   if (searchNot) prefixList++;
+   bool searchExact = prefixList && prefixList[0] == '=';  // leading '=' (after an optional '^') is passed to matchIfList as searchExact
+   if (searchExact) prefixList++;
+   int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
+ 
+   *found = 0;
+   struct ifaddrs *interfaces, *interface;
+   SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
+   for (interface = interfaces; interface && *found < maxIfs; interface = interface->ifa_next) {  // stop once maxIfs entries are stored
+     if (interface->ifa_addr == NULL) continue;
+ 
+     /* We only support IPv4 & IPv6 */
+     int family = interface->ifa_addr->sa_family;
+     if (family != AF_INET && family != AF_INET6)
+       continue;
+ 
+     TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line));
+ 
+     /* Allow the caller to force the socket family type */
+     if (sock_family != -1 && family != sock_family)
+       continue;
+ 
+     /* We also need to skip IPv6 loopback interfaces */
+     if (family == AF_INET6) {
+       struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
+       if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+     }
+ 
+     // check against user specified interfaces
+     if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {  // keep when matched XOR inverted
+       continue;
+     }
+ 
+     // Check that this interface has not already been saved
+     // getifaddrs() normal order appears to be: IPv4, IPv6 Global, IPv6 Link
+     bool duplicate = false;
+     for (int i = 0; i < *found; i++) {
+       if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
+     }
+ 
+     if (!duplicate) {  // only the first address seen for a given interface name is kept
+       // Store the interface name
+       strncpy(names + (*found)*maxIfNameSize, interface->ifa_name, maxIfNameSize);  // NOTE(review): not NUL-terminated if the name fills maxIfNameSize bytes
+       // Store the IP address
+       int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+       memset(addrs + *found, '\0', sizeof(*addrs));  // zero the whole union so bytes past salen are defined
+       memcpy(addrs + *found, interface->ifa_addr, salen);
+       (*found)++;
+     }
+   }
+ 
+   freeifaddrs(interfaces);
+   return ncclSuccess;  // succeeds even when *found is 0
+ }
+ 
+ static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) {  // True when remote is in the subnet of local_if (local address masked by its netmask).
+   /* Families must match and a netmask must be present, otherwise no subnet comparison is possible */
+   int family = local_if.ifa_addr->sa_family;  // callers in this file check ifa_addr for NULL before calling
+   if (family != remote->sa.sa_family || local_if.ifa_netmask == NULL) {  // ifa_netmask may be NULL: do not dereference it below
+     return false;
+   }
+ 
+   if (family == AF_INET) {
+     struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
+     struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
+     struct sockaddr_in& remote_addr = remote->sin;
+     struct in_addr local_subnet, remote_subnet;
+     local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
+     remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;  // the local netmask is applied to both sides
+     return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
+   } else if (family == AF_INET6) {
+     struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
+     struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
+     struct sockaddr_in6& remote_addr = remote->sin6;
+     struct in6_addr& local_in6 = local_addr->sin6_addr;
+     struct in6_addr& mask_in6 = mask->sin6_addr;
+     struct in6_addr& remote_in6 = remote_addr.sin6_addr;
+     bool same = true;
+     int len = 16;  // an IPv6 address is 16 unsigned chars
+     for (int c = 0; c < len; c++) {  // network byte order is big-endian; compare the masked bytes one by one
+       char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
+       char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
+       if (c1 ^ c2) {
+         same = false;
+         break;
+       }
+     }
+     // At last, we need to compare scope id
+     // Two Link-type addresses can have the same subnet address even though they are not in the same scope
+     // For Global type, this field is 0, so a comparison wouldn't matter
+     same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
+     return same;
+   } else {
+     INFO(NCCL_NET, "Net : Unsupported address family type");
+     return false;
+   }
+ }
+ 
+ ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
+                                           union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found) {  // Find the first local interface in remoteAddr's subnet.
+ #ifdef ENABLE_TRACE
+   char line[SOCKET_NAME_MAXLEN+1];
+   char line_a[SOCKET_NAME_MAXLEN+1];
+ #endif
+   *found = 0;
+   struct ifaddrs *interfaces, *interface;
+   SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
+   for (interface = interfaces; interface && !*found; interface = interface->ifa_next) {  // stops at the first match
+     if (interface->ifa_addr == NULL) continue;
+ 
+     /* We only support IPv4 & IPv6 */
+     int family = interface->ifa_addr->sa_family;
+     if (family != AF_INET && family != AF_INET6)
+       continue;
+ 
+     // skip interfaces that are not in the same subnet as remoteAddr
+     if (!matchSubnet(*interface, remoteAddr)) {
+       continue;
+     }
+ 
+     // Store the local IP address
+     int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+     memcpy(localAddr, interface->ifa_addr, salen);  // only salen bytes are written; the rest of *localAddr is left untouched
+ 
+     // Store the interface name
+     strncpy(ifName, interface->ifa_name, ifNameMaxSize);  // NOTE(review): not NUL-terminated if the name fills ifNameMaxSize bytes
+ 
+     TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s",
+           interface->ifa_name, ncclSocketToString(localAddr, line), ncclSocketToString(remoteAddr, line_a));
+     *found = 1;
+   }
+ 
+   freeifaddrs(interfaces);
+   return ncclSuccess;  // succeeds even when nothing matched; callers must check *found
+ }
+ 
+ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
+   if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
+     WARN("Net : string is null");
+     return ncclInvalidArgument;
+   }
+   // Fills *ua from "<ipv4_or_hostname>:<port>" or, when the text starts with '[', from "[<ipv6>]:<port>" / "[<ipv6>%<ifname>]:<port>".
+   bool ipv6 = ip_port_pair[0] == '[';
+   /* Construct the sockaddress structure */
+   if (!ipv6) {
+     struct netIf ni;
+     // parse <ip_or_hostname>:<port> string, expect one pair
+     if (parseStringList(ip_port_pair, &ni, 1) != 1) {
+       WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
+       return ncclInvalidArgument;
+     }
+ 
+     struct addrinfo hints, *p;
+     int rv;
+     memset(&hints, 0, sizeof(hints));
+     hints.ai_family = AF_UNSPEC;
+     hints.ai_socktype = SOCK_STREAM;
+ 
+     if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
+       WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
+       return ncclInvalidArgument;
+     }
+ 
+     // use the first
+     if (p->ai_family == AF_INET) {
+       struct sockaddr_in& sin = ua->sin;
+       memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
+       sin.sin_family = AF_INET;                        // IPv4
+       //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr));  // IP address
+       sin.sin_port = htons(ni.port);                   // port
+     } else if (p->ai_family == AF_INET6) {
+       struct sockaddr_in6& sin6 = ua->sin6;
+       memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
+       sin6.sin6_family = AF_INET6;                     // IPv6
+       sin6.sin6_port = htons(ni.port);                 // port
+       sin6.sin6_flowinfo = 0;                          // needed by IPv6, but possibly obsolete
+       sin6.sin6_scope_id = 0;                          // should be global scope, set to 0
+     } else {
+       WARN("Net : unsupported IP family");
+       freeaddrinfo(p);
+       return ncclInvalidArgument;
+     }
+ 
+     freeaddrinfo(p); // all done with this structure
+ 
+   } else {
+     int i, j = -1, len = strlen(ip_port_pair);
+     for (i = 1; i < len; i++) {
+       if (ip_port_pair[i] == '%') j = i;
+       if (ip_port_pair[i] == ']') break;
+     }
+     bool global_scope = (j == -1 ? true : false);     // If no % found, global scope; otherwise, link scope
+ 
+     // The text can be arbitrarily long (e.g. taken from the environment): size every field before copying it.
+     int ipLen = global_scope ? i-1 : j-1, ifLen = global_scope ? 0 : i-j-1, portLen = (len-i-2 > 0) ? len-i-2 : 0;
+     char ip_str[NI_MAXHOST] = {0}, port_str[NI_MAXSERV] = {0}, if_name[IFNAMSIZ] = {0};
+     if (i == len || ipLen >= (int)sizeof(ip_str) || portLen >= (int)sizeof(port_str) || ifLen >= (int)sizeof(if_name)) {
+       WARN("Net : No valid [IPv6]:port pair found");
+       return ncclInvalidArgument;
+     }
+     memcpy(ip_str, ip_port_pair+1, ipLen);                         // buffers are zero-filled, so the copies stay terminated
+     if (portLen > 0) memcpy(port_str, ip_port_pair+i+2, portLen);  // skips the "]:" that follows the address
+     if (!global_scope) memcpy(if_name, ip_port_pair+j+1, ifLen);   // If not global scope, we need the intf name
+     struct sockaddr_in6& sin6 = ua->sin6;
+     sin6.sin6_family = AF_INET6;                       // IPv6
+     if (inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)) != 1) {   // IP address; anything but a valid IPv6 literal is refused
+       WARN("Net : invalid IPv6 address %s", ip_str);
+       return ncclInvalidArgument;
+     }
+     sin6.sin6_port = htons(atoi(port_str));            // port (0 when no port text was given)
+     sin6.sin6_flowinfo = 0;                            // needed by IPv6, but possibly obsolete
+     sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name);  // 0 if global scope; intf index if link scope
+   }
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
+                                 int* nIfs) {
+   // Fills ifNames/ifAddrs and sets *nIfs, honouring NCCL_SOCKET_IFNAME when it is set and otherwise
+   // walking a fixed list of candidates until one of them yields at least one interface.
+   static int shownIfName = 0;
+   // The user may force the INET socket family
+   const int family = envSocketFamily();
+   const char* userIfs = ncclGetEnv("NCCL_SOCKET_IFNAME");
+   *nIfs = 0;
+ 
+   if (userIfs != NULL && strlen(userIfs) > 1) {
+     // Explicit user choice: find or fail, no automatic fallback
+     INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", userIfs);
+     if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", userIfs);
+     NCCLCHECK(findInterfaces(userIfs, ifNames, ifAddrs, family, ifNameMaxSize, maxIfs, nIfs));
+     return ncclSuccess;
+   }
+ 
+   // Automatic selection starts with IB
+   NCCLCHECK(findInterfaces("ib", ifNames, ifAddrs, family, ifNameMaxSize, maxIfs, nIfs));
+   // Still nothing: NCCL_COMM_ID, when set, carries an address whose subnet can point at an interface
+   const char* commId = (*nIfs == 0) ? ncclGetEnv("NCCL_COMM_ID") : NULL;
+   if (commId != NULL && strlen(commId) > 1) {
+     union ncclSocketAddress idAddr;
+     INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
+     NCCLCHECK(ncclSocketGetAddrFromString(&idAddr, commId));
+     NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs));
+   }
+ 
+   // Remaining candidates, in order: anything but docker/lo, then docker, then lo
+   const char* const fallbacks[] = { "^docker,lo", "docker", "lo" };
+   for (int f = 0; f < 3 && *nIfs == 0; f++) {
+     NCCLCHECK(findInterfaces(fallbacks[f], ifNames, ifAddrs, family, ifNameMaxSize, maxIfs, nIfs));
+   }
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclSocketListen(struct ncclSocket* sock) { // Binds sock->fd to sock->addr, stores the bound address back, and starts listening.
+   if (sock == NULL) {
+     WARN("ncclSocketListen: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (sock->fd == -1) {
+     WARN("ncclSocketListen: file descriptor is -1");
+     return ncclInvalidArgument;
+   }
+ 
+   if (socketToPort(&sock->addr)) {
+     // Port is forced by env. Make sure we get the port.
+     int opt = 1;
+     SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+ #if defined(SO_REUSEPORT)
+     SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+ #endif
+   }
+ 
+   // Unless a port was forced above, addr port is expected to be 0 (Any port)
+   SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind");
+ 
+   /* Get the assigned Port: getsockname() rewrites sock->addr with the address actually bound */
+   socklen_t size = sock->salen;
+   SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname");
+ 
+ #ifdef ENABLE_TRACE
+   char line[SOCKET_NAME_MAXLEN+1];
+   TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line));
+ #endif
+ 
+   /* Put the socket in listen mode
+    * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+    */
+   SYSCHECK(listen(sock->fd, 16384), "listen");
+   sock->state = ncclSocketStateReady;   // ncclSocketAccept() requires the listen socket to be in this state
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) { // Copies the socket's address into *addr; only allowed in the Ready state.
+   if (sock == NULL) {
+     WARN("ncclSocketGetAddr: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   const bool ready = (sock->state == ncclSocketStateReady);
+   if (ready) memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress));
+   return ready ? ncclSuccess : ncclInternalError;
+ }
+ 
+ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { // One accept() attempt on sock->acceptFd; EAGAIN/EWOULDBLOCK simply mean "nothing to accept yet".
+   socklen_t socklen = sizeof(union ncclSocketAddress);
+   sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
+   const int err = errno; // snapshot right away: the parameter lookup and logging below are free to overwrite errno
+   if (sock->fd != -1) {
+     sock->state = ncclSocketStateAccepted;
+   } else if (err == ENETDOWN || err == EPROTO || err == ENOPROTOOPT || err == EHOSTDOWN ||
+              err == ENONET || err == EHOSTUNREACH || err == EOPNOTSUPP || err == ENETUNREACH || err == EINTR) {
+     /* per accept's man page, for linux sockets, the following errors might be already pending errors
+      * and should be considered as EAGAIN. To avoid infinite loop in case of errors, we use the retry count*/
+     if (++sock->errorRetries == ncclParamRetryCnt()) {
+       WARN("socketTryAccept: exceeded error retry count after %d attempts, %s", sock->errorRetries, strerror(err));
+       return ncclSystemError;
+     }
+     INFO(NCCL_NET|NCCL_INIT, "Call to accept returned %s, retrying", strerror(err));
+   } else if (err != EAGAIN && err != EWOULDBLOCK) { // EINTR never reaches this point, it is retried by the branch above
+     WARN("socketTryAccept: Accept failed: %s", strerror(err));
+     return ncclSystemError;
+   }
+   return ncclSuccess;
+ }
+ 
+ NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1); // read through ncclParamSocketMaxRecvBuff(); with the -1 default SO_RCVBUF is left untouched
+ NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1); // read through ncclParamSocketMaxSendBuff(); with the -1 default SO_SNDBUF is left untouched
+ 
+ static ncclResult_t socketSetFlags(struct ncclSocket* sock) { // fd setup: O_NONBLOCK when required, TCP_NODELAY, optional buffer sizes.
+   const bool wantNonBlocking = (sock->asyncFlag || sock->abortFlag);   /* async sockets and abortable sockets must not block */
+   if (wantNonBlocking && sock->fd >= 0) {
+     int curFlags;
+     SYSCHECK(curFlags = fcntl(sock->fd, F_GETFL), "fcntl");
+     SYSCHECK(fcntl(sock->fd, F_SETFL, curFlags | O_NONBLOCK), "fcntl");
+   }
+   const int enable = 1;
+   SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&enable, sizeof(int)), "setsockopt TCP NODELAY");
+   // Buffer sizes are only changed when the user supplied a positive value; the unset default is -1
+   int rcvBuf = ncclParamSocketMaxRecvBuff(), sndBuf = ncclParamSocketMaxSendBuff();
+   if (sndBuf > 0) { SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF"); }
+   if (rcvBuf > 0) { SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF"); }
+   return ncclSuccess;
+ }
+ 
+ static void socketResetAccept(struct ncclSocket* sock) { // Drops the connection just accepted and puts the socket back in the Accepting state.
+   char peer[SOCKET_NAME_MAXLEN+1];
+   INFO(NCCL_NET|NCCL_INIT, "socketFinalizeAccept: didn't receive a valid magic from %s",
+        ncclSocketToString(&sock->addr, peer));
+   // The connection is treated as spurious: close it, forget the handshake progress, accept again
+   (void)close(sock->fd);
+   sock->fd = -1;
+   sock->finalizeCounter = 0;
+   sock->state = ncclSocketStateAccepting;
+ }
+ 
+ static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { // Accept-side handshake: validate the peer's magic, then its socket type.
+   uint64_t magic;
+   enum ncclSocketType type;
+   int received;
+   char line[SOCKET_NAME_MAXLEN+1];
+   // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do)
+   NCCLCHECK(socketSetFlags(sock));
+ 
+   if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) {
+     if (sock->asyncFlag == 0) {
+       received = 0;
+       if (socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received) != ncclSuccess) {
+         // Could not read the magic: treat the connection as spurious and go back to accepting
+         socketResetAccept(sock);
+         return ncclSuccess;
+       }
+     } else {
+       int closed = 0;
+       received = sock->finalizeCounter;   // finalizeCounter = handshake bytes received so far (magic first, then type)
+       NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received, &closed));
+       sock->finalizeCounter = received;
+       if (received < sizeof(magic)) {
+         if (closed) {
+           // Connection closed with the magic still incomplete: drop it and accept again
+           socketResetAccept(sock);
+         }
+         return ncclSuccess;
+       }
+       memcpy(&magic, sock->finalizeBuffer, sizeof(magic));
+     }
+     if (magic != sock->magic) {
+       // Wrong magic: this connection is not for us. Never adopt the peer's value, doing so would
+       // let any process that can reach the listening port complete the handshake.
+       socketResetAccept(sock);
+       return ncclSuccess;
+     }
+   }
+   if (sock->asyncFlag == 0) {
+     received = 0;
+     NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
+   } else {
+     received = sock->finalizeCounter - sizeof(magic);   // bytes of `type` already received
+     NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(type), &received));
+     sock->finalizeCounter = received + sizeof(magic);
+     if (received < sizeof(type)) return ncclSuccess;    // incomplete: resume on the next call
+     memcpy(&type, sock->finalizeBuffer, sizeof(type));
+   }
+   if (type != sock->type) {
+     WARN("socketFinalizeAccept from %s: wrong type %d != %d", ncclSocketToString(&sock->addr, line), type, sock->type);
+     sock->state = ncclSocketStateError;
+     close(sock->fd);
+     sock->fd = -1;
+     return ncclInternalError;
+   } else {
+     sock->state = ncclSocketStateReady;
+   }
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t socketResetFd(struct ncclSocket* sock) { // Gives sock a fresh TCP socket, keeping the same fd number when it already had one.
+   ncclResult_t ret = ncclSuccess;
+   int fd = -1;
+   SYSCHECKGOTO(fd = socket(sock->addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, cleanup);
+   // if sock->fd is valid, close it and reuse its number
+   if (sock->fd != -1) {
+     SYSCHECKGOTO(dup2(fd, sock->fd), "dup2", ret, cleanup);   // sock->fd now refers to the new socket
+     SYSCHECKGOTO(close(fd), "close", ret, cleanup);            // the temporary descriptor is no longer needed
+   } else {
+     sock->fd = fd;
+   }
+   NCCLCHECKGOTO(socketSetFlags(sock), ret, exit);   // the new socket needs its flags and options applied again
+ exit:
+   return ret;
+ cleanup:
+   // cleanup fd, leave sock->fd untouched
+   if (fd != -1) {
+     (void)close(fd);
+   }
+   goto exit;
+ }
+ 
+ static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) { // Turns a connect() outcome (0 or an errno value) into the next socket state; funcName only labels the log lines.
+   char line[SOCKET_NAME_MAXLEN+1];
+   if (errCode == 0) {
+     sock->state = ncclSocketStateConnected;
+   } else if (errCode == EINPROGRESS) {
+     sock->state = ncclSocketStateConnectPolling;   // completion is then checked by socketPollConnect()
+   } else if (errCode == EINTR || errCode == EWOULDBLOCK || errCode == EAGAIN || errCode == ETIMEDOUT ||
+              errCode == EHOSTUNREACH || errCode == ECONNREFUSED) {
+     if (sock->customRetry == 0) {   // with customRetry set, neither the retry limit nor the back-off sleep applies
+       if (sock->errorRetries++ == ncclParamRetryCnt()) {
+         sock->state = ncclSocketStateError;
+         WARN("%s: connect to %s returned %s, exceeded error retry count after %d attempts",
+              funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), sock->errorRetries);
+         return ncclRemoteError;
+       }
+       unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut();   // back-off grows linearly with the attempt count
+       INFO(NCCL_NET|NCCL_INIT, "%s: connect to %s returned %s, retrying (%d/%ld) after sleep for %u msec",
+            funcName, ncclSocketToString(&sock->addr, line), strerror(errCode),
+            sock->errorRetries, ncclParamRetryCnt(), sleepTime);
+       msleep(sleepTime);
+     }
+     NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */
+     sock->state = ncclSocketStateConnecting;   // socketProgressState() issues a new connect() from this state
+   } else {
+     sock->state = ncclSocketStateError;   // any other error code is not retried
+     WARN("%s: connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode));
+     return ncclSystemError;
+   }
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
+   /* Whether this connect() blocks is determined by asyncFlag; socketConnectCheck() classifies the outcome. */
+   const bool failed = (connect(sock->fd, &sock->addr.sa, sock->salen) == -1);
+   return socketConnectCheck(sock, failed ? errno : 0, __func__);
+ }
+ 
+ static ncclResult_t socketPollConnect(struct ncclSocket* sock) { // Polls once (1 ms) for completion of a pending connect().
+   struct pollfd waitFd;
+   memset(&waitFd, 0, sizeof(struct pollfd));
+   waitFd.fd = sock->fd;
+   waitFd.events = POLLOUT;
+ 
+   const int timeoutMs = 1;
+   const int nReady = poll(&waitFd, 1, timeoutMs);
+   if (nReady < 0) {
+     if (errno == EINTR) return ncclSuccess;   // interrupted: not an error, the caller polls again
+     char line[SOCKET_NAME_MAXLEN+1];
+     WARN("socketPollConnect to %s failed with error %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+     return ncclSystemError;
+   }
+   if (nReady == 0) return ncclSuccess;        // timed out: the connect is still in progress
+ 
+   /* poll reported an event: fetch the pending socket error and classify it */
+   int soError = 0;
+   socklen_t optLen = sizeof(int);
+   SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&soError, &optLen), "getsockopt");
+   return socketConnectCheck(sock, soError, __func__);
+ }
+ 
+ ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) { // NULL-checked public entry point for socketPollConnect().
+   if (sock != NULL) {
+     NCCLCHECK(socketPollConnect(sock));
+     return ncclSuccess;
+   }
+   WARN("ncclSocketPollConnect: pass NULL socket");
+   return ncclInvalidArgument;
+ }
+ 
+ static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) { // Connect-side handshake: send our magic, then our socket type; the socket is Ready afterwards.
+   int sent;
+   if (sock->asyncFlag == 0) {
+     sent = 0;
+     NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
+     sent = 0;
+     NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+   } else {
+     if (sock->finalizeCounter < sizeof(sock->magic)) {   // finalizeCounter = handshake bytes already sent, magic first
+       sent = sock->finalizeCounter;
+       NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
+       sock->finalizeCounter = sent;
+       if (sent < sizeof(sock->magic)) return ncclSuccess;   // partial send: resume on the next call
+     }
+     sent = sock->finalizeCounter - sizeof(sock->magic);   // bytes of `type` already sent
+     NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+     sock->finalizeCounter = sent + sizeof(sock->magic);
+     if (sent < sizeof(sock->type)) return ncclSuccess;     // partial send: resume on the next call
+   }
+   sock->state = ncclSocketStateReady;
+   return ncclSuccess;
+ }
+ 
+ static ncclResult_t socketProgressState(struct ncclSocket* sock) { // Advances the accept/connect state machine; the checks run in order, so one call may cross several states.
+   if (sock->state == ncclSocketStateAccepting) {
+     NCCLCHECK(socketTryAccept(sock));
+   }
+   if (sock->state == ncclSocketStateAccepted) {
+     NCCLCHECK(socketFinalizeAccept(sock));   // receives and checks magic + type
+   }
+   if (sock->state == ncclSocketStateConnecting) {
+     NCCLCHECK(socketStartConnect(sock));
+   }
+   if (sock->state == ncclSocketStateConnectPolling) {
+     NCCLCHECK(socketPollConnect(sock));
+   }
+   if (sock->state == ncclSocketStateConnected) {
+     NCCLCHECK(socketFinalizeConnect(sock));  // sends magic + type
+   }
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) { // Sets *running to 1 when the socket is Ready, after one progress attempt if it was not.
+   if (sock == NULL) {
+     *running = 0;
+     return ncclSuccess;
+   }
+   const bool unusable = (sock->state == ncclSocketStateError) || (sock->state == ncclSocketStateClosed);
+   if (unusable) {
+     WARN("ncclSocketReady: unexpected socket state %d", sock->state);
+     return ncclRemoteError;
+   }
+   *running = 0;
+   // Not ready yet: let the state machine advance once before answering
+   if (sock->state != ncclSocketStateReady) { NCCLCHECK(socketProgressState(sock)); }
+   if (sock->state == ncclSocketStateReady) *running = 1;
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { // Starts connecting an Initialized socket to sock->addr and drives the connect-side handshake.
+ #ifdef ENABLE_TRACE
+   char line[SOCKET_NAME_MAXLEN+1];
+ #endif
+ 
+   if (sock == NULL) {
+     WARN("ncclSocketConnect: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (sock->fd == -1) {
+     WARN("ncclSocketConnect: file descriptor is -1");
+     return ncclInvalidArgument;
+   }
+ 
+   if (sock->state != ncclSocketStateInitialized) {   // only a socket in the Initialized state may be connected
+     WARN("ncclSocketConnect: wrong socket state %d", sock->state);
+     if (sock->state == ncclSocketStateError) return ncclRemoteError;
+     return ncclInternalError;
+   }
+   TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
+ 
+   sock->state = ncclSocketStateConnecting;
+   sock->finalizeCounter = 0;
+   do {   // asyncFlag == 0: loop until the connection leaves the in-progress states or an abort is flagged; otherwise take one step
+     NCCLCHECK(socketProgressState(sock));
+   } while (sock->asyncFlag == 0 &&
+       (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) &&
+       (sock->state == ncclSocketStateConnecting ||
+        sock->state == ncclSocketStateConnectPolling ||
+        sock->state == ncclSocketStateConnected));
+ 
+   if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;   // abort requested
+ 
+   switch (sock->state) {
+     case ncclSocketStateConnecting:       // the three in-progress states are reported as success as well
+     case ncclSocketStateConnectPolling:
+     case ncclSocketStateConnected:
+     case ncclSocketStateReady:
+       return ncclSuccess;
+     case ncclSocketStateError:
+       return ncclSystemError;
+     default:
+       WARN("ncclSocketConnect: wrong socket state %d", sock->state);
+       return ncclInternalError;
+   }
+ }
+ 
+ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) { // Accepts one connection from listenSock into sock and drives the accept-side handshake.
+   ncclResult_t ret = ncclSuccess;
+ 
+   if (listenSock == NULL || sock == NULL) {
+     WARN("ncclSocketAccept: pass NULL socket");
+     ret = ncclInvalidArgument;
+     goto exit;
+   }
+   if (listenSock->state != ncclSocketStateReady) {   // the listen socket must be in the Ready state
+     WARN("ncclSocketAccept: wrong socket state %d", listenSock->state);
+     if (listenSock->state == ncclSocketStateError)
+       ret = ncclSystemError;
+     else
+       ret = ncclInternalError;
+     goto exit;
+   }
+ 
+   if (sock->acceptFd == -1) {   // no accept started on this sock yet: copy the listen socket and start in the Accepting state
+     memcpy(sock, listenSock, sizeof(struct ncclSocket));
+     sock->acceptFd = listenSock->fd;
+     sock->state = ncclSocketStateAccepting;
+     sock->finalizeCounter = 0;
+   }
+ 
+   do {   // asyncFlag == 0: loop until the accept leaves the in-progress states or an abort is flagged; otherwise take one step
+     NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
+   } while (sock->asyncFlag == 0 &&
+       (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) &&
+       (sock->state == ncclSocketStateAccepting ||
+        sock->state == ncclSocketStateAccepted));
+ 
+   if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;   // abort requested
+ 
+   switch (sock->state) {
+     case ncclSocketStateAccepting:   // the two in-progress states are reported as success as well
+     case ncclSocketStateAccepted:
+     case ncclSocketStateReady:
+       ret = ncclSuccess;
+       break;
+     case ncclSocketStateError:
+       ret = ncclSystemError;
+       break;
+     default:
+       WARN("ncclSocketAccept: wrong socket state %d", sock->state);
+       ret = ncclInternalError;
+       break;
+   }
+ 
+ exit:
+   return ret;
+ }
+ 
+ ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry) {
+   ncclResult_t ret = ncclSuccess;
+   // Resets *sock to the Initialized state; when addr is given, also stores it and creates the underlying TCP socket.
+   if (sock == NULL) goto exit;   // a NULL socket is silently accepted and ncclSuccess is returned
+   sock->errorRetries = 0;
+   sock->abortFlag = abortFlag;
+   sock->asyncFlag = asyncFlag;
+   sock->state = ncclSocketStateInitialized;
+   sock->magic = magic;
+   sock->type = type;
+   sock->fd = -1;
+   sock->acceptFd = -1;
+   sock->customRetry = customRetry;   // finalizeCounter is not set here; ncclSocketConnect()/ncclSocketAccept() reset it
+ 
+   if (addr) {
+     /* IPv4/IPv6 support */
+     int family;
+     memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
+     family = sock->addr.sa.sa_family;
+     if (family != AF_INET && family != AF_INET6) {
+       char line[SOCKET_NAME_MAXLEN+1];
+       WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
+           ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+       ret = ncclInternalError;
+       goto exit;
+     }
+     sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+     // in case of error, we close the fd before returning as it's unclear if the caller has to use ncclSocketClose for cleanup
+     NCCLCHECKGOTO(socketResetFd(sock), ret, fail);
+   } else {
+     memset(&sock->addr, 0, sizeof(union ncclSocketAddress));   // no address: fd stays -1 and salen is left as it was
+   }
+ exit:
+   return ret;
+ fail:
+   if (sock->fd != -1) {
+     close(sock->fd);
+     sock->fd = -1;
+   }
+   goto exit;
+ }
+ 
+ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed) { // NULL-checked wrapper around socketProgress().
+   if (sock != NULL) {
+     NCCLCHECK(socketProgress(op, sock, ptr, size, offset, closed));
+     return ncclSuccess;
+   }
+   WARN("ncclSocketProgress: pass NULL socket");
+   return ncclInvalidArgument;
+ }
+ 
+ ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { // NULL-checked wrapper around socketWait().
+   if (sock != NULL) {
+     NCCLCHECK(socketWait(op, sock, ptr, size, offset));
+     return ncclSuccess;
+   }
+   WARN("ncclSocketWait: pass NULL socket");
+   return ncclInvalidArgument;
+ }
+ 
+ ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) { // Hands the whole buffer to socketWait(); the socket must be in the Ready state.
+   if (sock == NULL) {
+     WARN("ncclSocketSend: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (sock->state == ncclSocketStateReady) {
+     int sentBytes = 0;
+     NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &sentBytes));
+     return ncclSuccess;
+   }
+   WARN("ncclSocketSend: socket state (%d) is not ready", sock->state);
+   return ncclInternalError;
+ }
+ 
+ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) { // Receives via socketWait(); allowed in the Ready and Terminating states.
+   if (sock == NULL) {
+     WARN("ncclSocketRecv: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (!(sock->state == ncclSocketStateReady || sock->state == ncclSocketStateTerminating)) {
+     WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
+     return ncclInternalError;
+   }
+   int recvBytes = 0;
+   NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &recvBytes));
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize) {
+   if (sendSock == NULL || recvSock == NULL) {
+     WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock);
+     return ncclInternalError;
+   }
+   if (!(sendSock->state == ncclSocketStateReady && (recvSock->state == ncclSocketStateReady || recvSock->state == ncclSocketStateTerminating))) {
+     WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state);
+     return ncclInternalError;
+   }
+   // Alternate between the two directions until both transfers are complete
+   int sent = 0, received = 0;
+   while (sent < sendSize || received < recvSize) {
+     if (sent < sendSize) { NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sendSock, sendPtr, sendSize, &sent)); }
+     if (received < recvSize) { NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, recvSock, recvPtr, recvSize, &received)); }
+   }
+   return ncclSuccess;
+ }
+ 
+ // Receive `size` bytes into ptr, or detect that the connection was closed.
+ //   blocking == true : keep receiving until the message is complete.
+ //   blocking == false: probe the socket once; if no byte arrived return ncclInProgress,
+ //                      otherwise stay until the rest of the message has been received.
+ // Whenever the connection is found closed, ncclSuccess is returned with *closed set.
+ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
+   if (sock == NULL) {
+     WARN("ncclSocketTryRecv: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+ 
+   int got = 0;
+   *closed = 0;
+ 
+   if (!blocking) {
+     // Single probe of the socket
+     NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &got, 0, closed));
+     if (*closed) return ncclSuccess;
+     // No bytes were received, return ncclInProgress
+     if (got <= 0) return ncclInProgress;
+   }
+ 
+   // Reached when blocking was requested, or when a non-blocking probe already received
+   // the beginning of the message: wait for the remaining bytes.
+   while (got < size) {
+     NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &got, 0, closed));
+     if (*closed) return ncclSuccess;
+   }
+ 
+   return ncclSuccess;
+ }
+ 
+ // Make it possible to close just one part of a socket: `how` is passed straight to shutdown().
+ // The socket is marked Terminating afterwards; a NULL socket is accepted and ignored.
+ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
+   if (sock == NULL) return ncclSuccess;
+   if (sock->fd >= 0) {
+     SYSCHECK(shutdown(sock->fd, how), "shutdown");
+   }
+   sock->state = ncclSocketStateTerminating;
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) { // Shuts down and closes sock->fd (if any) and marks the socket Closed; always returns ncclSuccess.
+   if (sock != NULL) {
+     if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) {
+       if (wait) {   // read and discard incoming bytes until the peer closes or an error is reported
+         char data;
+         int closed = 0;
+         do {
+           int offset = 0;
+           if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break;
+         } while (closed == 0);
+       }
+       /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected
+        * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
+        * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
+        * connection close here. */
+       (void)shutdown(sock->fd, SHUT_RDWR);
+       (void)close(sock->fd);
+     }
+     sock->state = ncclSocketStateClosed;   // set even when there was no descriptor to close
+     sock->fd = -1;
+   }
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) { // Copies the raw descriptor into *fd; a NULL `fd` is tolerated.
+   if (sock != NULL) {
+     if (fd != NULL) *fd = sock->fd;
+     return ncclSuccess;
+   }
+   WARN("ncclSocketGetFd: pass NULL socket");
+   return ncclInvalidArgument;
+ }
+ 
+ ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) { // Overwrites sock->fd with `fd`; nothing is closed and the state is unchanged.
+   if (sock == NULL) {
+     WARN("ncclSocketSetFd: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   sock->fd = fd;
+   return ncclSuccess;
+ }
+ 
\ No newline at end of file
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 292a839..9cbdf36 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -1,8 +1,8 @@
 /*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
 
 #ifndef NCCL_H_
 #define NCCL_H_
@@ -37,16 +37,27 @@ typedef struct ncclWindow* ncclWindow_t;
 #define NCCL_UNIQUE_ID_BYTES 128
 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
 
+#define NCCL_NEW_RANK_INFO_BYTES (10 * 1024 * 1024) /* 10 MiB; presumably the size of the buffer behind ncclNewRankInfo.internal -- TODO confirm */
+typedef struct alignas(16) { /* NOTE(review): alignas is C++ syntax; confirm this header is never compiled as C */
+char* internal; /* opaque payload pointer; NOTE(review): who allocates and frees it is not stated here */
+} ncclNewRankInfo;
+
+#define NCCL_COMM_INFO_BYTES (10 * 1024 * 1024) /* 10 MiB; presumably the size of the buffer behind ncclCommInfo.internal -- TODO confirm */
+typedef struct alignas(16) { /* NOTE(review): alignas is C++ syntax; confirm this header is never compiled as C */
+    // Commented-out inline-array alternative to the pointer below: char internal[NCCL_COMM_INFO_BYTES];
+    char* internal; /* opaque payload pointer; NOTE(review): who allocates and frees it is not stated here */
+} ncclCommInfo;
+
 /* Error type */
 typedef enum { ncclSuccess                 =  0,
-               ncclUnhandledCudaError      =  1,
-               ncclSystemError             =  2,
-               ncclInternalError           =  3,
-               ncclInvalidArgument         =  4,
-               ncclInvalidUsage            =  5,
-               ncclRemoteError             =  6,
-               ncclInProgress              =  7,
-               ncclNumResults              =  8 } ncclResult_t;
+            ncclUnhandledCudaError      =  1,
+            ncclSystemError             =  2,
+            ncclInternalError           =  3,
+            ncclInvalidArgument         =  4,
+            ncclInvalidUsage            =  5,
+            ncclRemoteError             =  6,
+            ncclInProgress              =  7,
+            ncclNumResults              =  8 } ncclResult_t;
 
 #define NCCL_CONFIG_UNDEF_INT INT_MIN
 #define NCCL_CONFIG_UNDEF_PTR NULL
@@ -66,45 +77,45 @@ typedef enum { ncclSuccess                 =  0,
 #define NCCL_SHRINK_ABORT 0x01   /* First, terminate ongoing parent operations, and then shrink the parent communicator */
 
 /* Communicator configuration. Users can assign value to attributes to specify the
- * behavior of a communicator. */
+* behavior of a communicator. */
 typedef struct ncclConfig_v22700 {
-  /* attributes that users should never touch. */
-  size_t size;
-  unsigned int magic;
-  unsigned int version;
-  /* attributes that users are able to customize. */
-  int blocking;
-  int cgaClusterSize;
-  int minCTAs;
-  int maxCTAs;
-  const char *netName;
-  int splitShare;
-  int trafficClass;
-  const char *commName;
-  int collnetEnable;
-  int CTAPolicy;
-  int shrinkShare;
-  int nvlsCTAs;
+/* attributes that users should never touch. */
+size_t size;
+unsigned int magic;
+unsigned int version;
+/* attributes that users are able to customize. */
+int blocking;
+int cgaClusterSize;
+int minCTAs;
+int maxCTAs;
+const char *netName;
+int splitShare;
+int trafficClass;
+const char *commName;
+int collnetEnable;
+int CTAPolicy;
+int shrinkShare;
+int nvlsCTAs;
 } ncclConfig_t;
 
 /* Config initializer must be assigned to initialize config structure when it is created.
- * Not initialized config will result in NCCL error. */
+* Not initialized config will result in NCCL error. */
 #define NCCL_CONFIG_INITIALIZER {                                       \
-  sizeof(ncclConfig_t), /* size */                                      \
-  0xcafebeef,           /* magic */                                     \
-  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
-  NCCL_CONFIG_UNDEF_INT,                    /* blocking */              \
-  NCCL_CONFIG_UNDEF_INT,                    /* cgaClusterSize */        \
-  NCCL_CONFIG_UNDEF_INT,                    /* minCTAs */               \
-  NCCL_CONFIG_UNDEF_INT,                    /* maxCTAs */               \
-  NCCL_CONFIG_UNDEF_PTR,                    /* netName */               \
-  NCCL_CONFIG_UNDEF_INT,                    /* splitShare */            \
-  NCCL_CONFIG_UNDEF_INT,                    /* trafficClass */          \
-  NCCL_CONFIG_UNDEF_PTR,                    /* commName */              \
-  NCCL_CONFIG_UNDEF_INT,                    /* collnetEnable */         \
-  NCCL_CONFIG_UNDEF_INT,                    /* CTAPolicy */             \
-  NCCL_CONFIG_UNDEF_INT,                    /* shrinkShare */           \
-  NCCL_CONFIG_UNDEF_INT,                    /* nvlsCTAs */              \
+sizeof(ncclConfig_t), /* size */                                      \
+0xcafebeef,           /* magic */                                     \
+NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
+NCCL_CONFIG_UNDEF_INT,                    /* blocking */              \
+NCCL_CONFIG_UNDEF_INT,                    /* cgaClusterSize */        \
+NCCL_CONFIG_UNDEF_INT,                    /* minCTAs */               \
+NCCL_CONFIG_UNDEF_INT,                    /* maxCTAs */               \
+NCCL_CONFIG_UNDEF_PTR,                    /* netName */               \
+NCCL_CONFIG_UNDEF_INT,                    /* splitShare */            \
+NCCL_CONFIG_UNDEF_INT,                    /* trafficClass */          \
+NCCL_CONFIG_UNDEF_PTR,                    /* commName */              \
+NCCL_CONFIG_UNDEF_INT,                    /* collnetEnable */         \
+NCCL_CONFIG_UNDEF_INT,                    /* CTAPolicy */             \
+NCCL_CONFIG_UNDEF_INT,                    /* shrinkShare */           \
+NCCL_CONFIG_UNDEF_INT,                    /* nvlsCTAs */              \
 }
 
 /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */
@@ -116,17 +127,17 @@ typedef struct ncclSimInfo_v22200 {
 } ncclSimInfo_t;
 
 /* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created.
- * Not initialized simInfo will result in NCCL error. */
+* Not initialized simInfo will result in NCCL error. */
 #define NCCL_SIM_INFO_INITIALIZER {                                         \
-  sizeof(ncclSimInfo_t),                            /* size */              \
-  0x74685283,                                       /* magic */             \
-  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */           \
-  NCCL_UNDEF_FLOAT                                  /* estimated time */    \
+sizeof(ncclSimInfo_t),                            /* size */              \
+0x74685283,                                       /* magic */             \
+NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */           \
+NCCL_UNDEF_FLOAT                                  /* estimated time */    \
 }
 
 /* NCCL malloc and free function for all types of NCCL optimizations
- * (e.g. user buffer registration). The actual allocated size might
- * be larger than requested due to granularity requirement. */
+* (e.g. user buffer registration). The actual allocated size might
+* be larger than requested due to granularity requirement. */
 ncclResult_t  ncclMemAlloc(void** ptr, size_t size);
 ncclResult_t pncclMemAlloc(void** ptr, size_t size);
 
@@ -134,46 +145,57 @@ ncclResult_t  ncclMemFree(void *ptr);
 ncclResult_t pncclMemFree(void *ptr);
 
 /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
- * This integer is coded with the MAJOR, MINOR and PATCH level of the
- * NCCL library
- */
+* This integer is coded with the MAJOR, MINOR and PATCH level of the
+* NCCL library
+*/
 ncclResult_t  ncclGetVersion(int *version);
 ncclResult_t pncclGetVersion(int *version);
 
 /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
- * called once and the Id should be distributed to all ranks in the
- * communicator before calling ncclCommInitRank. */
+* called once and the Id should be distributed to all ranks in the
+* communicator before calling ncclCommInitRank. */
 ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
 ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
 
 /* Create a new communicator (multi thread/process version) with a configuration
- * set by users. */
+* set by users. */
 ncclResult_t  ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
 ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
 
 /* Creates a new communicator (multi thread/process version).
- * rank must be between 0 and nranks-1 and unique within a communicator clique.
- * Each rank is associated to a CUDA device, which has to be set before calling
- * ncclCommInitRank.
- * ncclCommInitRank implicitly syncronizes with other ranks, so it must be
- * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+* rank must be between 0 and nranks-1 and unique within a communicator clique.
+* Each rank is associated to a CUDA device, which has to be set before calling
+* ncclCommInitRank.
+* ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+* called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
 ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
 ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
 
+ncclResult_t ncclCommInitNewRank(ncclComm_t* comm, int nRanks);
+ncclResult_t ncclCommAddNewRank(ncclComm_t comm);
+ncclResult_t ncclCommExportInfo(ncclComm_t comm, ncclUniqueId* commId, ncclCommInfo* commInfo);
+ncclResult_t ncclCommSetupNewRank(ncclComm_t comm);
+
+ncclResult_t ncclBootstrapBroadcast(ncclComm_t comm, int root, void* buffer, size_t size);
+
 /* Creates a clique of communicators (single process version).
- * This is a convenience function to create a single-process communicator clique.
- * Returns an array of ndev newly initialized communicators in comm.
- * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
- * If devlist is NULL, the first ndev CUDA devices are used.
- * Order of devlist defines user-order of processors within the communicator. */
+* This is a convenience function to create a single-process communicator clique.
+* Returns an array of ndev newly initialized communicators in comm.
+* comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+* If devlist is NULL, the first ndev CUDA devices are used.
+* Order of devlist defines user-order of processors within the communicator. */
 ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 
+/* Restore a specific rank after a GPU reset and clear hung operations on the other ranks. */
+ncclResult_t ncclRestoreRank(ncclComm_t comm, int rank);
+ncclResult_t pncclRestoreRank(ncclComm_t comm, int rank);
+
 /* Finalize a communicator. ncclCommFinalize flushes all issued communications,
- * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
- * when the communicator is globally quiescent and related resources are freed; then,
- * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
- * itself) without blocking. */
+* and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+* when the communicator is globally quiescent and related resources are freed; then,
+* calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
+* itself) without blocking. */
 ncclResult_t  ncclCommFinalize(ncclComm_t comm);
 ncclResult_t pncclCommFinalize(ncclComm_t comm);
 
@@ -182,32 +204,34 @@ ncclResult_t  ncclCommDestroy(ncclComm_t comm);
 ncclResult_t pncclCommDestroy(ncclComm_t comm);
 
 /* Frees resources associated with communicator object and aborts any operations
- * that might still be running on the device. */
+* that might still be running on the device. */
 ncclResult_t  ncclCommAbort(ncclComm_t comm);
 ncclResult_t pncclCommAbort(ncclComm_t comm);
 
+ncclResult_t  ncclRemoveRank(ncclComm_t comm, int rank);
+
 /* Creates one or more communicators from an existing one.
- * Ranks with the same color will end up in the same communicator.
- * Within the new communicator, key will be used to order ranks.
- * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
- * and will therefore return a NULL communicator.
- * If config is NULL, the new communicator will inherit the original communicator's
- * configuration*/
+* Ranks with the same color will end up in the same communicator.
+* Within the new communicator, key will be used to order ranks.
+* NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
+* and will therefore return a NULL communicator.
+* If config is NULL, the new communicator will inherit the original communicator's
+* configuration*/
 ncclResult_t  ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
 ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
 
 /* Shrink existing communicator.
- * Ranks in excludeRanksList will be removed form the existing communicator.
- * Within the new communicator, ranks will be re-ordered to fill the gap of removed ones.
- * If config is NULL, the new communicator will inherit the original communicator's configuration
- * The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.*/
+* Ranks in excludeRanksList will be removed from the existing communicator.
+* Within the new communicator, ranks will be re-ordered to fill the gap of removed ones.
+* If config is NULL, the new communicator will inherit the original communicator's configuration
+* The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.*/
 ncclResult_t  ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
 ncclResult_t pncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
 
 /* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
- * Allows to use more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
- * The number of ncclUniqueIds and their order must be the same for every rank.
- */
+* Allows to use more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
+* The number of ncclUniqueIds and their order must be the same for every rank.
+*/
 ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
 ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
 
@@ -258,153 +282,153 @@ ncclResult_t pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
 /* Reduction operation selector */
 typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
 typedef enum { ncclSum        = 0,
-               ncclProd       = 1,
-               ncclMax        = 2,
-               ncclMin        = 3,
-               ncclAvg        = 4,
-               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
+            ncclProd       = 1,
+            ncclMax        = 2,
+            ncclMin        = 3,
+            ncclAvg        = 4,
+            /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
                 * serves as the least possible value for dynamic ncclRedOp_t's
                 * as constructed by ncclRedOpCreate*** functions. */
-               ncclNumOps     = 5,
-               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
+            ncclNumOps     = 5,
+            /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
                 * It is defined to be the largest signed value (since compilers
                 * are permitted to use signed enums) that won't grow
                 * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
                 * maintain ABI compatibility. */
-               ncclMaxRedOp   = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
-             } ncclRedOp_t;
+            ncclMaxRedOp   = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
+            } ncclRedOp_t;
 
 /* Data types */
 typedef enum { ncclInt8       = 0, ncclChar       = 0,
-               ncclUint8      = 1,
-               ncclInt32      = 2, ncclInt        = 2,
-               ncclUint32     = 3,
-               ncclInt64      = 4,
-               ncclUint64     = 5,
-               ncclFloat16    = 6, ncclHalf       = 6,
-               ncclFloat32    = 7, ncclFloat      = 7,
-               ncclFloat64    = 8, ncclDouble     = 8,
-               ncclBfloat16   = 9,
-               ncclFloat8e4m3 = 10,
-               ncclFloat8e5m2 = 11,
-               ncclNumTypes   = 12
+            ncclUint8      = 1,
+            ncclInt32      = 2, ncclInt        = 2,
+            ncclUint32     = 3,
+            ncclInt64      = 4,
+            ncclUint64     = 5,
+            ncclFloat16    = 6, ncclHalf       = 6,
+            ncclFloat32    = 7, ncclFloat      = 7,
+            ncclFloat64    = 8, ncclDouble     = 8,
+            ncclBfloat16   = 9,
+            ncclFloat8e4m3 = 10,
+            ncclFloat8e5m2 = 11,
+            ncclNumTypes   = 12
 } ncclDataType_t;
 
 /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
 typedef enum {
-  /* ncclScalarDevice: The scalar is in device-visible memory and will be
-   * dereferenced while the collective is running. */
-  ncclScalarDevice = 0,
+/* ncclScalarDevice: The scalar is in device-visible memory and will be
+* dereferenced while the collective is running. */
+ncclScalarDevice = 0,
 
-  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
-   * dereferenced before the ncclRedOpCreate***() function returns. */
-  ncclScalarHostImmediate = 1
+/* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
+* dereferenced before the ncclRedOpCreate***() function returns. */
+ncclScalarHostImmediate = 1
 } ncclScalarResidence_t;
 
 /*
- * ncclRedOpCreatePreMulSum
- *
- * Creates a new reduction operator which pre-multiplies input values by a given
- * scalar locally before reducing them with peer values via summation. For use
- * only with collectives launched against *comm* and *datatype*. The
- * *residence* argument indicates how/when the memory pointed to by *scalar*
- * will be dereferenced. Upon return, the newly created operator's handle
- * is stored in *op*.
- */
+* ncclRedOpCreatePreMulSum
+*
+* Creates a new reduction operator which pre-multiplies input values by a given
+* scalar locally before reducing them with peer values via summation. For use
+* only with collectives launched against *comm* and *datatype*. The
+* *residence* argument indicates how/when the memory pointed to by *scalar*
+* will be dereferenced. Upon return, the newly created operator's handle
+* is stored in *op*.
+*/
 ncclResult_t  ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
 ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
 
 /*
- * ncclRedOpDestroy
- *
- * Destroys the reduction operator *op*. The operator must have been created by
- * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
- * destroyed as soon as the last NCCL function which is given that operator returns.
- */
+* ncclRedOpDestroy
+*
+* Destroys the reduction operator *op*. The operator must have been created by
+* ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
+* destroyed as soon as the last NCCL function which is given that operator returns.
+*/
 ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
 ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
 
 /*
- * Collective communication operations
- *
- * Collective communication operations must be called separately for each
- * communicator in a communicator clique.
- *
- * They return when operations have been enqueued on the CUDA stream.
- *
- * Since they may perform inter-CPU synchronization, each call has to be done
- * from a different thread or process, or need to use Group Semantics (see
- * below).
- */
+* Collective communication operations
+*
+* Collective communication operations must be called separately for each
+* communicator in a communicator clique.
+*
+* They return when operations have been enqueued on the CUDA stream.
+*
+* Since they may perform inter-CPU synchronization, each call has to be done
+* from a different thread or process, or need to use Group Semantics (see
+* below).
+*/
 
 /*
- * Reduce
- *
- * Reduces data arrays of length count in sendbuff into recvbuff using op
- * operation.
- * recvbuff may be NULL on all calls except for root device.
- * root is the rank (not the CUDA device) where data will reside after the
- * operation is complete.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
+* Reduce
+*
+* Reduces data arrays of length count in sendbuff into recvbuff using op
+* operation.
+* recvbuff may be NULL on all calls except for root device.
+* root is the rank (not the CUDA device) where data will reside after the
+* operation is complete.
+*
+* In-place operation will happen if sendbuff == recvbuff.
+*/
 ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
     ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
     ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 
 /*
- * (deprecated) Broadcast (in-place)
- *
- * Copies count values from root to all other devices.
- * root is the rank (not the CUDA device) where data resides before the
- * operation is started.
- *
- * This operation is implicitely in place.
- */
+* (deprecated) Broadcast (in-place)
+*
+* Copies count values from root to all other devices.
+* root is the rank (not the CUDA device) where data resides before the
+* operation is started.
+*
+* This operation is implicitly in place.
+*/
 ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
 
 /*
- * Broadcast
- *
- * Copies count values from root to all other devices.
- * root is the rank (not the CUDA device) where data resides before the
- * operation is started.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
+* Broadcast
+*
+* Copies count values from root to all other devices.
+* root is the rank (not the CUDA device) where data resides before the
+* operation is started.
+*
+* In-place operation will happen if sendbuff == recvbuff.
+*/
 ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
 
 /*
- * All-Reduce
- *
- * Reduces data arrays of length count in sendbuff using op operation, and
- * leaves identical copies of result on each recvbuff.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
+* All-Reduce
+*
+* Reduces data arrays of length count in sendbuff using op operation, and
+* leaves identical copies of result on each recvbuff.
+*
+* In-place operation will happen if sendbuff == recvbuff.
+*/
 ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
 
 /*
- * Reduce-Scatter
- *
- * Reduces data in sendbuff using op operation and leaves reduced result
- * scattered over the devices so that recvbuff on rank i will contain the i-th
- * block of the result.
- * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
- * should have a size of at least nranks*recvcount elements.
- *
- * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
- */
+* Reduce-Scatter
+*
+* Reduces data in sendbuff using op operation and leaves reduced result
+* scattered over the devices so that recvbuff on rank i will contain the i-th
+* block of the result.
+* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+* should have a size of at least nranks*recvcount elements.
+*
+* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+*/
 ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
     cudaStream_t stream);
@@ -413,101 +437,101 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
     cudaStream_t stream);
 
 /*
- * All-Gather
- *
- * Each device gathers sendcount values from other GPUs into recvbuff,
- * receiving data from rank i at offset i*sendcount.
- * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
- * should have a size of at least nranks*sendcount elements.
- *
- * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
- */
+* All-Gather
+*
+* Each device gathers sendcount values from other GPUs into recvbuff,
+* receiving data from rank i at offset i*sendcount.
+* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+* should have a size of at least nranks*sendcount elements.
+*
+* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+*/
 ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
 
 /*
- * Send
- *
- * Send data from sendbuff to rank peer.
- *
- * Rank peer needs to call ncclRecv with the same datatype and the same count from this
- * rank.
- *
- * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
- * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
- * ncclGroupEnd section.
- */
+* Send
+*
+* Send data from sendbuff to rank peer.
+*
+* Rank peer needs to call ncclRecv with the same datatype and the same count from this
+* rank.
+*
+* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+* ncclGroupEnd section.
+*/
 ncclResult_t  ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream);
 
 /*
- * Receive
- *
- * Receive data from rank peer into recvbuff.
- *
- * Rank peer needs to call ncclSend with the same datatype and the same count to this
- * rank.
- *
- * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
- * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
- * ncclGroupEnd section.
- */
+* Receive
+*
+* Receive data from rank peer into recvbuff.
+*
+* Rank peer needs to call ncclSend with the same datatype and the same count to this
+* rank.
+*
+* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+* ncclGroupEnd section.
+*/
 ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t  ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream);
 
 /*
- * Group semantics
- *
- * When managing multiple GPUs from a single thread, and since NCCL collective
- * calls may perform inter-CPU synchronization, we need to "group" calls for
- * different ranks/devices into a single call.
- *
- * Grouping NCCL calls as being part of the same collective operation is done
- * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
- * collective calls until the ncclGroupEnd call, which will wait for all calls
- * to be complete. Note that for collective communication, ncclGroupEnd only
- * guarantees that the operations are enqueued on the streams, not that
- * the operation is effectively done.
- *
- * Both collective communication and ncclCommInitRank can be used in conjunction
- * of ncclGroupStart/ncclGroupEnd, but not together.
- *
- * Group semantics also allow to fuse multiple operations on the same device
- * to improve performance (for aggregated collective calls), or to permit
- * concurrent progress of multiple send/receive operations.
- */
+* Group semantics
+*
+* When managing multiple GPUs from a single thread, and since NCCL collective
+* calls may perform inter-CPU synchronization, we need to "group" calls for
+* different ranks/devices into a single call.
+*
+* Grouping NCCL calls as being part of the same collective operation is done
+* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+* collective calls until the ncclGroupEnd call, which will wait for all calls
+* to be complete. Note that for collective communication, ncclGroupEnd only
+* guarantees that the operations are enqueued on the streams, not that
+* the operation is effectively done.
+*
+* Both collective communication and ncclCommInitRank can be used in conjunction
+* of ncclGroupStart/ncclGroupEnd, but not together.
+*
+* Group semantics also allow to fuse multiple operations on the same device
+* to improve performance (for aggregated collective calls), or to permit
+* concurrent progress of multiple send/receive operations.
+*/
 
 /*
- * Group Start
- *
- * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
- * a single NCCL operation. Nothing will be started on the CUDA stream until
- * ncclGroupEnd.
- */
+* Group Start
+*
+* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
+* a single NCCL operation. Nothing will be started on the CUDA stream until
+* ncclGroupEnd.
+*/
 ncclResult_t  ncclGroupStart();
 ncclResult_t pncclGroupStart();
 
 /*
- * Group End
- *
- * End a group call. Start a fused NCCL operation consisting of all calls since
- * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
- * need to be called after ncclGroupEnd.
- */
+* Group End
+*
+* End a group call. Start a fused NCCL operation consisting of all calls since
+* ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
+* need to be called after ncclGroupEnd.
+*/
 ncclResult_t  ncclGroupEnd();
 ncclResult_t pncclGroupEnd();
 
 /*
- * Group Simulate End
- *
- * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
- */
+* Group Simulate End
+*
+* Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
+*/
 ncclResult_t  ncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
 ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
 
diff --git a/src/scale.cc b/src/scale.cc
new file mode 100644
index 0000000..c435869
--- /dev/null
+++ b/src/scale.cc
@@ -0,0 +1,673 @@
+#include "alloc.h"
+#include "bootstrap.h"
+#include "channel.h"
+#include "checks.h"
+#include "coll_net.h"
+#include "enqueue.h"
+#include "graph.h"
+#include "graph/topo.h"
+#include "group.h"
+#include "nccl.h"
+#include "nccl_common.h"
+#include "serialize.h"
+#include "transport.h"
+#include "tuner.h"
+#include "lighthouse.h"
+#include <cassert>
+#include <cstdlib>
+#include <scale.h>
+
+#include "argcheck.h"
+#include "bootstrap.h"
+#include "channel.h"
+#include "coll_net.h"
+#include "enqueue.h"
+#include "gdrwrap.h"
+#include "graph.h"
+#include "graph/topo.h"
+#include "group.h"
+#include "nccl.h"
+#include "net.h"
+#include "nvmlwrap.h"
+#include "param.h"
+#include "transport.h"
+#include "tuner.h"
+#include <assert.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#define DEFAULT_LL_BUFFSIZE                                                    \
+  (NCCL_LL_LINES_PER_THREAD * NCCL_LL_MAX_NTHREADS * NCCL_STEPS *              \
+   sizeof(union ncclLLFifoLine))
+#define DEFAULT_LL128_BUFFSIZE                                                 \
+  (NCCL_LL128_ELEMS_PER_THREAD * NCCL_LL128_MAX_NTHREADS * NCCL_STEPS *        \
+   sizeof(uint64_t))
+#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
+#define BOOTSTRAP_TAG_ADD_RANK (0x1 << 27) /* NOTE(review): presumably a bootstrap tag for add-rank traffic -- confirm against its users */
+
+NCCL_PARAM(BuffSize1, "BUFFSIZE", -2); // NOTE(review): the "1"-suffixed names presumably avoid clashing with same-keyed params elsewhere -- confirm
+NCCL_PARAM(LlBuffSize1, "LL_BUFFSIZE", -2);
+NCCL_PARAM(Ll128BuffSize1, "LL128_BUFFSIZE", -2);
+
+NCCL_PARAM(P2pNetChunkSize1, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */
+NCCL_PARAM(P2pPciChunkSize1, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */
+NCCL_PARAM(P2pNvlChunkSize1, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */
+
+struct ncclCommAddRankAsyncJob { // Async job state read by ncclCommInitNewRankFunc.
+  struct ncclAsyncJob base; // Kept first: the job function casts its ncclAsyncJob* argument to this struct.
+  ncclComm_t comm; // Communicator the job operates on.
+  // Used by ncclCommAddNewRank and ncclCommInitNewRank.
+  // ncclNewRankInfo *newRankInfo;
+  ncclCommTrans *newRankCommTrans;
+  // Used by ncclCommInitNewRank only.
+  struct ncclCommInfoInternal *peerInfo;
+  ncclUniqueId *commId;
+};
+// Pointer to the i-th bootstrap handle in buffer h; handles are NCCL_UNIQUE_ID_BYTES apart. Arguments are parenthesized so expressions expand safely.
+#define BOOTSTRAP_HANDLE(h, i) ((struct ncclBootstrapHandle *)((char *)(h) + (i) * NCCL_UNIQUE_ID_BYTES))
+
+// Debug helper: print the local and peer IPv4 address and port of socket fd to stdout.
+void print_socket_info(int fd) {
+  struct sockaddr_in local_addr, peer_addr;
+  socklen_t addr_len = sizeof(local_addr);
+  char ip_str[INET_ADDRSTRLEN];
+  // Look up the local endpoint of the socket.
+  if (getsockname(fd, (struct sockaddr *)&local_addr, &addr_len) == -1) {
+    perror("getsockname failed");
+    return;
+  }
+  // Format and print the local address (interpreted as IPv4).
+  inet_ntop(AF_INET, &local_addr.sin_addr, ip_str, INET_ADDRSTRLEN);
+  printf("本地地址: %s\n", ip_str);
+  printf("本地端口: %d\n", ntohs(local_addr.sin_port));
+
+  // getsockname() overwrote addr_len with the real address length, which on Linux can
+  // exceed the buffer for non-IPv4 sockets; reset it so getpeername() cannot overrun peer_addr.
+  addr_len = sizeof(peer_addr);
+  if (getpeername(fd, (struct sockaddr *)&peer_addr, &addr_len) == -1) {
+    perror("getpeername failed");
+    return;
+  }
+  // Format and print the peer address (interpreted as IPv4).
+  inet_ntop(AF_INET, &peer_addr.sin_addr, ip_str, INET_ADDRSTRLEN);
+  printf("远程地址: %s\n", ip_str);
+  printf("远程端口: %d\n", ntohs(peer_addr.sin_port));
+}
+// Debug helper: report on stdout whether descriptor fd is in blocking mode.
+void check_fd_blocking(int fd) {
+  // Fetch the descriptor's file status flags.
+  const int status = fcntl(fd, F_GETFL);
+  if (status == -1) {
+    // The query itself failed (for example, fd is not a valid descriptor).
+    perror("fcntl(F_GETFL) failed");
+    return;
+  }
+  // O_NONBLOCK clear means the descriptor blocks; report that case first.
+  if ((status & O_NONBLOCK) == 0) {
+    printf("fd = %d: 阻塞模式(Blocking)\n", fd);
+    return;
+  }
+  printf("fd = %d: 非阻塞模式(Non-Blocking)\n", fd);
+}
+
+#define TIMER_INIT_TOTAL 0 // TIMER_INIT_* values index the timers[] array in ncclCommInitNewRankFunc.
+#define TIMER_INIT_KERNELS 1
+#define TIMER_INIT_BOOTSTRAP 2 // Slot timed around the bootstrap socket setup.
+#define TIMER_INIT_ALLGATHER 3
+#define TIMER_INIT_TOPO 4
+#define TIMER_INIT_GRAPHS 5
+#define TIMER_INIT_CONNECT 6
+#define TIMER_INIT_ALLOC 7
+#define TIMERS_INIT_COUNT 8 // Number of slots; used as the length of timers[].
+
+/**
+ * Async job body that brings up the communicator of a rank joining an
+ * existing job. The joining rank always takes the last slot
+ * (rank == comm->nRanks - 1).
+ *
+ * Steps, in order:
+ *   1. Allocate the bootstrap state and open a bootstrap listen socket.
+ *   2. Inside one lighthouse transaction: getMagic(), setNewRank() with this
+ *      rank's listen address, queryNextRankAddrNew(), updateVersion(),
+ *      then txnSave()/txnEnd().
+ *   3. Join the bootstrap ring with socketRingConnect().
+ *   4. Query the CUDA device attributes and initialize the kernels.
+ *   5. Receive comm->commHash over the ring "send" socket.
+ *   6. commAlloc(), bootstrapInitNew(), initTransportsRank(), tuner plugin.
+ *
+ * @param job_  Really a struct ncclCommAddRankAsyncJob; its `comm` must
+ *              already carry nRanks, cudaDev and abortFlag.
+ * @return ncclSuccess, or the first error encountered. Every failure goes
+ *         through `fail`, which also records the error in comm->initState.
+ */
+static ncclResult_t ncclCommInitNewRankFunc(struct ncclAsyncJob *job_) {
+  // All locals are declared (and initialized) up front so that no
+  // `goto fail` jumps across an initialization.
+  uint64_t timers[TIMERS_INIT_COUNT] = {0};
+  double sumTimers = 0;
+  ncclResult_t result = ncclSuccess;
+  struct ncclCommAddRankAsyncJob *job = (struct ncclCommAddRankAsyncJob *)job_;
+  ncclComm_t comm = job->comm;
+  // Listen address of this rank. Kept on the stack (like nextRankAddr) so it
+  // cannot leak and needs no allocation-failure handling.
+  union ncclSocketAddress newRankAddr;
+  union ncclSocketAddress nextRankAddr;
+  int cudaDev = comm->cudaDev;
+  int nRanks = comm->nRanks;
+  int myRank = nRanks - 1;
+  size_t maxLocalSizeBytes = 0;
+  int cudaArch = 0;
+  int archMajor = 0, archMinor = 0;
+  unsigned long long commIdHash = 0;
+  struct bootstrapState *state = NULL;
+  int maxSharedMem = 0;
+  uint64_t bootstrapTime = 0;
+  uint64_t magic = 0;
+  struct LhTxn* lhTxn = NULL;
+  struct LhState* lhState = NULL;
+
+  timers[TIMER_INIT_TOTAL] = clockNano();
+  timers[TIMER_INIT_BOOTSTRAP] = clockNano();
+  NCCLCHECKGOTO(ncclCalloc(&state, 1), result, fail);
+  comm->bootstrap = state;
+  state->abortFlag = comm->abortFlag;
+  // The real magic is only known after the lighthouse state has been loaded
+  // below; until then use a well-defined 0 instead of an uninitialized value.
+  // NOTE(review): the listen socket is therefore created with magic 0 --
+  // confirm that createListenSocket()/accept do not validate against it.
+  state->magic = comm->magic = magic;
+  NCCLCHECKGOTO(createListenSocket(comm, comm->magic,
+                                   &STATE_LISTEN(state, socket), &newRankAddr,
+                                   ncclSocketTypeBootstrap),
+                result, fail);
+
+  // Lighthouse transaction: register this rank and learn who comes next.
+  // NOTE(review): on an error between txnBegin() and txnEnd() the transaction
+  // is not explicitly ended, and lhTxn/lhState are not released on the success
+  // path -- confirm the ownership rules of the lighthouse API.
+  if (txnBegin(LH_STATE_PATH, 1, &lhTxn) != 0) {
+    fprintf(stderr, "lighthouse: txnBegin failed\n");
+    result = ncclInternalError;
+    goto fail;
+  }
+  if (txnLoad(lhTxn, &lhState) != 0) {
+    fprintf(stderr, "lighthouse: txnLoad failed\n");
+    result = ncclInternalError;
+    goto fail;
+  }
+  getMagic(lhState, &magic);
+  setNewRank(lhState, &newRankAddr, myRank);
+  if (queryNextRankAddrNew(lhState, &nextRankAddr) != 0) {
+    fprintf(stderr, "lighthouse: queryNextRankAddrNew failed\n");
+    result = ncclInternalError;
+    goto fail;
+  }
+  updateVersion(lhState);
+  printLhState(lhState);
+  if (txnSave(lhTxn, lhState) != 0) {
+    fprintf(stderr, "lighthouse: txnSave failed\n");
+    result = ncclInternalError;
+    goto fail;
+  }
+  if (txnEnd(lhTxn) != 0) {
+    fprintf(stderr, "lighthouse: txnEnd failed\n");
+    result = ncclInternalError;
+    goto fail;
+  }
+
+  // Switch to the magic published by the lighthouse before joining the ring.
+  state->magic = comm->magic = magic;
+  INFO(NCCL_INIT, "Rank %d is waiting for connection from prev rank %d...\n", myRank, myRank - 1);
+  INFO(NCCL_INIT, "Rank %d is connecting to next rank %d...\n", myRank, 0);
+  NCCLCHECKGOTO(socketRingConnect(
+                    &nextRankAddr, &STATE_RING(state, socket.send),
+                    &STATE_LISTEN(state, socket),
+                    &STATE_RING(state, socket.recv), comm->magic,
+                    state->abortFlag),
+                result, fail);
+  INFO(NCCL_INIT, "Rank %d is connected...\n", myRank - 1);
+  INFO(NCCL_INIT, "Rank %d is connected...\n", 0);
+  timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
+
+  CUDACHECKGOTO(cudaSetDevice(cudaDev), result, fail);
+  CUDACHECKGOTO(cudaDeviceGetAttribute(
+                    &archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev),
+                result, fail);
+  CUDACHECKGOTO(cudaDeviceGetAttribute(
+                    &archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev),
+                result, fail);
+  CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem,
+                                       cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                                       cudaDev),
+                result, fail);
+  cudaArch = 100 * archMajor + 10 * archMinor;
+
+  timers[TIMER_INIT_KERNELS] = clockNano();
+  NCCLCHECKGOTO(
+      ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes),
+      result, fail);
+  // Set the maximum kernel stack size of all kernels to avoid
+  // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
+  if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
+    TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zu", maxLocalSizeBytes);
+    CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes));
+  }
+  timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS];
+
+  // The communicator hash arrives over the ring "send" socket (the link to
+  // the next rank); a failed receive must not leave commHash undefined.
+  NCCLCHECKGOTO(ncclSocketRecv(&STATE_RING(state, socket.send),
+                               &(comm->commHash), sizeof(uint64_t)),
+                result, fail);
+  state->nranks = comm->nRanks;
+  // TODO: the serialized comm-info exchange (ncclInfoSerialize /
+  // ncclInfoDeserialize) that used to live here is currently disabled.
+
+  commIdHash = comm->commHash;
+  comm->rank = myRank;
+  INFO(NCCL_INIT,
+       "%s comm %p rank %d nRanks %d cudaDev %d nvmlDev %d busId %lx commId "
+       "0x%llx - Init START",
+       __func__, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev,
+       comm->busId, commIdHash);
+
+  timers[TIMER_INIT_ALLOC] = clockNano();
+  // TODO: the original author noted an if/else may be needed around this call.
+  NCCLCHECKGOTO(commAlloc(comm, NULL, nRanks, myRank), result, fail);
+  timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
+
+  // Bootstrap time = ring set-up above + bootstrapInitNew() here.
+  bootstrapTime = clockNano();
+  NCCLCHECKGOTO(bootstrapInitNew(comm, true), result, fail);
+  bootstrapTime = clockNano() - bootstrapTime;
+  timers[TIMER_INIT_BOOTSTRAP] = timers[TIMER_INIT_BOOTSTRAP] + bootstrapTime;
+  comm->cudaArch = cudaArch;
+
+  NCCLCHECKGOTO(initTransportsRank(comm, nullptr, timers), result, fail);
+
+  NCCLCHECKGOTO(ncclTunerPluginLoad(comm), result, fail);
+  if (comm->tuner) {
+    NCCLCHECKGOTO(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog,
+                                    &comm->tunerContext),
+                  result, fail);
+  }
+  comm->initState = ncclSuccess;
+
+  timers[TIMER_INIT_TOTAL] = clockNano() - timers[TIMER_INIT_TOTAL];
+
+  // "rest" is whatever part of the total is not attributed to a named phase.
+  for (int t = 0; t < TIMERS_INIT_COUNT; ++t) {
+    if (t != TIMER_INIT_TOTAL) sumTimers += timers[t] / 1e9;
+  }
+  INFO(NCCL_INIT | NCCL_PROFILE,
+       "Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc "
+       "%.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
+       "connections %.2f, rest %.2f)",
+       "ncclCommInitNewRankFunc", comm->rank, comm->nRanks,
+       timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9,
+       timers[TIMER_INIT_ALLOC] / 1e9, timers[TIMER_INIT_BOOTSTRAP] / 1e9,
+       timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
+       timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9,
+       timers[TIMER_INIT_TOTAL] / 1e9 - sumTimers);
+
+exit:
+  return result;
+fail:
+  comm->initState = result;
+  if (lhTxn)
+    free(lhTxn);
+  if (lhState)
+    free(lhState);
+  goto exit;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitNewRank, ncclComm_t *comm,
+         int nRanks);
+/**
+ * Public entry point for a process that joins an existing job as a new rank.
+ *
+ * Allocates a fresh communicator for the current CUDA device, fills in the
+ * fields ncclCommInitNewRankFunc() relies on (abort flags, cudaDev, config,
+ * nRanks) and launches that function through ncclAsyncLaunch().
+ *
+ * @param newcomm  Out: receives the new communicator; set to NULL on failure.
+ *                 Must not be NULL.
+ * @param nRanks   Size of the job including the joining rank; must be >= 1.
+ * @return ncclGroupErrCheck() of the result; ncclInvalidArgument for a NULL
+ *         `newcomm` or a non-positive `nRanks`.
+ */
+ncclResult_t ncclCommInitNewRank(ncclComm_t *newcomm,
+                                 int nRanks) {
+  // Initialized locals live at the top so that `goto fail` never crosses an
+  // initialization.
+  ncclResult_t result = ncclSuccess;
+  int cudaDev = -1;
+  ncclComm_t comm = NULL;
+  struct ncclCommAddRankAsyncJob *job = NULL;
+  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+
+  // Reject bad arguments before anything is allocated: `newcomm` is written
+  // unconditionally further down, and the joining rank is nRanks - 1.
+  if (newcomm == NULL) {
+    WARN("ncclCommInitNewRank: newcomm argument is NULL");
+    result = ncclInvalidArgument;
+    goto fail;
+  }
+  if (nRanks < 1) {
+    WARN("ncclCommInitNewRank: invalid nRanks %d", nRanks);
+    result = ncclInvalidArgument;
+    goto fail;
+  }
+
+  // Load the CUDA driver and dlsym hooks (can fail on old drivers)
+  (void)ncclCudaLibraryInit();
+
+  CUDACHECKGOTO(cudaGetDevice(&cudaDev), result, fail);
+  // first call ncclInit, this will setup the environment
+  NCCLCHECKGOTO(ncclInit(), result, fail);
+
+  // Make sure the CUDA runtime is initialized.
+  CUDACHECKGOTO(cudaFree(NULL), result, fail);
+
+  NCCLCHECKGOTO(ncclCalloc(&comm, 1), result, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->abortFlag, 1), result, fail);
+  NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->abortFlagDev, 1), result, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->abortFlagRefCount, 1), result, fail);
+  comm->startMagic = comm->endMagic =
+      NCCL_MAGIC; // Used to detect comm corruption.
+  *comm->abortFlagRefCount = 1;
+  comm->cudaDev = cudaDev;
+  NCCLCHECKGOTO(parseCommConfig(comm, &config), result, fail);
+  /* start with ncclInternalError and will be changed to ncclSuccess if init
+   * succeeds. */
+  comm->initState = ncclInternalError;
+  comm->nRanks = nRanks;
+  *newcomm = comm;
+
+  NCCLCHECKGOTO(ncclCalloc(&job, 1), result, fail);
+  job->comm = comm;
+  // `free` is handed over as the job destructor, so the job is not released
+  // here.
+  NCCLCHECKGOTO(
+      ncclAsyncLaunch(&job->base, ncclCommInitNewRankFunc, NULL, free, comm),
+      result, fail);
+
+exit:
+  return ncclGroupErrCheck(result);
+fail:
+  if (comm) {
+    free(comm->abortFlag);
+    if (comm->abortFlagDev)
+      (void)ncclCudaHostFree((void *)comm->abortFlagDev);
+    free(comm->abortFlagRefCount);
+    free(comm);
+  }
+  if (newcomm)
+    *newcomm = NULL;
+  goto exit;
+}
+
+
+/**
+ * Fill in comm->buffSizes[] and comm->p2pChunkSize.
+ *
+ * Each buffer size comes from its parameter function unless that returns -2,
+ * in which case the matching DEFAULT_* constant is used. The P2P chunk size
+ * is picked by topology (multi-node, all-NVLink, otherwise PCI), capped so
+ * that NCCL_STEPS chunks fit in the Simple-protocol buffer, and finally
+ * reconciled with the value stored in comm->sharedRes.
+ *
+ * @return always ncclSuccess.
+ */
+static ncclResult_t computeBuffSizes(struct ncclComm *comm) {
+  int64_t requested[NCCL_NUM_PROTOCOLS] = {
+      ncclParamLlBuffSize1(), ncclParamLl128BuffSize1(), ncclParamBuffSize1()};
+  int fallback[NCCL_NUM_PROTOCOLS] = {DEFAULT_LL_BUFFSIZE,
+                                      DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE};
+
+  int proto = 0;
+  while (proto < NCCL_NUM_PROTOCOLS) {
+    // -2 means "no override": take the built-in size for this protocol.
+    if (requested[proto] == -2) {
+      comm->buffSizes[proto] = fallback[proto];
+    } else {
+      comm->buffSizes[proto] = requested[proto];
+    }
+    ++proto;
+  }
+
+  if (comm->nNodes <= 1) {
+    comm->p2pChunkSize = comm->isAllNvlink ? ncclParamP2pNvlChunkSize1()
+                                           : ncclParamP2pPciChunkSize1();
+  } else {
+    comm->p2pChunkSize = ncclParamP2pNetChunkSize1();
+  }
+
+  // Make sure P2P chunksize is not larger than coll chunksize.
+  if (comm->p2pChunkSize * NCCL_STEPS > comm->buffSizes[NCCL_PROTO_SIMPLE]) {
+    comm->p2pChunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
+  }
+
+  if (comm->sharedRes->owner == comm) {
+    // The owner publishes its chunk size for comms sharing these resources.
+    comm->sharedRes->tpP2pChunkSize = comm->p2pChunkSize;
+  } else {
+    /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */
+    comm->p2pChunkSize =
+        std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize);
+  }
+
+  INFO(NCCL_INIT, "P2P Chunksize set to %d", comm->p2pChunkSize);
+  return ncclSuccess;
+}
+// Three-int record whose layout is described to NVTX by CommInitRankSchema
+// (which takes offsetof() of nRanks and cudaDev from this struct).
+struct NvtxParamsCommInitRank {
+  int rank;
+  int nRanks;
+  int cudaDev;
+};
+// NVTX payload schema with one INT entry per field of NvtxParamsCommInitRank,
+// labelled "Rank", "No. of ranks" and "CUDA device".
+// The first entry passes no explicit offset; the other two use offsetof().
+constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0,
+     offsetof(NvtxParamsCommInitRank, nRanks)},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0,
+     offsetof(NvtxParamsCommInitRank, cudaDev)},
+};
+// Plain summary of one graph; allGatherInfo holds one of these per algorithm
+// (array of NCCL_NUM_ALGORITHMS entries).
+// NOTE(review): field meanings are presumed from their names (search pattern,
+// channel count, intra-/inter-node bandwidth and link type, cross-NIC flag) --
+// confirm against the code that fills them in.
+struct graphInfo {
+  int pattern;
+  int nChannels;
+  int sameChannels;
+  float bwIntra;
+  float bwInter;
+  int typeIntra;
+  int typeInter;
+  int crossNic;
+};
+// Bundle of per-algorithm graph summaries, topology ranks and CPU
+// architecture/vendor ids.
+// NOTE(review): presumably the per-rank record exchanged in an init-time
+// all-gather -- confirm against the code that uses it.
+struct allGatherInfo {
+  struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
+  struct ncclTopoRanks topoRanks;
+  int cpuArch;
+  int cpuVendor;
+};
+
+
+/**
+ * Broadcast `data` from rank 0 to all ranks over the bootstrap network using
+ * a doubling-distance tree.
+ *
+ * With distance d = 1, 2, 4, ... (while d < num_ranks): every rank r < d
+ * sends to r + d when that rank exists, and every rank in [d, 2d) receives
+ * from r - d. Rank 0 must already hold the payload; every other rank's
+ * `data` buffer is overwritten by the receive.
+ *
+ * NOTE: the results of bootstrapSend()/bootstrapRecv() are discarded, so a
+ * transfer error is not reported to the caller.
+ *
+ * @param comm       Communicator whose bootstrap handle is used.
+ * @param my_rank    Rank of the caller.
+ * @param num_ranks  Total number of ranks taking part.
+ * @param data       Payload buffer of at least data_size bytes.
+ * @param data_size  Payload size in bytes.
+ */
+void broadcast(ncclComm_t comm, int my_rank, int num_ranks, void *data,
+               int data_size) {
+  // One iteration per round; the distance doubles each round and stops once
+  // it would reach past the highest rank.
+  for (int distance = 1; distance < num_ranks; distance *= 2) {
+    if (my_rank < distance) {
+      // This rank already has the data: forward it if the partner exists.
+      const int receiver = my_rank + distance;
+      if (receiver < num_ranks) {
+        bootstrapSend(comm->bootstrap, receiver, BOOTSTRAP_TAG_ADD_RANK, data,
+                      data_size);
+      }
+    } else if (my_rank - distance < distance) {
+      // This rank's turn to receive, from the rank `distance` below it.
+      const int sender = my_rank - distance;
+      bootstrapRecv(comm->bootstrap, sender, BOOTSTRAP_TAG_ADD_RANK, data,
+                    data_size);
+    }
+  }
+}
+
+NCCL_API(ncclResult_t, ncclCommAddNewRank, ncclComm_t comm);
+/**
+ * Grow an existing communicator by one rank (called on the ranks that are
+ * already part of the job).
+ *
+ * - The current last rank waits for the lighthouse to reach the expected
+ *   version, reads the next-rank address with queryNextRankAddrLast() and
+ *   then connects its ring "send" socket to that address.
+ * - Rank 0 accepts a connection on its listen socket as its new ring "recv"
+ *   socket and sends comm->commHash over it.
+ * - Every rank then runs commAllocNew(), bootstrapInitNew() and
+ *   updateTransportsRank() with the enlarged rank count.
+ *
+ * comm->nRanks is only incremented after the lighthouse handshake succeeded,
+ * so a lighthouse failure leaves the communicator size untouched. Failures
+ * after that point leave comm->nRanks incremented.
+ *
+ * @param comm  Communicator to extend; its bootstrap state must be set up.
+ * @return ncclSuccess, or the first error encountered.
+ */
+ncclResult_t
+ncclCommAddNewRank(ncclComm_t comm) {
+  ncclResult_t result = ncclSuccess;
+  struct bootstrapState *state = (struct bootstrapState *)comm->bootstrap;
+  // NOTE(review): hard-coded lighthouse version this call waits for -- confirm
+  // it matches the number of updateVersion() calls made by the joining side.
+  uint64_t expectedVersion = 4;
+  uint64_t version = 0;
+  struct LhTxn* lhTxn = NULL;
+  struct LhState* lhState = NULL;
+  union ncclSocketAddress nextRankAddr;
+  uint64_t timers[TIMERS_INIT_COUNT] = {0};
+  int rank = comm->rank;
+  int nranks = comm->nRanks; // ring size before the new rank joins
+
+  if (rank == nranks - 1) {
+    // Only the current last rank needs the new rank's address.
+    // NOTE(review): on an error between txnBegin() and txnEnd() the
+    // transaction is not explicitly ended, and lhTxn/lhState are not released
+    // on the success path -- confirm the lighthouse ownership rules.
+    printf("Rank %d is waiting for lighthouse version %lu...\n", rank, expectedVersion);
+    if (txnWaitForVersion(LH_STATE_PATH, expectedVersion, /*timeout_ms*/-1) != 0) {
+      fprintf(stderr, "lighthouse: txnWaitForVersion failed\n");
+      result = ncclInternalError;
+      goto fail;
+    }
+    if (txnBegin(LH_STATE_PATH, 1, &lhTxn) != 0) {
+      fprintf(stderr, "lighthouse: txnBegin failed\n");
+      result = ncclInternalError;
+      goto fail;
+    }
+    if (txnLoad(lhTxn, &lhState) != 0) {
+      fprintf(stderr, "lighthouse: txnLoad failed\n");
+      result = ncclInternalError;
+      goto fail;
+    }
+    getVersion(lhState, &version);
+    if (version != expectedVersion) {
+      fprintf(stderr, "lighthouse: version mismatch, expected %lu but got %lu\n", expectedVersion, version);
+      result = ncclInternalError;
+      goto fail;
+    }
+    if (queryNextRankAddrLast(lhState, &nextRankAddr) != 0) {
+      fprintf(stderr, "lighthouse: queryNextRankAddrLast failed\n");
+      result = ncclInternalError;
+      goto fail;
+    }
+    printLhState(lhState);
+    if (txnSave(lhTxn, lhState) != 0) {
+      fprintf(stderr, "lighthouse: txnSave failed\n");
+      result = ncclInternalError;
+      goto fail;
+    }
+    if (txnEnd(lhTxn) != 0) {
+      fprintf(stderr, "lighthouse: txnEnd failed\n");
+      result = ncclInternalError;
+      goto fail;
+    }
+  }
+
+  // From here on the communicator counts the joining rank (index nranks).
+  comm->nRanks++;
+
+  if (rank == 0) {
+    INFO(NCCL_INIT, "Rank %d is waiting for connection from prev rank %d...\n", rank, comm->nRanks - 1);
+    NCCLCHECKGOTO(socketRingConnectNext(&STATE_LISTEN(state, socket),
+                                        &STATE_RING(state, socket.recv)),
+                  result, fail);
+    INFO(NCCL_INIT, "Rank %d is connected...\n", comm->nRanks - 1);
+  }
+  if (rank == nranks - 1) {
+    INFO(NCCL_INIT, "Rank %d is connecting to next rank %d...\n", rank, comm->nRanks - 1);
+    NCCLCHECKGOTO(socketRingConnectPrev(&nextRankAddr,
+                                        &STATE_RING(state, socket.send),
+                                        comm->magic, state->abortFlag),
+                  result, fail);
+    INFO(NCCL_INIT, "Rank %d is connected...\n", comm->nRanks - 1);
+  }
+
+  if (comm->rank == 0) {
+    // Hand the communicator hash to the new rank over the fresh ring link;
+    // a failed send would leave the peer blocked or with a bogus hash.
+    NCCLCHECKGOTO(ncclSocketSend(&STATE_RING(state, socket.recv),
+                                 &(comm->commHash), sizeof(uint64_t)),
+                  result, fail);
+  }
+  NCCLCHECKGOTO(commAllocNew(comm, nullptr, comm->nRanks, comm->rank), result,
+                fail);
+  NCCLCHECKGOTO(bootstrapInitNew(comm, false), result, fail);
+  NCCLCHECKGOTO(updateTransportsRank(comm, nullptr, timers), result, fail);
+
+ret:
+  return result;
+fail:
+  if (lhTxn)
+    free(lhTxn);
+  if (lhState)
+    free(lhState);
+  goto ret;
+}
+
+/**
+ * Async job body run after a rank has been added.
+ *
+ * Selects the communicator's CUDA device, then clears the connectRecv /
+ * connectSend mask of every peer (all ranks at ring distance 1..nRanks-1)
+ * and logs the cleared values. The ring reconnect itself
+ * (ncclTransportRingConnectNew) and the proxy-ops reset that used to follow
+ * are currently disabled.
+ *
+ * @param job_  Really a struct ncclCommAddRankAsyncJob carrying the comm.
+ * @return ncclSuccess, or the CUDA error from cudaSetDevice().
+ */
+static ncclResult_t ncclCommSetupNewRankFunc(struct ncclAsyncJob *job_) {
+  ncclResult_t res = ncclSuccess;
+  ncclComm_t comm = ((struct ncclCommAddRankAsyncJob *)job_)->comm;
+  int device = comm->cudaDev;
+  int self = comm->rank;
+
+  INFO(NCCL_INIT, "Rank %d magic %lu commHash %lu", self, comm->magic,
+       comm->commHash);
+  INFO(NCCL_INIT, "ncclCommSetupNewRankFunc");
+  CUDACHECKGOTO(cudaSetDevice(device), res, done);
+  INFO(NCCL_INIT, "then ncclCommSetupNewRankFunc");
+
+  INFO(NCCL_INIT, "6668888888ncclCommSetupNewRankFunc");
+
+  printf("Rank %d is waiting to receive 1MB data...\n", self);
+
+  // Walk every other rank once, by increasing ring distance.
+  for (int dist = 1; dist < comm->nRanks; ++dist) {
+    int from = (comm->rank - dist + comm->nRanks) % comm->nRanks;
+    int to = (comm->rank + dist) % comm->nRanks;
+    // Drop any pending connect requests towards/from these peers.
+    comm->connectRecv[from] = 0x0;
+    comm->connectSend[to] = 0x0;
+    // Read the masks back so the log shows what is actually stored.
+    uint64_t pendingRecv = comm->connectRecv[from];
+    uint64_t pendingSend = comm->connectSend[to];
+    INFO(NCCL_INIT,
+         "waini send i %d:两个十六进制值: 0x%" PRIx64 " 0x%" PRIx64 "\n", dist,
+         pendingRecv, pendingSend);
+  }
+
+done:
+  return res;
+}
+
+NCCL_API(ncclResult_t, ncclCommSetupNewRank, ncclComm_t comm);
+/**
+ * Public entry point that schedules ncclCommSetupNewRankFunc() for `comm`
+ * through ncclAsyncLaunch().
+ *
+ * The job object is allocated here and handed to ncclAsyncLaunch() together
+ * with `free` as its destructor.
+ *
+ * @param comm  Communicator to run the post-add setup on.
+ * @return ncclSuccess, or the error from the allocation / launch.
+ */
+ncclResult_t ncclCommSetupNewRank(ncclComm_t comm) {
+  struct ncclCommAddRankAsyncJob *setupJob;
+  ncclResult_t status = ncclSuccess;
+
+  NCCLCHECKGOTO(ncclCalloc(&setupJob, 1), status, done);
+  setupJob->comm = comm;
+  NCCLCHECKGOTO(ncclAsyncLaunch(&setupJob->base, ncclCommSetupNewRankFunc,
+                                NULL, free, comm),
+                status, done);
+
+done:
+  return status;
+}
-- 
2.43.0