From 7e67a9d18ce01dec4917319ef9ff4a9fc70d8461 Mon Sep 17 00:00:00 2001
From: Eusford_0526 <sunzijian4@huawei.com>
Date: Mon, 12 Jan 2026 17:15:54 +0800
Subject: [PATCH] local reconstruction via lighthouse
src/Makefile | 2 +-
src/bootstrap.cc | 475 +++++++--
src/channel.cc | 86 +-
src/graph/connect.cc | 1013 ++++++++++---------
src/include/bootstrap.h | 74 +-
src/include/channel.h | 1 +
src/include/comm.h | 46 +-
src/include/lighthouse.h | 34 +
src/include/scale.h | 55 +
src/include/serialize.h | 507 ++++++++++
src/init.cc | 2079 +++++++++++++++++++++++++++++++++-----
src/lighthouse.cc | 339 +++++++
src/misc/shmutils.cc | 22 +-
src/misc/socket.cc | 1960 +++++++++++++++++------------------
src/nccl.h.in | 524 +++++-----
src/scale.cc | 673 ++++++++++++
16 files changed, 5778 insertions(+), 2112 deletions(-)
create mode 100644 src/include/lighthouse.h
create mode 100644 src/include/scale.h
create mode 100644 src/include/serialize.h
create mode 100644 src/lighthouse.cc
create mode 100644 src/scale.cc
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
INCEXPORTS := nccl.h
LIBSRCFILES := \
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
- init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
+ init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc scale.cc lighthouse.cc \
$(wildcard graph/*.cc) \
$(wildcard misc/*.cc) \
$(wildcard transport/*.cc) \
@@ -1,14 +1,15 @@
/*************************************************************************
- * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
#include "nccl.h"
#include "core.h"
#include "utils.h"
#include "bootstrap.h"
#include "net.h"
+#include "lighthouse.h"
#include <unistd.h>
#include <sys/types.h>
#include "proxy.h"
@@ -42,11 +43,33 @@
} while (0)
#define BOOTSTRAP_PID(i, n) (((i) + (n)) % (n))
+
+// struct ncclCommTrans {
+// struct ncclTopoRanks* peerTopo; // length nRanks
+// struct ncclPeerInfo* peerInfo;
+// int* nodesFirstRank; // length nRanks
+// int* nodesTreePatterns; // length nRanks
+// int* ringPrev; // length nRanks*MAXCHANNELS
+// int* ringNext; // length nRanks*MAXCHANNELS
+// int* peerRings; // length nRanks*MAXCHANNELS
+// void* bootstrap;
+// int nRanks; // number of GPUs in communicator
+// int cudaDev;
+// int* rankToNode;
+// struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+// };
+
// returns the first rank associated to the root. must have root >=0
// if root >= n_roots, it does NOT assume periodicity
static int firstRankFromRoot(int root, int n_ranks, int nRoots) {
return root * (n_ranks / nRoots) + std::min(root, n_ranks % nRoots);
}
+
+// typedef struct {
+// int socket_fd;
+// int rank_id;
+// char ip_address[INET_ADDRSTRLEN];
+// } RankConnection;
// returns the root of a rank, must have rank >=0
// if rank >= n_ranks, it does NOT assume periodicity
static int rootIdFromRank(int rank, int nRanks, int nRoots) {
@@ -79,6 +102,7 @@ static int isFirstFromRoot(int rank, int root, int nRanks, int nRoots) {
struct bootstrapRootArgs {
struct ncclSocket* listenSock;
uint64_t magic;
+ int sock;
};
/* Init functions */
@@ -103,7 +127,7 @@ ncclResult_t bootstrapNetInit() {
return ncclInvalidArgument;
}
NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
- &nIfs));
+ &nIfs));
if (nIfs <= 0) {
WARN("NET/Socket : No usable listening interface found");
pthread_mutex_unlock(&bootstrapNetLock);
@@ -128,6 +152,7 @@ ncclResult_t bootstrapNetInit() {
return ncclSuccess;
}
+
/* Socket Interface Selection type */
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
@@ -153,7 +178,7 @@ static ncclResult_t netDereg(ncclNet_t* net, void* comm, void** handle) {
return ncclSuccess;
}
static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int size, void* dataHandle, int tag, void** sendReq,
- int* done) {
+ int* done) {
if (*done) return ncclSuccess;
if (!*sendReq) {
NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq));
@@ -167,7 +192,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz
return ncclSuccess;
}
static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int size, void* dataHandle, int tag, void** recvReq,
- int* done) {
+ int* done) {
if (*done) return ncclSuccess;
if (!*recvReq) {
size_t size64 = size;
@@ -218,7 +243,7 @@ static ncclResult_t socketRecv(struct ncclSocket* sock, void* data, int size) {
return ncclSuccess;
}
static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock,
- void* recvData, int recvSize) {
+ void* recvData, int recvSize) {
int senderRecvSize;
NCCLCHECK(ncclSocketSendRecv(sendSock, &sendSize, sizeof(int), recvSock, &senderRecvSize, sizeof(int)));
if (senderRecvSize > recvSize) {
@@ -267,7 +292,8 @@ fail:
(void)ncclSocketClose(&sock);
return res;
}
-static void* bootstrapRoot(void* rargs) {
+void printNcclSocketAddress(union ncclSocketAddress *addr);
+static void* bootstrapRoot(void* rargs) {//这个地方可能重复
uint64_t timers[BOOTSTRAP_INIT_ROOT_N] = {0};
struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs;
struct ncclSocket* listenSock = args->listenSock;
@@ -278,11 +304,14 @@ static void* bootstrapRoot(void* rargs) {
int nrecv = 0, n2send = 0;
struct extInfo info;
union ringConnectInfo* rankInfo = NULL;
+ union ncclSocketAddress* nextPeerAddrInfo = NULL;
union ncclSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange
// get zeros for comparison
char zeroHandle[NCCL_NET_HANDLE_MAXSIZE];
union ncclSocketAddress zeroAddress;
union ringConnectInfo zeroInfo;
+ struct LhTxn* lhTxn = NULL;
+ struct LhState* lhState = NULL;
memset(&zeroAddress, 0, sizeof(union ncclSocketAddress));
memset(&zeroHandle, 0, NCCL_NET_HANDLE_MAXSIZE);
memset(&zeroInfo, 0, sizeof(union ringConnectInfo));
@@ -309,6 +338,7 @@ static void* bootstrapRoot(void* rargs) {
nrecv = n2send + ((nroots > 1) ? 1 : 0);
NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv), res, out);
NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nrecv), res, out);
+ NCCLCHECKGOTO(ncclCalloc(&nextPeerAddrInfo, nrecv), res, out);
}
if (nranks != info.nranks || nroots != info.nroots || iroot != info.iroot) {
@@ -328,8 +358,13 @@ static void* bootstrapRoot(void* rargs) {
int prev = (nroots > 1) ? (localId - 1) : BOOTSTRAP_PID(localId - 1, nrecv);
if (prev >= 0 && prev < n2send && memcmp(&zeroAddress, &rankAddressesRoot[prev], sizeof(union ncclSocketAddress)) != 0) {
NCCLCHECKGOTO(rootSend(&rankAddressesRoot[prev], magic, &info.connectInfo), res, out);
+ memcpy(&nextPeerAddrInfo[info.rank], &info.connectInfo.addr, sizeof(union ncclSocketAddress));
} else {
memcpy(&rankInfo[localId], &info.connectInfo, sizeof(union ringConnectInfo));
+ memcpy(&nextPeerAddrInfo[info.rank], &info.connectInfo.addr, sizeof(union ncclSocketAddress));
+ //memcpy(&nextPeerAddrInfo[info.rank],&info.connectInfo.addr,sizeof(union ncclSocketAddress));//估计没问题
+ //printNcclSocketAddress(&info.connectInfo.addr);
+ //printNcclSocketAddress(&nextPeerAddrInfo[1]);
}
// if the next rank has checked in, send the newly received info, if not save the addr for later
// for nroots >=1, I will always own the information of the next connection
@@ -339,10 +374,38 @@ static void* bootstrapRoot(void* rargs) {
NCCLCHECKGOTO(rootSend(&info.listenRootAddress, magic, &rankInfo[next]), res, out);
} else {
memcpy(rankAddressesRoot + localId, &info.listenRootAddress, sizeof(union ncclSocketAddress));
+ //memcpy(&nextPeerAddrInfo[info.rank], &info.listenRootAddress, sizeof(union ncclSocketAddress));
}
++c;
TRACE(NCCL_BOOTSTRAP, "Received connect from rank %d total %d/%d", info.rank, c, nrecv);
} while (c < nrecv);
+ INFO(NCCL_INIT,"bootstrapRoot nrecv %d",nrecv);
+ printNcclSocketAddress(&nextPeerAddrInfo[0]);
+ printNcclSocketAddress(&nextPeerAddrInfo[1]);
+
+ if (txnBegin(LH_STATE_PATH, 1, &lhTxn) != 0) {
+ fprintf(stderr, "lighthouse: txnBegin failed");
+ res = ncclInternalError;
+ goto out;
+ }
+ if (txnLoad(lhTxn, &lhState) != 0) {
+ fprintf(stderr, "lighthouse: txnLoad failed");
+ res = ncclInternalError;
+ goto out;
+ }
+ initialize(lhState, nextPeerAddrInfo, nrecv, magic);
+ printLhState(lhState);
+ if (txnSave(lhTxn, lhState) != 0) {
+ fprintf(stderr, "lighthouse: txnSave failed");
+ res = ncclInternalError;
+ goto out;
+ }
+ if (txnEnd(lhTxn) != 0) {
+ fprintf(stderr, "lighthouse: txnEnd failed");
+ res = ncclInternalError;
+ goto out;
+ }
+
TRACE(NCCL_BOOTSTRAP, "COLLECTED ALL %d HANDLES", nrecv);
BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_RECV]);
@@ -352,6 +415,7 @@ static void* bootstrapRoot(void* rargs) {
for (int r = 0; r < n2send; ++r) {
// use nrecv to periodize: if 1 root, we will send the first one to the last one, if >1 roots we will send the additional one we have received
int next = BOOTSTRAP_PID(r + 1, nrecv);
+ //printNcclSocketAddress(&rankInfo[next].addr);
if (memcmp(&zeroAddress, &rankAddressesRoot[r], sizeof(union ncclSocketAddress)) != 0 &&
memcmp(&zeroInfo, &rankInfo[next], sizeof(union ringConnectInfo)) != 0) {
NCCLCHECKGOTO(rootSend(&rankAddressesRoot[r], magic, &rankInfo[next]), res, out);
@@ -359,6 +423,7 @@ static void* bootstrapRoot(void* rargs) {
}
BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_SEND]);
TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "Root timings (wait %f, recv %f, send %f)", timers[BOOTSTRAP_INIT_ROOT_WAIT] / 1e9, timers[BOOTSTRAP_INIT_ROOT_RECV] / 1e9, timers[BOOTSTRAP_INIT_ROOT_SEND] / 1e9);
+ INFO(NCCL_INIT,"bootstrapRoot DONE");
out:
if (listenSock != NULL) {
(void)ncclSocketClose(listenSock);
@@ -369,6 +434,10 @@ out:
if (rankAddressesRoot)
free(rankAddressesRoot);
free(rargs);
+ if (lhTxn)
+ free(lhTxn);
+ if (lhState)
+ free(lhState);
TRACE(NCCL_BOOTSTRAP, "DONE");
return NULL;
@@ -419,57 +488,57 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
return ncclSuccess;
}
-struct unexConn {
- int peer;
- int tag;
- struct ncclSocket sock;
- struct unexConn* next;
-};
-
-struct bootstrapRing_t {
- union {
- struct {
- void *sendComm, *recvComm;
- ncclNetDeviceHandle_t *sendDevHandle, *recvDevHandle;
- } net;
- struct {
- struct ncclSocket recv;
- struct ncclSocket send;
- } socket;
- };
-};
-struct bootstrapListen_t {
- struct ncclSocket peerSocket; // socket for peers to contact me in P2P
- union {
- struct {
- int dev;
- void* comm;
- char handle[NCCL_NET_HANDLE_MAXSIZE];
- } net;
- struct ncclSocket socket; // socket to be used for the ring
- };
-};
-
-struct bootstrapState {
- struct bootstrapRing_t ring;
- struct bootstrapListen_t listen;
- ncclNet_t* net;
- uint64_t* peerProxyAddressesUDS;
- union ncclSocketAddress* peerProxyAddresses;
- union ncclSocketAddress* peerP2pAddresses;
- struct unexConn* unexpectedConnections;
- int cudaDev;
- int rank;
- int nranks;
- uint64_t magic;
- volatile uint32_t* abortFlag;
-};
+// struct unexConn {
+// int peer;
+// int tag;
+// struct ncclSocket sock;
+// struct unexConn* next;
+// };
+
+// struct bootstrapRing_t {
+// union {
+// struct {
+// void *sendComm, *recvComm;
+// ncclNetDeviceHandle_t *sendDevHandle, *recvDevHandle;
+// } net;
+// struct {
+// struct ncclSocket recv;
+// struct ncclSocket send;
+// } socket;
+// };
+// };
+// struct bootstrapListen_t {
+// struct ncclSocket peerSocket; // socket for peers to contact me in P2P
+// union {
+// struct {
+// int dev;
+// void* comm;
+// char handle[NCCL_NET_HANDLE_MAXSIZE];
+// } net;
+// struct ncclSocket socket; // socket to be used for the ring
+// };
+// };
+
+// struct bootstrapState {
+// struct bootstrapRing_t ring;
+// struct bootstrapListen_t listen;
+// ncclNet_t* net;
+// uint64_t* peerProxyAddressesUDS;
+// union ncclSocketAddress* peerProxyAddresses;
+// union ncclSocketAddress* peerP2pAddresses;
+// struct unexConn* unexpectedConnections;
+// int cudaDev;
+// int rank;
+// int nranks;
+// uint64_t magic;
+// volatile uint32_t* abortFlag;
+// };
#define STATE_RING(s, f) (s->ring.f)
#define STATE_LISTEN(s, f) (s->listen.f)
// helper functions
-static ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t magic, struct ncclSocket* socket, union ncclSocketAddress* addr,
- ncclSocketType type) {
+ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t magic, struct ncclSocket* socket, union ncclSocketAddress* addr,
+ ncclSocketType type) {
NCCLCHECK(ncclSocketInit(socket, &bootstrapNetIfAddr, magic, type, comm->abortFlag));
NCCLCHECK(ncclSocketListen(socket));
NCCLCHECK(ncclSocketGetAddr(socket, addr));
@@ -536,8 +605,8 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
}
static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
- void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
- void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {
+ void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
+ void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {
int abortCounter = 0;
do {
@@ -549,13 +618,26 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
} while (!*sendComm || !*recvComm);
return ncclSuccess;
}
-static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket* sendSocket, struct ncclSocket* listenSock, struct ncclSocket* recvSocket, uint64_t magic, volatile uint32_t* abortFlag) {
+ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket* sendSocket, struct ncclSocket* listenSock, struct ncclSocket* recvSocket, uint64_t magic, volatile uint32_t* abortFlag) {
NCCLCHECK(ncclSocketInit(sendSocket, addr, magic, ncclSocketTypeBootstrap, abortFlag));
NCCLCHECK(ncclSocketConnect(sendSocket));
NCCLCHECK(ncclSocketInit(recvSocket));
NCCLCHECK(ncclSocketAccept(recvSocket, listenSock));
return ncclSuccess;
}
+
+// Outgoing half of socketRingConnect: initialises sendSocket as a bootstrap-type
+// socket aimed at addr (tagged with magic, watching abortFlag) and connects it.
+// No accept is performed here. Returns ncclSuccess when both steps pass NCCLCHECK.
+ncclResult_t socketRingConnectPrev(ncclSocketAddress* addr, struct ncclSocket* sendSocket, uint64_t magic, volatile uint32_t* abortFlag) {
+  ncclResult_t status = ncclSocketInit(sendSocket, addr, magic, ncclSocketTypeBootstrap, abortFlag);
+  NCCLCHECK(status);
+  status = ncclSocketConnect(sendSocket);
+  NCCLCHECK(status);
+  return ncclSuccess;
+}
+
+// Incoming half of socketRingConnect: default-initialises recvSocket and accepts
+// one connection on listenSock into it. No outgoing connect is performed here.
+// Returns ncclSuccess when both steps pass NCCLCHECK.
+ncclResult_t socketRingConnectNext(struct ncclSocket* listenSock, struct ncclSocket* recvSocket) {
+  ncclResult_t status = ncclSocketInit(recvSocket);
+  NCCLCHECK(status);
+  status = ncclSocketAccept(recvSocket, listenSock);
+  NCCLCHECK(status);
+  return ncclSuccess;
+}
+
static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* state,
union ncclSocketAddress* peerAddresss,
union ncclSocketAddress* peerProxy, uint64_t* peerUDS,
@@ -619,7 +701,58 @@ NCCL_PARAM(StaggerThreshold, "UID_STAGGER_THRESHOLD", 256);
NCCL_PARAM(RasEnable, "RAS_ENABLE", 1);
-ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
+// Debug helper: logs the contents of a socket-address union at INFO(NCCL_INIT).
+//
+// addr may be NULL, in which case a single "address is empty" line is logged.
+// The family stored in addr->sa.sa_family selects which union member is decoded:
+//   AF_INET  -> family, textual IP, port (host order plus raw network-order value)
+//   AF_INET6 -> the same fields plus flow label and scope id
+//   other    -> only the raw sa_family value
+// NOTE(review): every message ends in "\n"; confirm whether INFO already appends
+// a newline, in which case these produce blank lines in the log.
+void printNcclSocketAddress(union ncclSocketAddress *addr) {
+  if (addr == NULL) {
+    INFO(NCCL_INIT,"地址为空\n");
+    return;
+  }
+
+  // Dispatch on the address family.
+  switch (addr->sa.sa_family) {
+    case AF_INET: {
+      // IPv4 address: use the sin member.
+      struct sockaddr_in *ipv4 = &addr->sin;
+      char ip_str[INET_ADDRSTRLEN];
+
+      // inet_ntop leaves the buffer unspecified on failure; substitute a marker
+      // so the %s below never reads uninitialised bytes.
+      if (inet_ntop(AF_INET, &(ipv4->sin_addr), ip_str, sizeof(ip_str)) == NULL) {
+        ip_str[0] = '?';
+        ip_str[1] = '\0';
+      }
+
+      INFO(NCCL_INIT,"IPv4 地址信息:\n");
+      INFO(NCCL_INIT," 地址族: AF_INET\n");
+      INFO(NCCL_INIT," IP地址: %s\n", ip_str);
+      INFO(NCCL_INIT," 端口号: %d (网络字节序: 0x%x)\n",
+           ntohs(ipv4->sin_port),          // converted to host byte order
+           (unsigned)ipv4->sin_port);      // raw value; cast so it matches %x
+      break;
+    }
+    case AF_INET6: {
+      // IPv6 address: use the sin6 member.
+      struct sockaddr_in6 *ipv6 = &addr->sin6;
+      char ip_str[INET6_ADDRSTRLEN];
+
+      // Same failure handling as the IPv4 branch.
+      if (inet_ntop(AF_INET6, &(ipv6->sin6_addr), ip_str, sizeof(ip_str)) == NULL) {
+        ip_str[0] = '?';
+        ip_str[1] = '\0';
+      }
+
+      INFO(NCCL_INIT,"IPv6 地址信息:\n");
+      INFO(NCCL_INIT," 地址族: AF_INET6\n");
+      INFO(NCCL_INIT," IP地址: %s\n", ip_str);
+      INFO(NCCL_INIT," 端口号: %d (网络字节序: 0x%x)\n",
+           ntohs(ipv6->sin6_port),         // converted to host byte order
+           (unsigned)ipv6->sin6_port);     // raw value; cast so it matches %x
+      INFO(NCCL_INIT," 流标签: %u\n", ntohl(ipv6->sin6_flowinfo));
+      INFO(NCCL_INIT," 作用域ID: %u\n", ipv6->sin6_scope_id);
+      break;
+    }
+    default:
+      // Unknown family: only the raw family value can be reported.
+      INFO(NCCL_INIT,"未知地址类型 (sa_family: %d)\n", addr->sa.sa_family);
+      break;
+  }
+}
+
+ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {//
ncclResult_t result = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
@@ -702,7 +835,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RECV]);
NCCLCHECK(ncclSocketInit(&sock));
NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
- NCCLCHECK(socketRecv(&sock, &nextPeer, sizeof(nextPeer)));
+ NCCLCHECK(socketRecv(&sock, &nextPeer, sizeof(nextPeer)));//很神奇,client连了server为什么还要反过来连
NCCLCHECK(ncclSocketClose(&sock));
NCCLCHECK(ncclSocketClose(&listenSockRoot));
BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RECV]);
@@ -710,9 +843,11 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
// accept and connect the ring network
if (ncclParamBootstrapNetEnable()) {
NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle,
- &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
- &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
+ &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
+ &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
} else {
+ INFO(NCCL_INIT,"nextPeer.addr");
+ printNcclSocketAddress(&nextPeer.addr);
NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag));
}
@@ -759,17 +894,18 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
if (ncclParamRasEnable() == 1 && performRasAddRanks) {
if (ncclRasAddRanks(rasRanks, nranks) != ncclSuccess)
- INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+ INFO(NCCL_INIT|NCCL_RAS|NCCL_INIT, "Continuing in spite of a RAS initialization error");
}
+ INFO(NCCL_INIT,"bootstrapInitNew");
BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_TOTAL]);
TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks);
- INFO(NCCL_BOOTSTRAP | NCCL_PROFILE, "Bootstrap timings total %f (create %f, send %f, recv %f, ring %f, delay %f)", timers[BOOTSTRAP_INIT_TIME_TOTAL] / 1e9,
- timers[BOOTSTRAP_INIT_TIME_CREATE] / 1e9,
- timers[BOOTSTRAP_INIT_TIME_SEND] / 1e9,
- timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9,
- timers[BOOTSTRAP_INIT_TIME_RING] / 1e9,
- timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9);
+ INFO(NCCL_BOOTSTRAP | NCCL_PROFILE | NCCL_INIT, "Bootstrap timings total %f (create %f, send %f, recv %f, ring %f, delay %f)", timers[BOOTSTRAP_INIT_TIME_TOTAL] / 1e9,
+ timers[BOOTSTRAP_INIT_TIME_CREATE] / 1e9,
+ timers[BOOTSTRAP_INIT_TIME_SEND] / 1e9,
+ timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9,
+ timers[BOOTSTRAP_INIT_TIME_RING] / 1e9,
+ timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9);
exit:
return result;
fail:
@@ -777,6 +913,140 @@ fail:
goto exit;
}
+
+// Rebuilds the per-rank bootstrap address tables of a communicator whose
+// bootstrap state (comm->bootstrap) already exists, then exchanges them with
+// ringAllInfo.
+//
+// comm      - communicator; rank, nRanks, cudaDev, abortFlag, ncclNet and magic
+//             are read from it. comm->bootstrap must already point at a
+//             bootstrapState (this function does not allocate it).
+//             NOTE(review): ringAllInfo presumably needs the ring sockets in
+//             that state to be connected already - confirm against callers.
+// isNewRank - true:  allocate fresh nranks-sized tables, create the proxy and
+//                    P2P listen sockets, obtain the UDS value, initialise RAS,
+//                    and start the proxy service after the exchange.
+//             false: grow the existing tables from nranks-1 to nranks entries
+//                    and repoint comm->proxyState at the grown tables.
+//
+// Returns ncclSuccess, ncclInternalError when comm->bootstrap is NULL, or the
+// first error reported by a callee.
+ncclResult_t bootstrapInitNew(ncclComm_t comm, bool isNewRank) {
+  ncclResult_t result = ncclSuccess;
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  struct bootstrapState *state = (bootstrapState *)comm->bootstrap;
+  struct ncclSocket *proxySocket = NULL;
+  bool performRasAddRanks = true;
+  struct rasRankInit* rasRanks = nullptr;
+
+  // The state is owned by the caller; refuse to run without it instead of
+  // dereferencing a NULL pointer below.
+  if (state == NULL) {
+    WARN("bootstrapInitNew: comm->bootstrap is NULL");
+    return ncclInternalError;
+  }
+
+  state->rank = rank;
+  state->nranks = nranks;
+  state->cudaDev = comm->cudaDev;
+  state->abortFlag = comm->abortFlag;
+  state->net = comm->ncclNet;
+
+  if (!isNewRank) {
+    // Existing rank: the tables were sized for nranks-1 entries, grow them by one.
+    INFO(NCCL_INIT,"bootstrapInitNew nranks: %d", state->nranks);
+    NCCLCHECKGOTO(ncclRealloc(&state->peerProxyAddresses, nranks-1, nranks), result, fail);
+    NCCLCHECKGOTO(ncclRealloc(&state->peerProxyAddressesUDS, nranks-1, nranks), result, fail);
+    NCCLCHECKGOTO(ncclRealloc(&state->peerP2pAddresses, nranks-1, nranks), result, fail);
+  } else {
+    // New rank: build every table from scratch and publish our own entries.
+    NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), result, fail);
+    NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), result, fail);
+    NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), result, fail);
+
+    NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), result, fail);
+    NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), result, fail);
+
+    // create a socket for others to reach out (P2P)
+    union ncclSocketAddress peerSocketAddress;
+    NCCLCHECKGOTO(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap), result, fail);
+    NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), result, fail);
+    memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
+  }
+
+  // Initialize RAS (new ranks only)
+  if (isNewRank && ncclParamRasEnable() == 1) {
+    // The RAS thread will take care of freeing the memory allocated below.
+    // Use the goto form so a failed allocation releases proxySocket instead of
+    // returning straight out of the function.
+    NCCLCHECKGOTO(ncclCalloc(&rasRanks, nranks), result, fail);
+    memcpy(&rasRanks[rank].addr, &bootstrapNetIfAddr, sizeof(rasRanks[rank].addr));
+    rasRanks[rank].pid = getpid();
+    rasRanks[rank].cudaDev = comm->cudaDev;
+    rasRanks[rank].nvmlDev = comm->nvmlDev;
+    rasRanks[rank].hostHash = getHostHash();
+    rasRanks[rank].pidHash = getPidHash();
+    if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) {
+      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+      // We should still participate in the ringAllInfo below as the peers will be waiting for us.
+      // Just make sure that the address is clearly invalid...
+      memset(rasRanks+rank, '\0', sizeof(*rasRanks));
+      performRasAddRanks = false;
+    }
+  }
+
+  NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, nullptr), result, fail);
+
+  if (isNewRank) {
+    // Create the service proxy and get the UDS.
+    // NOTE(review): kept as a plain NCCLCHECK on purpose - ncclProxyInit presumably
+    // stores proxySocket, so freeing it on a late failure could leave a dangling
+    // pointer; confirm ownership before converting this to the goto form.
+    NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
+    if (ncclParamRasEnable() == 1 && performRasAddRanks) {
+      if (ncclRasAddRanks(rasRanks, nranks) != ncclSuccess)
+        INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+    }
+  } else {
+    // The proxy already exists: mark its listen socket ready again and point it
+    // at the reallocated address tables.
+    comm->proxyState->listenSock->state = ncclSocketStateReady;
+    comm->proxyState->peerAddresses = state->peerProxyAddresses;
+    comm->proxyState->peerAddressesUDS = state->peerProxyAddressesUDS;
+  }
+
+  INFO(NCCL_INIT, "rank %d nranks %d - bootstrap new DONE", rank, nranks);
+
+exit:
+  return result;
+fail:
+  free(proxySocket);
+  goto exit;
+}
+
+// Debug helper: logs a byte range as one INFO(NCCL_INIT | NCCL_PROFILE) line of
+// the form "<prefix>0x<lowercase hex> (size: <size> bytes)".
+//
+// prefix - optional text placed before "0x"; NULL is treated as empty.
+// data   - bytes to dump; if NULL while size > 0, "<null>" is logged in place
+//          of the hex digits.
+// size   - number of bytes at data.
+//
+// At most 4095 bytes are rendered; longer inputs are cut and marked with "..."
+// while the reported size stays the real one. Oversized input is therefore
+// still logged rather than silently dropped.
+void printBinaryData(const char* prefix, const void* data, size_t size) {
+  static const char hexDigits[] = "0123456789abcdef";
+  const unsigned char *bytes = (const unsigned char *)data;
+  char hex[8192];
+
+  // Two characters per byte plus the terminating NUL.
+  const size_t maxBytes = (sizeof(hex) - 1) / 2;
+  size_t shown = (size < maxBytes) ? size : maxBytes;
+  if (bytes == NULL) shown = 0;
+
+  // Direct table lookup instead of one snprintf call per byte.
+  for (size_t i = 0; i < shown; i++) {
+    hex[2 * i] = hexDigits[bytes[i] >> 4];
+    hex[2 * i + 1] = hexDigits[bytes[i] & 0x0f];
+  }
+  hex[2 * shown] = '\0';
+
+  const char* tail = "";
+  if (bytes == NULL && size > 0) {
+    tail = "<null>";
+  } else if (shown < size) {
+    tail = "...";
+  }
+
+  INFO(NCCL_INIT | NCCL_PROFILE, "%s0x%s%s (size: %zu bytes)\n", prefix ? prefix : "", hex, tail, size);
+}
+
ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
@@ -786,6 +1056,8 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
union ringConnectInfo nextPeer;
struct ncclSocket* proxySocket = NULL;
struct bootstrapState* state;
+ struct LhTxn* lhTxn = NULL;
+ struct LhState* lhState = NULL;
NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail);
state->rank = rank;
@@ -820,14 +1092,48 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
// Get addr from next rank using the parent's connections
NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
+
+ if (rank == 0 || rank == nranks - 1) {
+ if (txnBegin(LH_STATE_PATH, 1, &lhTxn) != 0) {
+ fprintf(stderr, "lighthouse: txnBegin failed");
+ ret = ncclInternalError;
+ goto fail;
+ }
+ if (txnLoad(lhTxn, &lhState) != 0) {
+ fprintf(stderr, "lighthouse: txnLoad failed");
+ ret = ncclInternalError;
+ goto fail;
+ }
+ if (rank == 0) {
+ setFirstRank(lhState, &info.addr, rank, nranks);
+ setMagic(lhState, magic);
+ }
+ else {
+ setLastRank(lhState, &info.addr, rank, nranks);
+ }
+ updateVersion(lhState);
+ printLhState(lhState);
+ if (txnSave(lhTxn, lhState) != 0) {
+ fprintf(stderr, "lighthouse: txnSave failed");
+ ret = ncclInternalError;
+ goto fail;
+ }
+ if (txnEnd(lhTxn) != 0) {
+ fprintf(stderr, "lighthouse: txnEnd failed");
+ ret = ncclInternalError;
+ goto fail;
+ }
+ }
+
if (ncclParamBootstrapNetEnable()) {
NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle,
- &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
- &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
+ &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
+ &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
ret, fail);
} else {
NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag));
}
+
NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail);
memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
@@ -855,6 +1161,10 @@ exit:
return ret;
fail:
free(proxySocket);
+ if (lhTxn)
+ free(lhTxn);
+ if (lhState)
+ free(lhState);
goto exit;
}
@@ -987,9 +1297,9 @@ static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, void* recvC
NCCLCHECKGOTO(netReg(net, sendComm, data, nranks * size, &sendDataHandle), res, exit);
NCCLCHECKGOTO(netReg(net, recvComm, data, nranks * size, &recvDataHandle), res, exit);
/* Simple ring based AllGather
- * At each step i receive data from (rank-i-1) from prev
- * and send previous step's data from (rank-i) to next
- */
+ * At each step i receive data from (rank-i-1) from prev
+ * and send previous step's data from (rank-i) to next
+ */
TRACE(NCCL_BOOTSTRAP, "NetRingAllGather started");
BOOTSTRAP_PROF_OPEN(tFirst);
for (int i = 0; i < nranks - 1; i++) {
@@ -1016,9 +1326,9 @@ static ncclResult_t socketRingAllGather(struct ncclSocket* sendSock, struct nccl
ncclResult_t res = ncclSuccess;
uint64_t tFirst = 0, tRest = 0;
/* Simple ring based AllGather
- * At each step i receive data from (rank-i-1) from prev
- * and send previous step's data from (rank-i) to next
- */
+ * At each step i receive data from (rank-i-1) from prev
+ * and send previous step's data from (rank-i) to next
+ */
TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started");
BOOTSTRAP_PROF_OPEN(tFirst);
for (int i = 0; i < nranks - 1; i++) {
@@ -1063,10 +1373,10 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i
if (nranks == 1)
return ncclSuccess;
/* Simple [intra] process barrier
- *
- * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet,
- * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
- */
+ *
+ * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet,
+ * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
+ */
int data[1] = {0};
for (int mask = 1; mask < nranks; mask <<= 1) {
int src = (rank - mask + nranks) % nranks;
@@ -1185,3 +1495,4 @@ ncclResult_t bootstrapAbort(void* commState) {
NCCLCHECK(bootstrapClose(commState));
return ncclSuccess;
}
+
@@ -1,8 +1,8 @@
/*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
#include "channel.h"
#include "param.h"
@@ -62,6 +62,80 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
return ncclSuccess;
}
+// Variant of initChannel that unconditionally (re)builds the host and device
+// peer tables of comm->channels[channelId].
+//
+// comm      - communicator owning the channel; nRanks, localRanks, sharedRes,
+//             topParentRanks and memPermanent are read from it.
+// channelId - index into comm->channels and into the sharedRes peer tables.
+//
+// Returns ncclSuccess, or the error of the first failing NCCLCHECK. On such an
+// early return the stream acquired below is not released by this function.
+ncclResult_t initChannelNew(struct ncclComm* comm, int channelId) {
+ struct ncclChannel* channel = &comm->channels[channelId];
+ INFO(NCCL_INIT, "start channel:channelId %d channel %p", channelId, channel);
+ // The id is forced to -1 first, so the "already initialised" early return on
+ // the next line can never fire.
+ // NOTE(review): presumably intentional, to force re-initialisation - confirm.
+ channel->id = -1;
+ if (channel->id != -1) return ncclSuccess;
+
+ int nRanks = comm->nRanks;
+ int nvlsRanks = comm->localRanks;
+ INFO(NCCL_INIT, "initChannel comm %p channelId %d nRanks %d nvlsRanks %d", comm, channelId, nRanks, nvlsRanks);
+ int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */ ;
+ channel->id = channelId;
+ channel->workFifoProduced = 0;
+
+ struct ncclSharedResources* sharedRes = comm->sharedRes;
+ //sharedRes->deviceStream = NULL;
+ cudaStream_t deviceStream;
+ // The shared tables below are sized from the current rank count.
+ sharedRes->tpNRanks = comm->nRanks;
+ NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
+ // The previous shared/host peer pointers are overwritten without being freed here.
+ // NOTE(review): confirm the old tables are released elsewhere, otherwise they leak.
+ sharedRes->peers[channelId] = NULL;
+ sharedRes->devPeers[channelId] = NULL;
+ channel->peers = NULL;
+ //sharedRes->peers[channelId] == NULL;
+ //if (channel->peers == NULL) {
+ // The extra on nRanks+1 is for collnet root (i.e. network)
+ // Allocate everything related to sharedRes with ncclCalloc as this can be
+ // shared between communicators hence should not be tied to comm.
+ //if (sharedRes->peers[channelId] == NULL) {
+ INFO(NCCL_INIT,"sharedRes->peers[channelId]");
+ NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks));
+ //}
+ // ???????
+ // Per-channel array of nPeers pointers; only the first nRanks entries are filled below.
+ channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer*>(&comm->memPermanent, nPeers);
+ for (int r = 0; r < nRanks ; r++) {
+ // Each rank's entry points into the shared table at its topParentRanks index.
+ channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r];
+ INFO(NCCL_INIT, "initChannel comm %p channelId %d rank %d peer %p", comm, channelId, r, channel->peers[r]);
+ INFO(NCCL_INIT,"comm->topParentRanks[r] %d", comm->topParentRanks[r]);
+ // Logs the reference count as it is before the increment below.
+ INFO(NCCL_INIT,"ncclAtomicRefCountIncrement : %d",channel->peers[r]->refCount);
+ // Only connector index 0 of send/recv has its transportResources cleared.
+ (channel->peers[r]->send + 0)->transportResources = NULL;
+ (channel->peers[r]->recv + 0)->transportResources = NULL;
+ ncclAtomicRefCountIncrement(&channel->peers[r]->refCount);
+ }
+ //}
+ channel->devPeers = NULL;
+ channel->devPeersHostPtr = NULL;
+ //sharedRes->devPeers[channelId] == NULL;
+ //if (channel->devPeers == NULL) {
+ //if (sharedRes->devPeers[channelId] == NULL) {
+ INFO(NCCL_INIT,"sharedRes->devpeers[channelId]");
+ NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream));
+ //}
+ /* channel->devPeers is not shared, so just free it when calling commFree() */
+ NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream));
+ ncclCommPushCudaFree(comm, channel->devPeers);
+ NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers));
+ for (int r = 0; r < nRanks ; r++) {
+ INFO(NCCL_INIT,"devhuojian");
+ // Address of this rank's entry in the shared device table; copied into
+ // channel->devPeers[r] on the stream and mirrored in devPeersHostPtr[r].
+ uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
+ NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream));
+ channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr;
+ }
+ //}
+
+ // Ring user-rank tables: host side from the permanent memory stack, device side
+ // registered with ncclCommPushCudaFree.
+ channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks );
+ NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks , deviceStream));
+ ncclCommPushCudaFree(comm, channel->devRingUserRanks);
+
+ /* guarantee addr has been copied into channel->devPeers */
+ NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
+ NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
+ // INFO(NCCL_INIT,"shenghli %d",channelId);
+ return ncclSuccess;
+}
+
+
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
struct ncclChannel* channel = &comm->channels[channelId];
struct ncclSharedResources* sharedRes = comm->sharedRes;
@@ -147,8 +221,8 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) {
int nPeers = nRanks + collnetNRanks + nvlsNRanks;
/* channel peers are only valid when async init thread completes commAlloc() and
- * the channel is initialized with initChannel(); if either is not done, this channel
- * should never be free. */
+ * the channel is initialized with initChannel(); if either is not done, this channel
+ * should never be freed. */
if (channel->id == -1 || channel->peers == NULL) return ncclSuccess;
// Free transport proxy resources
@@ -4,515 +4,522 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "comm.h"
-#include "device.h"
-#include "graph.h"
-#include "transport.h"
-#include "trees.h"
-#include "rings.h"
-#include "topo.h"
-
-/******************************************************************/
-/********************* Internode connection ***********************/
-/******************************************************************/
-
-ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
- int rank = comm->rank;
- int localRanks = comm->topo->nodes[GPU].count;
- int nChannels = comm->nChannels;
-
- topoRanks->nvlsHeadNum = 0;
- for (int c=0; c<nChannels; c++) {
- struct ncclChannel* channel = comm->channels+c;
- channel->ring.prev = channel->ring.next = -1;
- channel->tree.up = -1;
- channel->collnetChain.up = -1;
- for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
- for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
- channel->collnetDirect.out = -1;
- channel->collnetDirect.headRank = -1;
- channel->collnetDirect.nHeads = 0;
- channel->collnetDirect.shift = 0;
- for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
- for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
- for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
-
- int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
- int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
- int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
-
- for (int i=0; i<localRanks; i++) {
- if (ringIntra[i] == rank) {
- topoRanks->ringRecv[c] = ringIntra[0];
- topoRanks->ringSend[c] = ringIntra[localRanks-1];
- topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
- topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
- }
- if (treeIntra[i] == rank) {
- int parentIndex = 0;
- int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
- int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
-
- topoRanks->treeToParent[c] = treeIntra[parentIndex];
- topoRanks->treeToChild0[c] = treeIntra[child0Index];
- topoRanks->treeToChild1[c] = treeIntra[child1Index];
- channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
- channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
- }
- if (collNetIntra[i] == rank) {
- channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i-1];
- channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
- }
- }
- }
- // Duplicate channels trees
- struct ncclChannel* channel0 = comm->channels;
- struct ncclChannel* channel1 = channel0+nChannels;
- memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
-
- // Get nvls heads and the number of heads. Duplicate head is not allowed.
- for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
- bool addHead = true;
- int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
-
- for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
- if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
- addHead = false;
- break;
- }
- }
- if (addHead) {
- topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
- }
- }
- memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);
-
- return ncclSuccess;
-}
-
-static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
- int nChannels = comm->nChannels;
- int nNodes = comm->nNodes;
- for (int c=0; c<nChannels; c++) {
- int* recv = ringRecv+c*comm->nNodes;
- int* send = ringSend+c*comm->nNodes;
- int* prev = ringPrev+c*comm->nRanks;
- int* next = ringNext+c*comm->nRanks;
- for (int n=0; n<nNodes; n++) {
- int recvRank = recv[n];
- int prevSendRank = send[(n-1+nNodes)%nNodes];
- prev[recvRank] = prevSendRank;
- int sendRank = send[n];
- int nextRecvRank = recv[(n+1)%nNodes];
- next[sendRank] = nextRecvRank;
- }
- }
- return ncclSuccess;
-}
-
-static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
- for (int n=0; n<nNodes; n++) indexes[n] = ranks[n];
- return ncclSuccess;
-}
-
-static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
- if (u == -1) return ncclSuccess;
- tree->up = indexes[u];
- return ncclSuccess;
-}
-
-static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
- if (d == -1) return ncclSuccess;
- int x = 0;
- while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
- if (x == NCCL_MAX_TREE_ARITY) {
- WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
- return ncclInternalError;
- }
- tree->down[x] = indexes[d];
- return ncclSuccess;
-}
-
-static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
- const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
-
- // Compute tree depth. Not an exact value but a good approximation in most
- // cases
- int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
-
- int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
- int* ttp, *ttc0, *ttc1;
- NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
- for (int c=0; c<nChannels; c++) {
- struct ncclChannel* channel0 = comm->channels+c;
- struct ncclChannel* channel1 = channel0+nChannels;
- ttp = treeToParent+c*comm->nNodes;
- ttc0 = treeToChild0+c*comm->nNodes;
- ttc1 = treeToChild1+c*comm->nNodes;
- if (comm->rank == ttp[node]) {
- NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
- NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
+ #include "comm.h"
+ #include "device.h"
+ #include "graph.h"
+ #include "transport.h"
+ #include "trees.h"
+ #include "rings.h"
+ #include "topo.h"
+
+ /******************************************************************/
+ /********************* Internode connection ***********************/
+ /******************************************************************/
+
+ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) { // Reset each channel's ring/tree/collnet links and record this rank's intra-node neighbours in topoRanks; always returns ncclSuccess.
+ int rank = comm->rank;
+ int localRanks = comm->topo->nodes[GPU].count; // stride of one channel's row in each graph's intra[] array below
+ int nChannels = comm->nChannels;
+
+ topoRanks->nvlsHeadNum = 0;
+ for (int c=0; c<nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ channel->ring.prev = channel->ring.next = -1; // -1 is the "not connected" value used for every link reset here
+ channel->tree.up = -1;
+ channel->collnetChain.up = -1;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
+ channel->collnetDirect.out = -1;
+ channel->collnetDirect.headRank = -1;
+ channel->collnetDirect.nHeads = 0;
+ channel->collnetDirect.shift = 0;
+ for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
+ for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
+ for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
+
+ int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks; // row c of each graph: localRanks entries, compared against rank below
+ int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
+ int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
+
+ for (int i=0; i<localRanks; i++) {
+ if (ringIntra[i] == rank) {
+ topoRanks->ringRecv[c] = ringIntra[0]; // first entry of the intra-node ring order
+ topoRanks->ringSend[c] = ringIntra[localRanks-1]; // last entry of the intra-node ring order
+ topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1]; // -1 when this rank is first in the row
+ topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1]; // -1 when this rank is last in the row
+ }
+ if (treeIntra[i] == rank) {
+ int parentIndex = 0;
+ int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // index 0 only for the plain TREE pattern
+ int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0; // index 1 only for the SPLIT_TREE pattern
+
+ topoRanks->treeToParent[c] = treeIntra[parentIndex];
+ topoRanks->treeToChild0[c] = treeIntra[child0Index];
+ topoRanks->treeToChild1[c] = treeIntra[child1Index];
+ channel->tree.up = i == 0 ? -1 : treeIntra[i-1]; // intra-node tree is a chain following the row order
+ channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
+ }
+ if (collNetIntra[i] == rank) {
+ channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i-1]; // the first rank of the row gets comm->nRanks as its up peer instead of -1
+ channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
+ }
}
- if (comm->rank == ttc0[node]) {
- NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
- NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
+ }
+ // Duplicate channels trees
+ struct ncclChannel* channel0 = comm->channels;
+ struct ncclChannel* channel1 = channel0+nChannels;
+ memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); // NOTE(review): writes channels [nChannels, 2*nChannels) - assumes comm->channels has room for them; confirm
+
+ // Get nvls heads and the number of heads. Duplicate head is not allowed.
+ for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
+ bool addHead = true;
+ int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
+
+ for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
+ if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
+ addHead = false;
+ break;
+ }
}
- if (comm->rank == ttc1[node]) {
- NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
- NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
+ if (addHead) {
+ topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0]; // the head of a channel is the first entry of its intra row
}
- if (comm->rank == ttp[node] ||
- comm->rank == ttc0[node] ||
- comm->rank == ttc1[node]) {
- INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
- INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
+ }
+ memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum); // mirror the de-duplicated head list into the communicator
+
+ return ncclSuccess;
+ }
+
+ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) { // Per channel, join consecutive nodes: recv[n] gets the previous node's send rank as prev, send[n] gets the next node's recv rank as next. Always returns ncclSuccess.
+ int nChannels = comm->nChannels;
+ int nNodes = comm->nNodes;
+ for (int c=0; c<nChannels; c++) {
+ int* recv = ringRecv+c*comm->nNodes; // ringRecv/ringSend rows are indexed by node (nNodes entries per channel)
+ int* send = ringSend+c*comm->nNodes;
+ int* prev = ringPrev+c*comm->nRanks; // ringPrev/ringNext rows are indexed by rank (nRanks entries per channel)
+ int* next = ringNext+c*comm->nRanks;
+ for (int n=0; n<nNodes; n++) {
+ int recvRank = recv[n];
+ int prevSendRank = send[(n-1+nNodes)%nNodes]; // +nNodes keeps the index non-negative for n == 0, wrapping to the last node
+ prev[recvRank] = prevSendRank;
+ int sendRank = send[n];
+ int nextRecvRank = recv[(n+1)%nNodes]; // wraps from the last node back to node 0
+ next[sendRank] = nextRecvRank;
}
- channel0->tree.depth = channel1->tree.depth = depth;
- }
+ }
+ return ncclSuccess;
+ }
+
+ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
+ for (int n=0; n<nNodes; n++) indexes[n] = ranks[n];
return ncclSuccess;
-}
-
-static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
- int rank = comm->rank;
- int localRanks = comm->localRanks;
- int nHeads = 0;
- int *heads;
- NCCLCHECK(ncclCalloc(&heads, localRanks));
- // Find all head ranks
- // Head index is always 0
- for (int c=0; c<collNetGraph->nChannels; c++) {
- int* collNetIntra = collNetGraph->intra+c*localRanks;
- int head = collNetIntra[0];
- for (int h=0; h<nHeads; h++) if (heads[h] == head) head = -1;
- if (head != -1) heads[nHeads++] = collNetIntra[0];
- }
- // For all channels
- for (int c=0; c<comm->nChannels; c++) {
- struct ncclChannel* channel = comm->channels+c;
- char line[1024];
- sprintf(line, "CollNetDirect channel %d rank %d ", c, rank);
- int nDown = 0;
- for (int i=0; i<nHeads; i++) {
- if (rank == heads[i]) { // is head
- channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
- channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
- int* collNetIntra = collNetGraph->intra+i*localRanks;
- sprintf(line+strlen(line), "down ");
- for (int r=0; r<localRanks; r++) {
- if (collNetIntra[r] == rank) continue;
- channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
- sprintf(line+strlen(line), " %d ", collNetIntra[r]);
- }
- sprintf(line+strlen(line), "nDown %d ", nDown);
- break;
+ }
+
+ static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) { // Set tree->up to indexes[u]; always returns ncclSuccess.
+ if (u == -1) return ncclSuccess; // u == -1: nothing to set, tree->up keeps its current value
+ tree->up = indexes[u]; // no bounds check: u is used directly as an index into indexes[]
+ return ncclSuccess;
+ }
+
+ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) { // Store indexes[d] in the first free tree->down[] slot; returns ncclInternalError when all slots are taken.
+ if (d == -1) return ncclSuccess; // d == -1: no child to add, tree->down[] is left untouched
+ int x = 0;
+ while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++; // a negative entry marks a free slot
+ if (x == NCCL_MAX_TREE_ARITY) { // every slot already holds a non-negative rank
+ WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
+ return ncclInternalError;
+ }
+ tree->down[x] = indexes[d]; // no bounds check: d is used directly as an index into indexes[]
+ return ncclSuccess;
+ }
+
+ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
+ const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
+
+ // Compute tree depth. Not an exact value but a good approximation in most
+ // cases
+ int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
+
+ int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
+ int* ttp, *ttc0, *ttc1;
+ NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
+ for (int c=0; c<nChannels; c++) {
+ struct ncclChannel* channel0 = comm->channels+c;
+ struct ncclChannel* channel1 = channel0+nChannels;
+ ttp = treeToParent+c*comm->nNodes;
+ ttc0 = treeToChild0+c*comm->nNodes;
+ ttc1 = treeToChild1+c*comm->nNodes;
+ if (comm->rank == ttp[node]) {
+ NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
+ NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
- }
- // Connect to all heads
- int nUp = 0;
- sprintf(line+strlen(line), "up ");
- for (int h=0; h<nHeads; h++) {
- if (rank == heads[h]) continue;
- channel->collnetDirect.up[nUp++] = heads[h];
- sprintf(line+strlen(line), " %d ", heads[h]);
- }
- sprintf(line+strlen(line), "heads ");
- { // heads[] is the list of heads ordered in head order startubg with self
- int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank;
- for (int h1=0; h1 < nHeads; h1++) {
- int h = (h0+h1)%nHeads;
- channel->collnetDirect.heads[h1] = heads[h];
- sprintf(line+strlen(line), " %d ", heads[h]);
+ if (comm->rank == ttc0[node]) {
+ NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
+ NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
- }
- channel->collnetDirect.nHeads = nHeads;
- // nHeads should always be greater than 0.
- // coverity[divide_by_zero]
- channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
- channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
- sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
- sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
- INFO(NCCL_GRAPH, "%s", line);
- channel->collnetChain.depth = comm->nRanks/comm->nNodes;
- }
- free(heads);
- return ncclSuccess;
-}
-
-static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) {
- int headRank = -1;
- if (nHeads == 0) {
- comm->nvlsChannels = 0;
- return ncclSuccess;
- }
-
- for (int h = 0; h < nHeads; h++) {
- if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
- }
-
- for (int c=0; c<comm->nChannels; c++) {
- struct ncclChannel* channel = comm->channels+c;
- channel->nvls.nHeads = nHeads;
- for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
- for (int h=nHeads; h<NCCL_MAX_NVLS_ARITY; h++) channel->nvls.up[h] = -1;
- channel->nvls.down = comm->nRanks+1+headRank;
- channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
- channel->nvls.headRank = headRank;
- channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
- if (comm->config.collnetEnable && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
- }
- if (comm->nNodes == 1) return ncclSuccess;
-
- // Connect Trees
- int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
- int pc0, pc1; // ignored
- NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node,
- &tree0Parent, &tree0Child0, &tree0Child1, &pc0,
- &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
-
- int* heads = NULL;
- int treeUp[2] = { -1, -1 };
- int treeDown0[2] = { -1, -1 };
- int treeDown1[2] = { -1, -1 };
-
- if (comm->node == 0) {
- for (int h=0; h<nHeads; h++) {
- char line[1024];
- sprintf(line, "NVLS Head %2d:", h);
- heads = nvlsHeads+h*comm->nNodes;
- for (int n=0; n<comm->nNodes && n<20; n++) {
- sprintf(line+strlen(line), " %2d", heads[n]);
+ if (comm->rank == ttc1[node]) {
+ NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
+ NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
- INFO(NCCL_INIT, "%s", line);
- }
- }
-
- // Find the heads where I'm the head rank and retain tree up/down
- for (int h=0; h<nHeads; h++) {
- heads = nvlsHeads+h*comm->nNodes;
- if (heads[comm->node] == comm->rank) {
- treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent];
- treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
- treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
- treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
- treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
- treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
- break;
- }
- }
- // Set prev/next in all channels (NVLS compute channels work
- // orthogonally to NVLS search channels).
- for (int c=0; c<comm->nChannels; c++) {
- struct ncclChannel* channel = comm->channels+c;
- channel->nvls.treeUp = treeUp[c%2];
- channel->nvls.treeDown[0] = channel->nvls.down;
- int ix = 1;
- if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2];
- if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2];
- }
-
- struct ncclNvls* nvls0 = &comm->channels[0].nvls;
- struct ncclNvls* nvls1 = &comm->channels[1].nvls;
- INFO(NCCL_GRAPH, "NVLS Trees : %d/%d/%d->%d->%d %d/%d/%d->%d->%d",
- nvls0->treeDown[0], nvls0->treeDown[1], nvls0->treeDown[2], comm->rank, nvls0->treeUp,
- nvls1->treeDown[0], nvls1->treeDown[1], nvls1->treeDown[2], comm->rank, nvls1->treeUp);
- return ncclSuccess;
-}
-
-// Legacy naming
-NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
-NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
-// New naming
-NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
-NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
-
-int ncclMinNchannels() {
- int minNchannels = 0;
- if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
- if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
- if (minNchannels > MAXCHANNELS) {
- INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
- minNchannels = MAXCHANNELS;
- }
- if (minNchannels < 0) minNchannels = 0;
- return minNchannels;
-}
-
-extern int64_t ncclParamWorkArgsBytes();
-
-int ncclMaxNchannels() {
- int maxNchannels = MAXCHANNELS;
- if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings();
- if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
- maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()));
- if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
- if (maxNchannels < 1) {
- INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a maximum of %d channels, setting it to 1", maxNchannels);
- maxNchannels = 1;
- }
- return maxNchannels;
-}
-
-static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
- int nranks = comm->nRanks;
- int c;
- for (c=start; c<end; c++) {
- memcpy(ringPrev+c*nranks, ringPrev+(c-start)*nranks, nranks*sizeof(int));
- memcpy(ringNext+c*nranks, ringNext+(c-start)*nranks, nranks*sizeof(int));
- memcpy(comm->channels+c, comm->channels+c-start, sizeof(struct ncclChannel));
- }
- return c;
-}
-
-void exchangeValues(int* v0, int* v1) {
- int tmp = *v1;
- *v1 = *v0;
- *v0 = tmp;
-}
-
-NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
-
-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
- // Gather data from all ranks
- ncclResult_t ret = ncclSuccess;
- int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
- int nranks = comm->nRanks;
- int nNodes = comm->nNodes;
- int nChannels = comm->nChannels;
- int minHeadNum = INT_MAX;
- int shared = parent && parent->nvlsSupport && parent->shareResources;
- NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
- NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
- NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
- NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
- NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
- NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
- NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
- NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
-
- // Alternate rings to avoid crossing rails
- if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
- for (int r=0; r<comm->nRanks; r++) {
- if (comm->rankToNode[r] % 2 == 1) {
- // Exchange rings
- for (int c=0; c<nChannels; c+=2) {
- exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
- exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
- exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
- exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
- }
+ if (comm->rank == ttp[node] ||
+ comm->rank == ttc0[node] ||
+ comm->rank == ttc1[node]) {
+ INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
+ INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
}
- }
- }
-
- for (int c=0; c<nChannels;c++) {
- for (int n=0; n<nNodes; n++) {
- int r = firstRanks[n];
- ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
- ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
- treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
- treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
- treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
- }
- for (int r=0; r<nranks; r++) {
- ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
- ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
- }
- }
-
- for (int n = 0; n < nNodes; n++) {
- int r = firstRanks[n];
- if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
- minHeadNum = allTopoRanks[r]->nvlsHeadNum;
- }
-
- for (int c = 0; c < minHeadNum; c++) {
- for (int n = 0; n < nNodes; n++) {
- int r = firstRanks[n];
- nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
- }
- }
-
- // Connect rings and trees. This should also duplicate the channels.
- NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail);
- NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
-
- // Duplicate ringPrev/ringNext for ncclBuildRing
- memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
- memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
-
- // Set ring prev/next for my rank
- for (int c=0; c<nChannels; c++) {
- struct ncclChannel* channel0 = comm->channels+c;
- struct ncclChannel* channel1 = channel0+nChannels;
- channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
- channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
- }
-
- // Duplication should be complete now
- nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
-
- // Setup CollNet
- if (comm->config.collnetEnable) {
- struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
- // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
- if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
- int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
- nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
- }
- NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
- }
-
- // Use 4 compute channels per search channel to reach peak BW on <8 PPN
- if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
- nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
- }
-
- // Double the number of channels when using unpack networking (greater than 1 node)
- // We won't automatically double past 16 channels, users can specify 32 if they want
- if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
- nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
- }
-
- // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
- // We permit combining max, then min, to only use the first channels, then duplicate them.
- if (comm->sharedRes->owner != comm) {
- /* child comm #channels cannot exceed top parent #channels. */
- nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
- nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
- } else {
- nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
- nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
- }
-
- comm->collChannels = comm->nChannels;
-#if CUDART_VERSION >= 12010
- // Support maximal channel usage for aggregation
- if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
- comm->nvlsChannels = parent->nvlsResources->nChannels;
- }
- if (comm->nChannels < comm->nvlsChannels) {
- nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
- }
- NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
-#endif
- if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
- nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
- comm->collChannels = std::min(comm->collChannels, comm->nChannels);
- }
-
- // Create rings array and check all is fine
- NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
-
-exit:
- if (ringRecv) free(ringRecv);
- if (ringSend) free(ringSend);
- if (ringPrev) free(ringPrev);
- if (ringNext) free(ringNext);
- if (treeToParent) free(treeToParent);
- if (treeToChild0) free(treeToChild0);
- if (treeToChild1) free(treeToChild1);
- if (nvlsHeads) free(nvlsHeads);
- return ret;
-fail:
- goto exit;
-}
+ channel0->tree.depth = channel1->tree.depth = depth;
+ }
+ return ncclSuccess;
+ }
+
+ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) { // Fill collnetDirect (headRank/out/down/up/heads/nHeads/shift/depth) and collnetChain.depth for every channel of comm, logging one summary line per channel.
+ int rank = comm->rank;
+ int localRanks = comm->localRanks;
+ int nHeads = 0;
+ int *heads;
+ NCCLCHECK(ncclCalloc(&heads, localRanks)); // released by free(heads) at the end; no early return in between
+ // Find all head ranks
+ // Head index is always 0
+ for (int c=0; c<collNetGraph->nChannels; c++) {
+ int* collNetIntra = collNetGraph->intra+c*localRanks;
+ int head = collNetIntra[0];
+ for (int h=0; h<nHeads; h++) if (heads[h] == head) head = -1; // already recorded: mark as duplicate
+ if (head != -1) heads[nHeads++] = collNetIntra[0];
+ }
+ // For all channels
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ char line[1024]; // NOTE(review): appended to below with unbounded sprintf; nothing checks the 1024-byte limit
+ sprintf(line, "CollNetDirect channel %d rank %d ", c, rank);
+ int nDown = 0;
+ for (int i=0; i<nHeads; i++) {
+ if (rank == heads[i]) { // is head
+ channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
+ channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
+ int* collNetIntra = collNetGraph->intra+i*localRanks; // NOTE(review): row selected by head index i; equals the channel row only if no duplicate head was skipped above - confirm
+ sprintf(line+strlen(line), "down ");
+ for (int r=0; r<localRanks; r++) {
+ if (collNetIntra[r] == rank) continue;
+ channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
+ sprintf(line+strlen(line), " %d ", collNetIntra[r]);
+ }
+ sprintf(line+strlen(line), "nDown %d ", nDown);
+ break;
+ }
+ }
+ // Connect to all heads
+ int nUp = 0;
+ sprintf(line+strlen(line), "up ");
+ for (int h=0; h<nHeads; h++) {
+ if (rank == heads[h]) continue; // a head does not list itself as an up peer
+ channel->collnetDirect.up[nUp++] = heads[h];
+ sprintf(line+strlen(line), " %d ", heads[h]);
+ }
+ sprintf(line+strlen(line), "heads ");
+ { // heads[] is the list of heads ordered in head order starting with self
+ int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank; // a non-head rank starts the rotation at heads[0]
+ for (int h1=0; h1 < nHeads; h1++) {
+ int h = (h0+h1)%nHeads;
+ channel->collnetDirect.heads[h1] = heads[h];
+ sprintf(line+strlen(line), " %d ", heads[h]);
+ }
+ }
+ channel->collnetDirect.nHeads = nHeads;
+ // nHeads should always be greater than 0.
+ // coverity[divide_by_zero]
+ channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
+ channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2; // depth 1 only when this rank has no up and no down peers
+ sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
+ sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
+ INFO(NCCL_GRAPH, "%s", line);
+ channel->collnetChain.depth = comm->nRanks/comm->nNodes;
+ }
+ free(heads);
+ return ncclSuccess;
+ }
+
+ // Fills in the NVLS fields of every channel in `comm`.
+ //
+ // nvlsHeads is read as a table indexed [head * comm->nNodes + node] whose entries
+ // are compared against comm->rank; nHeads is the number of heads in that table.
+ // With nHeads == 0 the function only sets comm->nvlsChannels to 0 and returns.
+ // On a single node it stops after the per-channel head setup; with several nodes
+ // it also derives the inter-node tree links (treeUp / treeDown) from ncclGetDtree().
+ // Returns ncclSuccess on the paths visible here; NCCLCHECK presumably propagates a
+ // ncclGetDtree() failure to the caller (macro defined elsewhere - confirm).
+ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) {
+ int headRank = -1;
+ if (nHeads == 0) {
+ comm->nvlsChannels = 0;
+ return ncclSuccess;
+ }
+
+ // Find which head (if any) this rank is on its own node. headRank stays -1 when
+ // this rank is not listed; if it is listed several times the last match wins.
+ for (int h = 0; h < nHeads; h++) {
+ if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
+ }
+
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ channel->nvls.nHeads = nHeads;
+ // up[] and down hold values starting at nRanks+1, i.e. outside the 0..nRanks-1
+ // rank range; unused up[] slots are -1. When headRank is -1, down evaluates to
+ // comm->nRanks.
+ for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
+ for (int h=nHeads; h<NCCL_MAX_NVLS_ARITY; h++) channel->nvls.up[h] = -1;
+ channel->nvls.down = comm->nRanks+1+headRank;
+ channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
+ channel->nvls.headRank = headRank;
+ // Tree links default to -1; they are only overwritten below for multi-node comms.
+ channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
+ // With CollNet enabled, a head rank gets out = comm->nRanks instead of -1.
+ if (comm->config.collnetEnable && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
+ }
+ if (comm->nNodes == 1) return ncclSuccess;
+
+ // Connect Trees
+ int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
+ int pc0, pc1; // ignored
+ NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node,
+ &tree0Parent, &tree0Child0, &tree0Child1, &pc0,
+ &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
+
+ int* heads = NULL;
+ // Index 0 / 1 of these arrays correspond to the first / second tree returned above.
+ int treeUp[2] = { -1, -1 };
+ int treeDown0[2] = { -1, -1 };
+ int treeDown1[2] = { -1, -1 };
+
+ // Only node 0 logs the head table; at most the first 20 nodes are printed per head.
+ if (comm->node == 0) {
+ for (int h=0; h<nHeads; h++) {
+ char line[1024];
+ sprintf(line, "NVLS Head %2d:", h);
+ heads = nvlsHeads+h*comm->nNodes;
+ for (int n=0; n<comm->nNodes && n<20; n++) {
+ sprintf(line+strlen(line), " %2d", heads[n]);
+ }
+ INFO(NCCL_INIT, "%s", line);
+ }
+ }
+
+ // Find the heads where I'm the head rank and retain tree up/down
+ // The parent/child values from ncclGetDtree() are used as node indices into heads[];
+ // -1 is passed through unchanged. Only the first matching head is used (break).
+ for (int h=0; h<nHeads; h++) {
+ heads = nvlsHeads+h*comm->nNodes;
+ if (heads[comm->node] == comm->rank) {
+ treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent];
+ treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
+ treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
+ treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
+ treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
+ treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
+ break;
+ }
+ }
+ // Set prev/next in all channels (NVLS compute channels work
+ // orthogonally to NVLS search channels).
+ // Channel c uses tree c%2. treeDown[0] is always nvls.down; the existing children
+ // are then packed into the following slots without gaps.
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ channel->nvls.treeUp = treeUp[c%2];
+ channel->nvls.treeDown[0] = channel->nvls.down;
+ int ix = 1;
+ if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2];
+ if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2];
+ }
+
+ // Log the resulting trees of channels 0 and 1 only.
+ struct ncclNvls* nvls0 = &comm->channels[0].nvls;
+ struct ncclNvls* nvls1 = &comm->channels[1].nvls;
+ INFO(NCCL_GRAPH, "NVLS Trees : %d/%d/%d->%d->%d %d/%d/%d->%d->%d",
+ nvls0->treeDown[0], nvls0->treeDown[1], nvls0->treeDown[2], comm->rank, nvls0->treeUp,
+ nvls1->treeDown[0], nvls1->treeDown[1], nvls1->treeDown[2], comm->rank, nvls1->treeUp);
+ return ncclSuccess;
+ }
+
+ // Legacy naming
+ NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
+ NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
+ // New naming
+ NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
+ NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
+
+ // Returns the user-requested minimum channel count, clamped to [0, MAXCHANNELS].
+ // The legacy MIN_NRINGS parameter is consulted first and MIN_NCHANNELS overrides
+ // it; -2 marks a parameter as unset. With neither set the result is 0.
+ int ncclMinNchannels() {
+   const int64_t legacyMin = ncclParamMinNrings();
+   const int64_t namedMin = ncclParamMinNchannels();
+
+   int requested = 0;
+   if (legacyMin != -2) requested = legacyMin;
+   if (namedMin != -2) requested = namedMin;
+
+   if (requested > MAXCHANNELS) {
+     INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a minimum of %d channels, limiting to %d", requested, MAXCHANNELS);
+     requested = MAXCHANNELS;
+   } else if (requested < 0) {
+     requested = 0;
+   }
+   return requested;
+ }
+
+ extern int64_t ncclParamWorkArgsBytes();
+
+ // Returns the maximum channel count to use, always within [1, MAXCHANNELS].
+ // Starts from MAXCHANNELS, lets MAX_NRINGS (legacy) and then MAX_NCHANNELS
+ // override it (-2 marks a parameter as unset), and caps the result by
+ // ncclDevMaxChannelsForArgsBytes() of the work-args size parameter.
+ int ncclMaxNchannels() {
+   const int64_t legacyMax = ncclParamMaxNrings();
+   const int64_t namedMax = ncclParamMaxNchannels();
+
+   int limit = MAXCHANNELS;
+   if (legacyMax != -2) limit = legacyMax;
+   if (namedMax != -2) limit = namedMax;
+
+   const int argsLimit = ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes());
+   limit = std::min(limit, argsLimit);
+   if (limit > MAXCHANNELS) limit = MAXCHANNELS;
+
+   if (limit >= 1) return limit;
+
+   INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a maximum of %d channels, setting it to 1", limit);
+   return 1;
+ }
+
+ // Extends the channel set from `start` to `end` channels by replicating existing ones:
+ // channel c (start <= c < end) becomes a copy of channel c-start, both in
+ // comm->channels and in the ringPrev / ringNext tables (one row of comm->nRanks
+ // ints per channel). Returns the resulting channel count, which is `start` when
+ // end <= start (nothing is copied in that case).
+ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
+   const int nranks = comm->nRanks;
+   int dst = start;
+   while (dst < end) {
+     const int src = dst - start;
+     memcpy(ringPrev + dst*nranks, ringPrev + src*nranks, nranks*sizeof(int));
+     memcpy(ringNext + dst*nranks, ringNext + src*nranks, nranks*sizeof(int));
+     memcpy(&comm->channels[dst], &comm->channels[src], sizeof(struct ncclChannel));
+     dst++;
+   }
+   return dst;
+ }
+
+ // Swaps the two ints pointed to by v0 and v1.
+ void exchangeValues(int* v0, int* v1) {
+   const int first = *v0;
+   *v0 = *v1;
+   *v1 = first;
+ }
+
+ NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
+
+ // Builds the final channel layout of `comm` from the per-rank topology data.
+ //
+ // Inputs, as they are used in this function:
+ //   firstRanks[n]    rank whose allTopoRanks entry supplies the per-node values of node n
+ //   treePatterns     passed through to connectTrees()
+ //   allTopoRanks[r]  per-rank ring / tree / NVLS-head data, indexed by rank (may be
+ //                    modified in place by the rail-alternation step below)
+ //   rings            passed to ncclBuildRings()
+ //   graphs           per-algorithm graphs, indexed by NCCL_ALGO_*
+ //   parent           optional parent communicator, only consulted for the NVLS and
+ //                    shared-resource channel limits
+ // Side effects: updates comm->channels, comm->nChannels, comm->collChannels and
+ // possibly comm->nvlsChannels, and allocates comm->ringPrev / comm->ringNext.
+ // Returns `ret`, which starts as ncclSuccess and is set by NCCLCHECKGOTO on failure;
+ // the temporary tables are released at `exit` on both paths.
+ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
+ // Gather data from all ranks
+ ncclResult_t ret = ncclSuccess;
+ int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
+ int nranks = comm->nRanks;
+ int nNodes = comm->nNodes;
+ int nChannels = comm->nChannels;
+ int minHeadNum = INT_MAX;
+ int shared = parent && parent->nvlsSupport && parent->shareResources;
+ // Per-node tables hold nNodes entries per channel, per-rank tables nranks entries per
+ // channel; all are sized for MAXCHANNELS channels. The first allocation uses NCCLCHECK
+ // because nothing needs freeing yet; later ones go through the `fail` path.
+ NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
+ NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
+
+ // Alternate rings to avoid crossing rails
+ // For ranks on odd-numbered nodes, swap the ring data of each channel pair (c, c^1).
+ // This modifies the caller's allTopoRanks entries in place.
+ if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
+ for (int r=0; r<comm->nRanks; r++) {
+ if (comm->rankToNode[r] % 2 == 1) {
+ // Exchange rings
+ for (int c=0; c<nChannels; c+=2) {
+ exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
+ exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
+ exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
+ exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
+ }
+ }
+ }
+ }
+
+ // Transpose the per-rank data into per-channel tables: per-node values come from
+ // each node's firstRanks[] entry, prev/next values from every rank.
+ for (int c=0; c<nChannels;c++) {
+ for (int n=0; n<nNodes; n++) {
+ int r = firstRanks[n];
+ ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
+ ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
+ treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
+ treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
+ treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
+ }
+ for (int r=0; r<nranks; r++) {
+ ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
+ ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
+ }
+ }
+
+ // Use the smallest NVLS head count reported by any node so that every node has an
+ // entry for each head that is kept.
+ for (int n = 0; n < nNodes; n++) {
+ int r = firstRanks[n];
+ if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
+ minHeadNum = allTopoRanks[r]->nvlsHeadNum;
+ }
+
+ // nvlsHeads is laid out as [head * nNodes + node], the layout connectNvls() reads.
+ for (int c = 0; c < minHeadNum; c++) {
+ for (int n = 0; n < nNodes; n++) {
+ int r = firstRanks[n];
+ nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
+ }
+ }
+
+ // Connect rings and trees. This should also duplicate the channels.
+ NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail);
+ NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
+
+ // Duplicate ringPrev/ringNext for ncclBuildRing
+ // NOTE(review): writes rows nChannels..2*nChannels-1 of tables that hold MAXCHANNELS
+ // rows, so this assumes 2*nChannels <= MAXCHANNELS at this point - confirm.
+ memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
+ memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
+
+ // Set ring prev/next for my rank
+ // channel1 is the duplicate of channel0, located nChannels entries further on.
+ for (int c=0; c<nChannels; c++) {
+ struct ncclChannel* channel0 = comm->channels+c;
+ struct ncclChannel* channel1 = channel0+nChannels;
+ channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
+ channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
+ }
+
+ // Duplication should be complete now
+ nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
+
+ // Setup CollNet
+ if (comm->config.collnetEnable) {
+ struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
+ // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
+ if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
+ int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
+ nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
+ }
+ NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
+ }
+
+ // Use 4 compute channels per search channel to reach peak BW on <8 PPN
+ if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
+ nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
+ }
+
+ // Double the number of channels when using unpack networking (greater than 1 node)
+ // We won't automatically double past 16 channels, users can specify 32 if they want
+ if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
+ nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
+ }
+
+ // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
+ // We permit combining max, then min, to only use the first channels, then duplicate them.
+ // In both branches the count is first reduced to the maximum, then copyChannels()
+ // replicates channels up to the minimum (it is a no-op when the minimum is not larger).
+ if (comm->sharedRes->owner != comm) {
+ /* child comm #channels cannot exceed top parent #channels. */
+ nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
+ nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
+ } else {
+ nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
+ nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
+ }
+
+ // collChannels records the channel count before any NVLS-driven growth below.
+ comm->collChannels = comm->nChannels;
+ #if CUDART_VERSION >= 12010
+ // Support maximal channel usage for aggregation
+ if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
+ comm->nvlsChannels = parent->nvlsResources->nChannels;
+ }
+ if (comm->nChannels < comm->nvlsChannels) {
+ nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
+ }
+ NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
+ #endif
+ if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
+ nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
+ comm->collChannels = std::min(comm->collChannels, comm->nChannels);
+ }
+ // Keep a communicator-owned copy of the complete prev/next tables (all MAXCHANNELS
+ // rows); the local tables are freed at `exit`.
+ // NOTE(review): comm->ringPrev / comm->ringNext are overwritten without being freed
+ // first - confirm this function runs at most once per communicator.
+ NCCLCHECKGOTO(ncclCalloc(&comm->ringPrev, comm->nRanks * MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->ringNext, comm->nRanks * MAXCHANNELS), ret, fail);
+ for (int j = 0; j < nranks * MAXCHANNELS; j++) {
+ comm->ringPrev[j] = ringPrev[j];
+ comm->ringNext[j] = ringNext[j];
+ //INFO(NCCL_INIT,"j:%d,ringPrev:%d,ringNext:%d",j,ringPrev[j],ringNext[j]);
+ }
+ // Create rings array and check all is fine
+ NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
+
+ // Common cleanup for success and failure: release the temporary tables.
+ exit:
+ if (ringRecv) free(ringRecv);
+ if (ringSend) free(ringSend);
+ if (ringPrev) free(ringPrev);
+ if (ringNext) free(ringNext);
+ if (treeToParent) free(treeToParent);
+ if (treeToChild0) free(treeToChild0);
+ if (treeToChild1) free(treeToChild1);
+ if (nvlsHeads) free(nvlsHeads);
+ return ret;
+ fail:
+ goto exit;
+ }
+
\ No newline at end of file
@@ -1,8 +1,8 @@
/*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
#ifndef NCCL_BOOTSTRAP_H_
#define NCCL_BOOTSTRAP_H_
@@ -16,10 +16,69 @@ struct ncclBootstrapHandle {
};
static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
+// extern union ncclSocketAddress bootstrapNetIfAddr;
+
+// Singly linked list node (chained through `next`) that stores a socket together
+// with the peer and tag values associated with it.
+// bootstrapState::unexpectedConnections points at a list of these nodes.
+struct unexConn {
+ int peer;
+ int tag;
+ struct ncclSocket sock;
+ struct unexConn *next;
+};
+
+// Ring endpoints of the bootstrap state. The two alternatives share storage
+// (anonymous union): either network send/recv comms with their device handles,
+// or a recv/send socket pair.
+// NOTE(review): nothing in this struct records which alternative is active;
+// presumably it follows whether bootstrapState::net is in use - confirm.
+struct bootstrapRing_t {
+ union {
+ struct {
+ void *sendComm, *recvComm;
+ ncclNetDeviceHandle_t *sendDevHandle, *recvDevHandle;
+ } net;
+ struct {
+ struct ncclSocket recv;
+ struct ncclSocket send;
+ } socket;
+ };
+};
+// Listening endpoints of the bootstrap state: one socket for peer-to-peer contact
+// plus the ring listener, which is either a network listen comm (device index,
+// comm and handle bytes) or a socket. The ring alternatives share storage
+// (anonymous union).
+struct bootstrapListen_t {
+ struct ncclSocket peerSocket; // socket for peers to contact me in P2P
+ union {
+ struct {
+ int dev;
+ void *comm;
+ char handle[NCCL_NET_HANDLE_MAXSIZE];
+ } net;
+ struct ncclSocket socket; // socket to be used for the ring
+ };
+};
+
+// Per-communicator bootstrap state.
+// The serializer in serialize.h copies this struct byte-for-byte and treats
+// peerProxyAddressesUDS, peerProxyAddresses and peerP2pAddresses as arrays of
+// `nranks` elements each, so the field layout must stay in sync with it.
+struct bootstrapState {
+ struct bootstrapRing_t ring;
+ struct bootstrapListen_t listen;
+ ncclNet_t *net;
+ uint64_t *peerProxyAddressesUDS; // nranks entries (per serialize.h)
+ union ncclSocketAddress *peerProxyAddresses; // nranks entries (per serialize.h)
+ union ncclSocketAddress *peerP2pAddresses; // nranks entries (per serialize.h)
+ struct unexConn *unexpectedConnections; // head of a linked list of unexConn nodes
+ int cudaDev;
+ int rank;
+ int nranks;
+ uint64_t magic;
+ volatile uint32_t *abortFlag;
+};
+
+// typedef struct {
+// int socket_fd;
+// int rank_id;
+// char ip_address[INET_ADDRSTRLEN];
+// } RankConnection;
+// Accessors for field `f` of the ring / listen sub-state of the bootstrapState
+// pointer expression `s`. `s` is parenthesized so that compound pointer
+// expressions such as `states + 1` or `cond ? a : b` expand correctly.
+#define STATE_RING(s, f) ((s)->ring.f)
+#define STATE_LISTEN(s, f) ((s)->listen.f)
+ncclResult_t socketRingConnectPrev(ncclSocketAddress* addr, struct ncclSocket* sendSocket, uint64_t magic, volatile uint32_t* abortFlag);
+ncclResult_t socketRingConnectNext(struct ncclSocket* listenSock, struct ncclSocket* recvSocket);
+ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket* sendSocket, struct ncclSocket* listenSock, struct ncclSocket* recvSocket, uint64_t magic, volatile uint32_t* abortFlag);
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
+ncclResult_t bootstrapInitNew(ncclComm_t comm,bool isNewRank);
ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
@@ -31,4 +90,11 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
+ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t magic, struct ncclSocket* socket, union ncclSocketAddress* addr,
+ ncclSocketType type);
+
+
+void printBinaryData(const char* prefix, const void* data, size_t size);
+
+
#endif
@@ -12,6 +12,7 @@
#include <algorithm>
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
+ncclResult_t initChannelNew(struct ncclComm* comm, int channelid);
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
@@ -1,8 +1,8 @@
/*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
@@ -417,8 +417,44 @@ typedef enum ncclGroupTaskType {
ncclGroupTaskTypeNum = 2,
} ncclGroupTaskType_t;
+// Transferable subset of a communicator's state. The ncclComm serializer in
+// serialize.h writes this struct as the header of its buffer, followed by the
+// arrays that the pointer members refer to.
+struct ncclCommTrans {
+ struct ncclTopoRanks* peerTopo; // length nRanks
+ struct ncclPeerInfo* peerInfo; // serialized as nRanks+1 entries (see serialize.h)
+ int* nodesFirstRank; // length nRanks
+ int* nodesTreePatterns; // length nRanks
+ int* ringPrev; // length nRanks*MAXCHANNELS
+ int* ringNext; // length nRanks*MAXCHANNELS
+ int* peerRings; // length nRanks*MAXCHANNELS
+ void* bootstrap; // serialized as a struct bootstrapState (see serialize.h)
+ int nRanks; // number of GPUs in communicator
+ int cudaDev;
+ int cpuArch;
+ int cpuVendor;
+ int nNodes;
+ uint64_t commHash;
+ int* rankToNode; // serialized as nRanks entries (see serialize.h)
+ struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+};
+
struct ncclComm {
uint64_t startMagic;
+ // struct ncclTopoRanks peerTopo[8];
+ // int nodesFirstRank[16];
+ // int nodesTreePatterns[16];
+ // int ringPrev[9 * MAXCHANNELS];
+ // int ringNext[9 * MAXCHANNELS];
+ // //char peersXml[TOTAL_SIZE];
+ // int peerRings[9 * MAXCHANNELS];
+ struct ncclTopoRanks* peerTopo;//长度nRanks
+ int* nodesFirstRank;//长度nRanks
+ int* nodesTreePatterns;//长度nRanks
+ int* ringPrev;//长度nRanks*MAXCHANNELS
+ int* ringNext;//长度nRanks*MAXCHANNELS
+ int* peerRings;//长度nRanks*MAXCHANNELS
+ // void* bootstrap;
+ // int nRanks; // number of GPUs in communicator
+ // int cudaDev;
+ // int* rankToNode;
struct ncclMemoryStack memPermanent, memScoped;
// List of destructors to run when comm is destructed
struct ncclDestructor* destructorHead;
@@ -515,7 +551,7 @@ struct ncclComm {
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
/* This attribute can indicate the states of communicators and return code of
- * asynchronous NCCL operations. */
+ * asynchronous NCCL operations. */
ncclResult_t asyncResult;
// Flag to ask NCCL kernels to abort
new file mode 100644
@@ -0,0 +1,34 @@
+#ifndef LIGHTHOUSE_H_
+#define LIGHTHOUSE_H_
+
+#include "socket.h"
+
+#include <stdint.h>
+
+#define LH_STATE_PATH "/tmp/lighthouse_state"
+
+struct LhTxn;
+struct LhState;
+
+int txnWaitForVersion(const char* path, uint64_t expected_version, int timeout_ms);
+int txnBegin(const char* path, int write, struct LhTxn** out);
+int txnLoad(struct LhTxn* txn, struct LhState** out);
+int txnSave(struct LhTxn* txn, const struct LhState* state);
+int txnEnd(struct LhTxn* txn);
+
+int initialize(struct LhState* state, const union ncclSocketAddress* src_addrs, int nranks, uint64_t magic);
+void setMagic(struct LhState* state, uint64_t magic);
+int setFirstRank(struct LhState* state, const union ncclSocketAddress* firstRankNcclAddr, uint32_t rank, uint32_t nranks);
+int setLastRank(struct LhState* state, const union ncclSocketAddress* lastRankNcclAddr, uint32_t rank, uint32_t nranks);
+int setNewRank(struct LhState* state, const union ncclSocketAddress* newRankNcclAddr, uint32_t rank);
+void updateLastRankAddr(struct LhState* state);
+void updateVersion(struct LhState* state);
+
+void getMagic(const struct LhState* state, uint64_t* magic);
+void getVersion(const struct LhState* state, uint64_t* version);
+int queryNextRankAddrNew(const struct LhState* state, union ncclSocketAddress* nextAddr);
+int queryNextRankAddrLast(const struct LhState* state, union ncclSocketAddress* nextAddr);
+
+void printLhState(const struct LhState* state);
+
+#endif // LIGHTHOUSE_H_
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,55 @@
+#ifndef NCCL_SCALE_H_
+#define NCCL_SCALE_H_
+
+#include "core.h"
+#include <cstddef>
+#include "nccl.h"
+#include "socket.h"
+#include "bootstrap.h"
+
+#define ADDR_LIST_LEN (512)
+
+#define TIMER_INIT_TOTAL 0
+#define TIMER_INIT_KERNELS 1
+#define TIMER_INIT_BOOTSTRAP 2
+#define TIMER_INIT_ALLGATHER 3
+#define TIMER_INIT_TOPO 4
+#define TIMER_INIT_GRAPHS 5
+#define TIMER_INIT_CONNECT 6
+#define TIMER_INIT_ALLOC 7
+#define TIMERS_INIT_COUNT 8
+
+typedef ncclComm_t ncclCommIncomplete_t;
+
+// Wrapper carrying a single communicator handle
+// (ncclCommIncomplete_t is a typedef of ncclComm_t).
+struct ncclNewRankInfoInternal {
+ ncclCommIncomplete_t comm;
+};
+
+// Pairs a communicator handle with a pointer to an ncclUniqueId.
+// NOTE(review): ownership of *uniqueId is not visible here - confirm who frees it.
+struct ncclCommTransUniqueIdInfo {
+ ncclCommIncomplete_t comm;
+ ncclUniqueId *uniqueId;
+};
+
+// in `init.cc`
+ncclResult_t ncclInit();
+// in `init.cc`
+ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config);
+// in `init.cc`
+int64_t ncclParamSetStackSize();
+// in `init.cc`
+ncclResult_t commAlloc(struct ncclComm *comm, struct ncclComm *parent, int ndev, int rank);
+// in `init.cc`
+ncclResult_t commAllocNew(struct ncclComm *comm, struct ncclComm *parent, int ndev, int rank);
+// in `init.cc`
+uint64_t hashUniqueId(ncclUniqueId const &id);
+// in `init.cc`
+ncclResult_t initTransportsNewRank(struct ncclComm* comm, const struct ncclCommTrans* peerComm);
+ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]);
+// in 'init.cc'
+ncclResult_t updateTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]);
+// in `reinit.cc`
+ncclResult_t devCommResetup(ncclComm_t comm);
+// in `init.cc`
+ncclResult_t devCommSetup(ncclComm_t comm);
+
+#endif // NCCL_SCALE_H_
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,507 @@
+#ifndef NCCL_SERIALIZE_H_
+#define NCCL_SERIALIZE_H_
+
+#include "bootstrap.h"
+#include "scale.h"
+#include "transport.h"
+#include "graph/topo.h" // 确保 ncclTopoGraph 定义在此头文件中
+
+
+#define SERIAL_CONVERT(DST, SRC) reinterpret_cast<decltype(DST)>(SRC)
+#define SERIAL_ASSIGN(DST, SRC) ((DST) = SERIAL_CONVERT(DST, SRC))
+
+// // 1. 核心修改:ncclCommTrans 结构体新增 cpuArch、cpuVendor、nNodes 字段
+// struct ncclCommTrans {
+// struct ncclTopoRanks* peerTopo; // 长度 nRanks(节点拓扑排序信息)
+// struct ncclPeerInfo* peerInfo; // 长度 nRanks+1(节点间通信信息)
+// int* nodesFirstRank; // 长度 nRanks(每个节点的首个_rank)
+// int* nodesTreePatterns; // 长度 nRanks(节点树通信模式)
+// int* ringPrev; // 长度 nRanks*MAXCHANNELS(环形通信前序节点)
+// int* ringNext; // 长度 nRanks*MAXCHANNELS(环形通信后序节点)
+// int* peerRings; // 长度 nRanks*MAXCHANNELS(节点间环形映射)
+// void* bootstrap; // 指向 bootstrapState(通信初始化状态)
+// int nRanks; // 通信器中 GPU 总数(动态数组长度依据)
+// // 新增三个基础字段(CPU 架构、厂商、节点数量,均为 int 类型)
+// int cpuArch; // CPU 架构标识(如 x86_64=62、ARM=123 等)
+// int cpuVendor; // CPU 厂商标识(如 Intel=1、AMD=2、ARM=3 等)
+// int nNodes; // 通信集群中的节点总数
+// int cudaDev; // 当前设备关联的 CUDA 设备号
+// int* rankToNode; // 长度 nRanks(rank 到节点的映射表)
+// // 此前新增的固定大小拓扑图数组(无指针成员)
+// struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+// };
+
+// Original bootstrapState struct definition (kept as a comment for reference)
+// struct bootstrapState {
+// struct bootstrapRing_t ring;
+// struct bootstrapListen_t listen;
+// ncclNet_t *net;
+// uint64_t *peerProxyAddressesUDS;
+// union ncclSocketAddress *peerProxyAddresses;
+// union ncclSocketAddress *peerP2pAddresses;
+// struct unexConn *unexpectedConnections;
+// int cudaDev;
+// int rank;
+// int nranks;
+// uint64_t magic;
+// volatile uint32_t *abortFlag;
+// };
+
+// Base serializer: copies the raw bytes of *info into buffer and returns the
+// number of bytes written (sizeof(info_t)). Pointer members are copied as
+// pointer values, not followed, so on its own this is only complete for
+// pointer-free structs.
+template <typename info_t>
+inline size_t ncclInfoSerializeBase(char* buffer, const info_t* info) {
+  const size_t nbytes = sizeof(info_t);
+  memcpy(buffer, info, nbytes);
+  return nbytes;
+}
+
+// Generic serializer: forwards to ncclInfoSerializeBase. Types that own
+// out-of-line data provide an explicit specialization instead.
+template <typename info_t>
+inline size_t ncclInfoSerialize(char *buffer, const info_t *info) {
+  const size_t written = ncclInfoSerializeBase<info_t>(buffer, info);
+  return written;
+}
+
+// Base deserializer: performs no work on *info and only reports how many bytes
+// the in-place object occupies (sizeof(info_t)). Types with out-of-line data
+// need a specialized ncclInfoDeserialize.
+template <typename info_t>
+inline size_t ncclInfoDeserializeBase(info_t *info) {
+  (void)info;
+  const size_t nbytes = sizeof(info_t);
+  return nbytes;
+}
+
+// Generic deserializer: forwards to ncclInfoDeserializeBase and returns the
+// number of bytes the object occupies in the buffer.
+template <typename info_t>
+inline size_t ncclInfoDeserialize(info_t *info) {
+  const size_t consumed = ncclInfoDeserializeBase<info_t>(info);
+  return consumed;
+}
+
+// Declaration of the serialized-size template; it must be specialized for each concrete type.
+template <typename info_t>
+inline size_t ncclInfoSerializeSize(const info_t* info);
+
+
+// ------------------------------
+// bootstrapState serialized size
+// ------------------------------
+// Returns the number of bytes to reserve for a serialized bootstrapState: the
+// struct itself plus nranks entries for each of peerProxyAddressesUDS,
+// peerProxyAddresses and peerP2pAddresses. The arrays are counted even when the
+// corresponding pointer is null, whereas the bootstrapState serializer skips
+// null arrays, so this value is an upper bound on the bytes actually written.
+template <>
+inline size_t ncclInfoSerializeSize(const struct bootstrapState *info) {
+  const size_t count = info->nranks;
+  const size_t perRank = sizeof(uint64_t)                      // peerProxyAddressesUDS
+                       + 2 * sizeof(union ncclSocketAddress);  // peerProxyAddresses + peerP2pAddresses
+  return sizeof(struct bootstrapState) + count * perRank;
+}
+
+// Serializes the n elements of arr back to back into buffer, using
+// ncclInfoSerialize for each element, and returns the total number of bytes written.
+template <typename T>
+inline size_t ncclArraySerialize(char *buffer, const T *arr, size_t n) {
+  size_t written = 0;
+  size_t idx = 0;
+  while (idx < n) {
+    written += ncclInfoSerialize(buffer + written, &arr[idx]);
+    ++idx;
+  }
+  return written;
+}
+
+// Deserializes n consecutive elements in place, starting at arr. Each element is
+// located at the byte offset reached so far and handed to ncclInfoDeserialize;
+// the return value is the total number of bytes consumed.
+template <typename T>
+inline size_t ncclArrayDeserialize(T *arr, size_t n) {
+  char *const base = (char *)arr;
+  size_t consumed = 0;
+  for (size_t remaining = n; remaining > 0; --remaining) {
+    T *elem = (T *)(base + consumed);
+    consumed += ncclInfoDeserialize(elem);
+  }
+  return consumed;
+}
+// Serializes a bootstrapState into buffer and returns the number of bytes written.
+// Layout: the raw struct (pointer members copied as pointer values), followed by
+// the nranks-element arrays peerProxyAddressesUDS, peerProxyAddresses and
+// peerP2pAddresses, in that order. An array whose pointer is null is skipped
+// entirely and takes no space in the output.
+template <>
+inline size_t ncclInfoSerialize(char *buffer, const struct bootstrapState *info) {
+  char *cursor = buffer;
+
+  // Fixed-size part first.
+  cursor += ncclInfoSerializeBase(cursor, info);
+
+  // Variable-size arrays, each only when present.
+  const size_t udsBytes = sizeof(uint64_t) * info->nranks;
+  const size_t addrBytes = sizeof(union ncclSocketAddress) * info->nranks;
+
+  if (info->peerProxyAddressesUDS != nullptr) {
+    memcpy(cursor, info->peerProxyAddressesUDS, udsBytes);
+    cursor += udsBytes;
+  }
+  if (info->peerProxyAddresses != nullptr) {
+    memcpy(cursor, info->peerProxyAddresses, addrBytes);
+    cursor += addrBytes;
+  }
+  if (info->peerP2pAddresses != nullptr) {
+    memcpy(cursor, info->peerP2pAddresses, addrBytes);
+    cursor += addrBytes;
+  }
+  return (size_t)(cursor - buffer);
+}
+
+// template <>
+// inline size_t ncclInfoSerialize(char *buffer, const struct bootstrapState *info) {
+// size_t offset = 0;
+// offset += ncclInfoSerializeBase(buffer + offset, info);
+// // //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[2], sizeof(ncclSocketAddress));
+// //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[1], sizeof(ncclSocketAddress));
+// //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[0], sizeof(ncclSocketAddress));
+
+// offset += ncclArraySerialize(buffer + offset, info->peerProxyAddressesUDS, info->nranks);
+// offset += ncclArraySerialize(buffer + offset, info->peerProxyAddresses, info->nranks);
+// size_t offset1 = offset;
+// offset += ncclArraySerialize(buffer + offset, info->peerP2pAddresses, info->nranks);
+// //printBinaryData("ncclInfoSerialize bootstrapState buffer:", buffer + offset1, sizeof(ncclSocketAddress));
+
+// return offset;
+// }
+
+
+// ------------------------------
+// ncclComm serialized size
+// ------------------------------
+// Returns the number of bytes to reserve for serializing `info`:
+//   - one ncclCommTrans header,
+//   - when nRanks > 0: three nRanks*MAXCHANNELS int tables (peerRings, ringPrev,
+//     ringNext), nRanks ncclTopoRanks entries (peerTopo) and three nRanks-int
+//     arrays (nodesFirstRank, nodesTreePatterns, rankToNode),
+//   - nRanks+1 ncclPeerInfo entries (counted regardless of nRanks),
+//   - the serialized size of the bootstrapState when info->bootstrap is non-null.
+// Array slots are counted whether or not the corresponding pointer is null.
+template <>
+inline size_t ncclInfoSerializeSize(const ncclComm* info) {
+  const int nRanks = info->nRanks;
+  size_t bytes = sizeof(ncclCommTrans);
+
+  if (nRanks > 0) {
+    const size_t channelTable = nRanks * MAXCHANNELS * sizeof(int);
+    const size_t rankInts = nRanks * sizeof(int);
+    bytes += 3 * channelTable;                // peerRings, ringPrev, ringNext
+    bytes += nRanks * sizeof(ncclTopoRanks);  // peerTopo
+    bytes += 3 * rankInts;                    // nodesFirstRank, nodesTreePatterns, rankToNode
+  }
+
+  bytes += (nRanks + 1) * sizeof(ncclPeerInfo);  // peerInfo
+
+  if (info->bootstrap != nullptr) {
+    bytes += ncclInfoSerializeSize(static_cast<const bootstrapState*>(info->bootstrap));
+  }
+  return bytes;
+}
+
+// Writes one variable-size section of the serialized communicator: copies `size`
+// bytes from `src`, or zero-fills the slot when `src` is null so that no
+// uninitialized bytes end up in the output. Returns `size` so the caller can
+// advance its offset unconditionally.
+static inline size_t ncclSerializeSection(char* dst, const void* src, size_t size) {
+  if (src != nullptr) {
+    memcpy(dst, src, size);
+  } else {
+    memset(dst, 0, size);
+  }
+  return size;
+}
+
+// Serializes the transferable part of a communicator into buffer and returns the
+// number of bytes written.
+//
+// Layout:
+//   1. an ncclCommTrans header holding the scalar fields, the graphs array and
+//      the original pointer values of the arrays below,
+//   2. (only when nRanks != 0) peerRings, peerTopo, nodesFirstRank,
+//      nodesTreePatterns, ringPrev, ringNext, rankToNode and peerInfo, in that
+//      order. Every slot has a fixed size and is always present; a null source
+//      array yields a zero-filled slot,
+//   3. the serialized bootstrapState when info->bootstrap is non-null.
+//
+// buffer must provide at least ncclInfoSerializeSize(info) bytes. The byte count
+// returned may be smaller than that size, because the bootstrapState serializer
+// omits null arrays.
+template <>
+inline size_t ncclInfoSerialize(char* buffer, const ncclComm* info) {
+  size_t offset = 0;
+  ncclCommTrans *commTrans = (ncclCommTrans *)buffer;
+
+  // Clear the header first so that struct padding is deterministic.
+  memset(buffer, 0, sizeof(ncclCommTrans));
+  offset += sizeof(ncclCommTrans);
+
+  // 1. Header: scalars, the fixed-size graphs array and the raw pointer values.
+  commTrans->nRanks = info->nRanks;
+  commTrans->cpuArch = info->cpuArch;
+  commTrans->cpuVendor = info->cpuVendor;
+  commTrans->nNodes = info->nNodes;
+  commTrans->cudaDev = info->cudaDev;
+  memcpy(&commTrans->graphs, info->graphs, sizeof(info->graphs));
+  commTrans->commHash = info->commHash;
+  commTrans->peerTopo = info->peerTopo;
+  commTrans->peerInfo = info->peerInfo;
+  commTrans->peerRings = info->peerRings;
+  commTrans->bootstrap = info->bootstrap;
+  commTrans->rankToNode = info->rankToNode;
+  commTrans->nodesFirstRank = info->nodesFirstRank;
+  commTrans->nodesTreePatterns = info->nodesTreePatterns;
+  commTrans->ringPrev = info->ringPrev;
+  commTrans->ringNext = info->ringNext;
+
+  // Without ranks only the header is emitted. The previous
+  // `assert(offset == totalSize)` here could never hold, because
+  // ncclInfoSerializeSize always adds at least one ncclPeerInfo.
+  if (info->nRanks == 0) {
+    return offset;
+  }
+
+  // 2. Per-rank arrays, each in a fixed-size slot.
+  const size_t channelTableBytes = info->nRanks * MAXCHANNELS * sizeof(int);
+  const size_t rankIntBytes = info->nRanks * sizeof(int);
+
+  offset += ncclSerializeSection(buffer + offset, info->peerRings, channelTableBytes);
+  offset += ncclSerializeSection(buffer + offset, info->peerTopo, info->nRanks * sizeof(ncclTopoRanks));
+  offset += ncclSerializeSection(buffer + offset, info->nodesFirstRank, rankIntBytes);
+  offset += ncclSerializeSection(buffer + offset, info->nodesTreePatterns, rankIntBytes);
+  offset += ncclSerializeSection(buffer + offset, info->ringPrev, channelTableBytes);
+  offset += ncclSerializeSection(buffer + offset, info->ringNext, channelTableBytes);
+  offset += ncclSerializeSection(buffer + offset, info->rankToNode, rankIntBytes);
+  offset += ncclSerializeSection(buffer + offset, info->peerInfo, (info->nRanks + 1) * sizeof(ncclPeerInfo));
+
+  // 3. Bootstrap state, when present.
+  if (info->bootstrap != nullptr) {
+    offset += ncclInfoSerialize(buffer + offset, static_cast<const bootstrapState*>(info->bootstrap));
+  }
+
+  return offset;
+}
+
+
+// ------------------------------
+// Deserialization specializations (core change: parse the three newly added int fields)
+// ------------------------------
+// bootstrapState deserialization: rebinds the struct's array pointers in place.
+//
+// `info` is treated both as the struct to fix up and as the start of the byte
+// blob that holds it. ncclInfoDeserializeBase(info) accounts for the leading
+// part of the blob (presumably the fixed-size struct itself -- confirm against
+// its definition); the arrays that follow are not copied or allocated, the
+// pointer members are simply redirected to their location inside the blob.
+//
+// A non-null pointer value found in the struct is used only as a "this array is
+// present" flag and is overwritten with the in-blob address. An absent (null)
+// array occupies no space, so the cursor is not advanced for it.
+//
+// Returns the number of bytes of the blob that were consumed.
+template <>
+inline size_t ncclInfoDeserialize(struct bootstrapState *info) {
+  char *const blob = reinterpret_cast<char *>(info);
+
+  // Base part first; the arrays are laid out right behind it.
+  size_t consumed = ncclInfoDeserializeBase(info);
+
+  // peerProxyAddressesUDS: nranks entries of sizeof(uint64_t) bytes each.
+  if (info->peerProxyAddressesUDS != nullptr) {
+    info->peerProxyAddressesUDS = reinterpret_cast<decltype(info->peerProxyAddressesUDS)>(blob + consumed);
+    consumed += sizeof(uint64_t) * info->nranks;
+  }
+
+  // peerProxyAddresses: nranks socket addresses.
+  if (info->peerProxyAddresses != nullptr) {
+    info->peerProxyAddresses = reinterpret_cast<decltype(info->peerProxyAddresses)>(blob + consumed);
+    consumed += sizeof(union ncclSocketAddress) * info->nranks;
+  }
+
+  // peerP2pAddresses: nranks socket addresses.
+  if (info->peerP2pAddresses != nullptr) {
+    info->peerP2pAddresses = reinterpret_cast<decltype(info->peerP2pAddresses)>(blob + consumed);
+    consumed += sizeof(union ncclSocketAddress) * info->nranks;
+  }
+
+  return consumed;
+}
+
+// template <>
+// inline size_t ncclInfoDeserialize(struct bootstrapState *info) {
+// size_t offset = 0;
+// char *buffer = (char *)info;
+// offset += ncclInfoDeserializeBase(info);
+// offset += ncclArrayDeserialize(SERIAL_ASSIGN(info->peerProxyAddressesUDS, buffer + offset), info->nranks);
+// offset += ncclArrayDeserialize(SERIAL_ASSIGN(info->peerProxyAddresses, buffer + offset), info->nranks);
+// //printBinaryData("ncclInfoDeserialize bootstrapState peerP2pAddresses hujiao:", buffer + offset, sizeof(ncclSocketAddress));
+// ////printBinaryData("ncclInfoDeserialize bootstrapState peerP2pAddresses hujiao2:", &info->peerP2pAddresses[0], sizeof(ncclSocketAddress));
+// offset += ncclArrayDeserialize(SERIAL_ASSIGN(info->peerP2pAddresses, buffer + offset), info->nranks);
+// //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[2], sizeof(ncclSocketAddress));
+// //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[1], sizeof(ncclSocketAddress));
+// //printBinaryData("ncclInfoSerialize bootstrapState:", &info->peerP2pAddresses[0], sizeof(ncclSocketAddress));
+// return offset;
+// }
+
+// ncclCommTrans deserialization: rebinds every pointer member of the struct to
+// its payload inside the serialized blob that `info` points to.
+//
+// `info` is both the input buffer and the output struct: the blob starts with
+// the ncclCommTrans struct itself, so its scalar fields (nRanks, ...) are
+// already readable and only the pointers need fixing. Nothing is copied or
+// allocated.
+//
+// Layout walked here (same order as the matching ncclInfoSerialize):
+//   [ncclCommTrans struct]
+//   peerRings          nRanks * MAXCHANNELS * sizeof(int)
+//   peerTopo           nRanks * sizeof(ncclTopoRanks)
+//   nodesFirstRank     nRanks * sizeof(int)
+//   nodesTreePatterns  nRanks * sizeof(int)
+//   ringPrev           nRanks * MAXCHANNELS * sizeof(int)
+//   ringNext           nRanks * MAXCHANNELS * sizeof(int)
+//   rankToNode         nRanks * sizeof(int)
+//   peerInfo           (nRanks + 1) * sizeof(ncclPeerInfo)
+//   [bootstrapState blob, only when info->bootstrap is non-null]
+//
+// A non-null pointer stored in the struct marks the array as present and is
+// replaced by the in-blob address; a null pointer is left null. The array's
+// slot is skipped in both cases, so the offsets of later arrays do not depend
+// on which arrays are present.
+//
+// Returns the number of bytes of the blob that were consumed. When nRanks is 0
+// only the struct itself is consumed.
+template <>
+inline size_t ncclInfoDeserialize(struct ncclCommTrans *info) {
+  size_t offset = 0;
+  char *buffer = (char *)info;
+  offset += sizeof(ncclCommTrans);
+  INFO(NCCL_INIT,"ncclInfoDeserialize info->nRanks: %d", info->nRanks);
+
+  // No ranks: there are no trailing arrays to rebind.
+  if (info->nRanks == 0) {
+    return offset;
+  }
+
+  // Dynamic arrays, in serialization order.
+  size_t size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->peerRings != nullptr) {
+    info->peerRings = reinterpret_cast<decltype(info->peerRings)>(buffer + offset);
+  }
+  offset += size;
+
+  size = info->nRanks * sizeof(struct ncclTopoRanks);
+  if (info->peerTopo != nullptr) {
+    info->peerTopo = reinterpret_cast<decltype(info->peerTopo)>(buffer + offset);
+  }
+  offset += size;
+
+  // offset is a size_t, so it must be printed with %zu (not %ld).
+  INFO(NCCL_INIT,"ncclInfoDeserialize offset %zu", offset);
+  size = info->nRanks * sizeof(int);
+  INFO(NCCL_INIT,"info->nodesFirstRank %p", info->nodesFirstRank);
+  if (info->nodesFirstRank != nullptr) {
+    // Logged before and after so the pointer value from the blob and the
+    // rebound in-blob address can be compared.
+    INFO(NCCL_INIT,"ncclInfoDeserialize info->nodesFirstRank: %p", info->nodesFirstRank);
+    info->nodesFirstRank = reinterpret_cast<decltype(info->nodesFirstRank)>(buffer + offset);
+    INFO(NCCL_INIT,"ncclInfoDeserialize info->nodesFirstRank: %p", info->nodesFirstRank);
+  }
+  offset += size;
+
+  size = info->nRanks * sizeof(int);
+  if (info->nodesTreePatterns != nullptr) {
+    info->nodesTreePatterns = reinterpret_cast<decltype(info->nodesTreePatterns)>(buffer + offset);
+    INFO(NCCL_INIT,"ncclInfoDeserialize info->nodesTreePatterns: %p", info->nodesTreePatterns);
+  }
+  offset += size;
+
+  size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->ringPrev != nullptr) {
+    info->ringPrev = reinterpret_cast<decltype(info->ringPrev)>(buffer + offset);
+  }
+  offset += size;
+
+  size = info->nRanks * MAXCHANNELS * sizeof(int);
+  if (info->ringNext != nullptr) {
+    info->ringNext = reinterpret_cast<decltype(info->ringNext)>(buffer + offset);
+  }
+  offset += size;
+
+  size = info->nRanks * sizeof(int);
+  if (info->rankToNode != nullptr) {
+    info->rankToNode = reinterpret_cast<decltype(info->rankToNode)>(buffer + offset);
+    INFO(NCCL_INIT,"ncclInfoDeserialize info->rankToNode: %p", info->rankToNode);
+  }
+  offset += size;
+
+  // peerInfo carries one entry more than nRanks.
+  size = (info->nRanks + 1) * sizeof(ncclPeerInfo);
+  if (info->peerInfo != nullptr) {
+    info->peerInfo = reinterpret_cast<decltype(info->peerInfo)>(buffer + offset);
+  }
+  offset += size;
+
+  // Nested bootstrapState: rebind the pointer to the in-blob copy, then let the
+  // bootstrapState specialization fix up its own arrays and report its size.
+  if (info->bootstrap != nullptr) {
+    offset += ncclInfoDeserialize((bootstrapState*)SERIAL_ASSIGN(info->bootstrap, buffer + offset));
+  }
+
+  return offset;
+}
+
+
+// Original ncclCommTransUniqueIdInfo serialize/deserialize logic (kept commented out; the user confirmed it does not need to be enabled)
+// template <>
+// inline size_t ncclInfoSerialize(char *buffer, const ncclCommTransUniqueIdInfo *info) {
+// size_t offset = 0;
+// offset += ncclInfoSerializeBase(buffer + offset, info);
+// offset += ncclInfoSerialize(buffer + offset, info->commTrans);
+// offset += ncclInfoSerialize(buffer + offset, info->uniqueId);
+// return offset;
+// }
+
+// template <>
+// inline size_t ncclInfoDeserialize(struct ncclCommTransUniqueIdInfo *info) {
+// size_t offset = 0;
+// char *buffer = (char *)info;
+// offset += ncclInfoDeserializeBase(info);
+// offset += ncclInfoDeserialize(SERIAL_ASSIGN(info->commTrans, buffer + offset));
+// offset += ncclInfoDeserialize(SERIAL_ASSIGN(info->uniqueId, buffer + offset));
+// return offset;
+// }
+
+
+#endif // NCCL_SERIALIZE_H_
\ No newline at end of file
@@ -1,8 +1,8 @@
/*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
#include "nccl.h"
#include "channel.h"
@@ -31,6 +31,10 @@
#include "param.h"
#include "nvtx_payload_schemas.h"
#include "utils.h"
+#include "net.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
#define STR2(v) #v
#define STR(v) STR2(v)
@@ -41,6 +45,21 @@
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
#endif
+// struct ncclCommTrans {
+// struct ncclTopoRanks* peerTopo;//长度nRanks
+// struct ncclPeerInfo* peerInfo;
+// int* nodesFirstRank;//长度nRanks
+// int* nodesTreePatterns;//长度nRanks
+// int* ringPrev;//长度nRanks*MAXCHANNELS
+// int* ringNext;//长度nRanks*MAXCHANNELS
+// int* peerRings;//长度nRanks*MAXCHANNELS
+// void* bootstrap;
+// int nRanks; // number of GPUs in communicator
+// int cudaDev;
+// int* rankToNode;
+// struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+// };
+
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree", "PAT" };
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
@@ -83,7 +102,7 @@ static void initOnceFunc() {
exit:;
}
-static ncclResult_t ncclInit() {
+ncclResult_t ncclInit() {
pthread_once(&initOnceControl, initOnceFunc);
return initResult;
}
@@ -187,8 +206,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
NCCLCHECK(ncclRasCommFini(comm));
/* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will
- * free all intra-process communicators; therefore, we only need to focus on local
- * resource cleanup in commFree(). */
+ * free all intra-process communicators; therefore, we only need to focus on local
+ * resource cleanup in commFree(). */
if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join");
if (comm->proxyState->threadUDS) {
@@ -326,7 +345,7 @@ exit:
return ret;
}
-static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
+ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
if (ndev < 1) {
WARN("invalid device count (%d) requested", ndev);
return ncclInvalidArgument;
@@ -433,7 +452,227 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
return ncclSuccess;
}
-static ncclResult_t devCommSetup(ncclComm_t comm) {
+// Initializes the host-side state of `comm` for rank `rank` out of `ndev` ranks.
+//
+// In order: validates the arguments, constructs the memory stacks, records
+// rank/nRanks, initializes the network (ncclNetInit), queries the current CUDA
+// device together with its bus id and NVML index, records the compute
+// capability and feature flags, constructs the memory pools and queues,
+// allocates the per-rank connect masks and topParentRanks, and creates the
+// CUDA memory pool.
+//
+// This function never assigns comm->sharedRes; the shared-resources setup that
+// used to sit between the channel reset and the topParentRanks allocation is
+// disabled.
+// NOTE(review): confirm that callers attach shared resources themselves.
+//
+// parent: only consulted to reject a resource-sharing parent whose network
+//         differs from the one selected for `comm`; may be NULL.
+//
+// Returns ncclInvalidArgument for a bad ndev/rank, ncclInvalidUsage for the
+// network mismatch, ncclSuccess otherwise. Failures of the NCCLCHECK/CUDACHECK
+// wrapped calls are handled by those macros (presumably an early return of the
+// error -- confirm); nothing set up earlier is released in this function.
+ncclResult_t commAllocNew(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
+  if (ndev < 1) {
+    WARN("invalid device count (%d) requested", ndev);
+    return ncclInvalidArgument;
+  }
+  if (rank < 0 || rank >= ndev) {
+    WARN("rank %d exceeds ndev=%d", rank, ndev);
+    return ncclInvalidArgument;
+  }
+
+  ncclMemoryStackConstruct(&comm->memPermanent);
+  ncclMemoryStackConstruct(&comm->memScoped);
+  comm->destructorHead = nullptr;
+  comm->rank = rank;
+  comm->nRanks = ndev;
+
+  NCCLCHECK(ncclNetInit(comm));
+  INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
+
+  // A parent that shares resources must be on the same network as the child.
+  const bool parentShares = (parent != nullptr) && parent->shareResources;
+  if (parentShares && parent->ncclNet != comm->ncclNet) {
+    WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name);
+    return ncclInvalidUsage;
+  }
+
+  // Try to create a CUDA object right away. If there is something wrong with
+  // the device we're on (failure cause #1) , better know it early.
+  CUDACHECK(cudaGetDevice(&comm->cudaDev));
+
+  NCCLCHECK(ncclCudaContextTrack(&comm->context));
+
+  // Resolve the NVML device index through the PCI bus id of the CUDA device.
+  NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
+  char busIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  NCCLCHECK(int64ToBusId(comm->busId, busIdStr));
+  nvmlDevice_t nvmlHandle;
+  NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busIdStr, &nvmlHandle));
+  NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlHandle, (unsigned int*)&comm->nvmlDev));
+
+  comm->compCap = ncclCudaCompCap();
+  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
+
+  comm->checkPointers = (ncclParamCheckPointers() == 1);
+  comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess);
+
+  memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix));
+
+  ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
+  ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
+
+  // Every intrusive "next" link starts out as the 0x1 sentinel
+  // (presumably meaning "not enqueued" -- confirm against the group code).
+  struct ncclComm* const linkSentinel = reinterpret_cast<struct ncclComm*>(0x1);
+  for (int taskType = 0; taskType < ncclGroupTaskTypeNum; taskType++) {
+    comm->groupNext[taskType] = linkSentinel;
+  }
+  comm->preconnectNext = linkSentinel;
+
+  // One bit per channel in each per-rank connect mask.
+  static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
+  static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
+  NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
+
+  // Mark channels as non initialized.
+  for (int ch = 0; ch < MAXCHANNELS; ch++) {
+    comm->channels[ch].id = -1;
+  }
+
+  // Identity mapping unless topParentRanks was already provided.
+  if (comm->topParentRanks == NULL) {
+    NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks));
+    for (int r = 0; r < comm->nRanks; ++r) {
+      comm->topParentRanks[r] = r;
+    }
+  }
+
+  ncclIntruQueueMpscConstruct(&comm->callbackQueue);
+  ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
+
+  comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
+
+  // Device-local pinned memory pool whose release threshold is set to the
+  // largest uint64_t value.
+  {
+    cudaMemPoolProps poolProps = {};
+    poolProps.allocType = cudaMemAllocationTypePinned;
+    poolProps.handleTypes = cudaMemHandleTypeNone;
+    poolProps.location.type = cudaMemLocationTypeDevice;
+    poolProps.location.id = comm->cudaDev;
+    CUDACHECK(cudaMemPoolCreate(&comm->memPool, &poolProps));
+    uint64_t maxThreshold = ~uint64_t(0);
+    CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &maxThreshold));
+  }
+
+  ncclIntruQueueConstruct(&comm->eventCallbackQueue);
+
+  return ncclSuccess;
+}
+
+// ncclResult_t commAllocNew(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) {
+// INFO(NCCL_INIT,"a11111111111");
+// if (ndev < 1) {
+// WARN("invalid device count (%d) requested", ndev);
+// return ncclInvalidArgument;
+// }
+// if (rank >= ndev || rank < 0) {
+// WARN("rank %d exceeds ndev=%d", rank, ndev);
+// return ncclInvalidArgument;
+// }
+
+// ncclMemoryStackConstruct(&comm->memPermanent);
+// ncclMemoryStackConstruct(&comm->memScoped);
+// comm->destructorHead = nullptr;
+// comm->rank = rank;
+// comm->nRanks = ndev;
+// INFO(NCCL_INIT,"a11111111111");
+// NCCLCHECK(ncclNetPluginLoad(comm));
+// NCCLCHECK(ncclNetInit(comm));
+// NCCLCHECK(ncclProfilerPluginInit(comm));
+// INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
+
+// if (parent && parent->config.splitShare) {
+// if (parent->ncclNet != comm->ncclNet) {
+// WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name);
+// return ncclInvalidUsage;
+// }
+// }
+// // Try to create a CUDA object right away. If there is something wrong with
+// // the device we're on (failure cause #1) , better know it early.
+// CUDACHECK(cudaGetDevice(&comm->cudaDev));
+
+// NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
+// INFO(NCCL_INIT,"a222222222");
+// nvmlDevice_t nvmlDev;
+// char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+// NCCLCHECK(int64ToBusId(comm->busId, busId));
+// NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
+// NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&comm->nvmlDev));
+
+// comm->compCap = ncclCudaCompCap();
+// TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
+
+// comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
+// comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
+
+// comm->collNetSupport = 0;
+// memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix));
+// INFO(NCCL_INIT,"a333333333");
+// ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
+// ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
+
+// comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
+// comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
+
+// static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
+// static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
+// NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
+// NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
+
+// // Mark channels as non initialized.
+// for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1;
+// //hjx
+// if (parent == NULL || !parent->config.splitShare) {
+// struct ncclSharedResources* sharedRes = NULL;
+// NCCLCHECK(ncclCalloc(&sharedRes, 1));
+// /* most of attributes are assigned later in initTransportsRank(). */
+// sharedRes->owner = comm;
+// sharedRes->tpNRanks = comm->nRanks;
+// NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks));
+// NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream));
+// NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream));
+// comm->sharedRes = sharedRes;
+// sharedRes->refCount = 1;
+// } else {
+// comm->sharedRes = parent->sharedRes;
+// ncclAtomicRefCountIncrement(&parent->sharedRes->refCount);
+// }
+// INFO(NCCL_INIT,"a444444444");
+
+// // if (comm->topParentRanks == NULL) {
+// // NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks));
+// // for (int i = 0; i < comm->nRanks; ++i)
+// // comm->topParentRanks[i] = i;
+// // }
+
+// ncclIntruQueueMpscConstruct(&comm->callbackQueue);
+// ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
+
+// comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
+// INFO(NCCL_INIT,"a5555555555");
+// do {
+// cudaMemPoolProps props = {};
+// props.allocType = cudaMemAllocationTypePinned;
+// props.handleTypes = cudaMemHandleTypeNone;
+// props.location.type = cudaMemLocationTypeDevice;
+// props.location.id = comm->cudaDev;
+// CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props));
+// uint64_t releaseThreshold = ~uint64_t(0);
+// CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold));
+// } while (0);
+// INFO(NCCL_INIT,"a666666666");
+// ncclIntruQueueConstruct(&comm->eventCallbackQueue);
+
+
+// // setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator
+// comm->intraComm0 = comm;
+// comm->intraRank = 0;
+// comm->intraRanks = 1;
+
+// return ncclSuccess;
+// }
+
+ncclResult_t devCommSetup(ncclComm_t comm) {
ncclResult_t ret = ncclSuccess;
int nRanks = comm->nRanks;
struct ncclDevCommAndChannels tmpCommAndChans;
@@ -588,9 +827,9 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
}
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x",
- info->busId,
- ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
- info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
+ info->busId,
+ ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
+ info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
}
}
@@ -615,6 +854,25 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
return ncclSuccess;
}
+// Initializes channel `channelId` through initChannelNew and fills in its ring
+// bookkeeping from `ringRanks`, the ring order given as nRanks rank numbers.
+//
+//   ring->index     : ring distance of `rank` from rank 0
+//   ring->userRanks : ringRanks rotated so that entry 0 is `rank`
+//
+// If a rank number occurs more than once in ringRanks the last occurrence is
+// used; if it does not occur at all, position 0 is used.
+ncclResult_t setupChannelNew(struct ncclComm* comm, int channelId, int rank, int nRanks, int* ringRanks) {
+  INFO(NCCL_INIT, "rank %d nRanks %d", rank, nRanks);
+  NCCLCHECK(initChannelNew(comm, channelId));
+
+  struct ncclRing* ring = &comm->channels[channelId].ring;
+
+  // Locate rank 0 and our own rank inside the ring order.
+  int posOfZero = 0;
+  int posOfSelf = 0;
+  for (int pos = 0; pos < nRanks; ++pos) {
+    if (ringRanks[pos] == 0) posOfZero = pos;
+    if (ringRanks[pos] == rank) posOfSelf = pos;
+  }
+  ring->index = (posOfSelf - posOfZero + nRanks) % nRanks;
+
+  // Copy the ring starting at our own position, wrapping around at the end.
+  int src = posOfSelf;
+  for (int dst = 0; dst < nRanks; ++dst) {
+    ring->userRanks[dst] = ringRanks[src];
+    if (++src == nRanks) src = 0;
+  }
+  return ncclSuccess;
+}
+
#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
#define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
@@ -670,10 +928,1294 @@ NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
#define TIMER_INIT_ALLOC 7
#define TIMERS_INIT_COUNT 8
-static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
- // We use 2 AllGathers
- // 1. { peerInfo, comm, compCap}
- // 2. { nChannels, graphInfo, topoRanks }
+ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
+ // We use 2 AllGathers
+ // 1. { peerInfo, comm, compCap}
+ // 2. { nChannels, graphInfo, topoRanks }
+ ncclResult_t ret = ncclSuccess;
+ int rank = comm->rank;
+ int nranks = comm->nRanks;
+ int nNodes = 1;
+ cpu_set_t affinitySave;
+ struct ncclTopoGraph* ringGraph = &comm->graphs[NCCL_ALGO_RING];
+ struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE];
+ struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
+ struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
+ struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
+ struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
+
+ struct graphInfo {
+ int pattern;
+ int nChannels;
+ int sameChannels;
+ float bwIntra;
+ float bwInter;
+ int typeIntra;
+ int typeInter;
+ int crossNic;
+ };
+
+ struct allGatherInfo {
+ struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
+ struct ncclTopoRanks topoRanks;
+ int cpuArch;
+ int cpuVendor;
+ int localRanks;
+ };
+
+ int nChannelsOrig;
+ struct allGatherInfo *allGather3Data = NULL;
+ struct ncclTopoRanks** allTopoRanks = NULL;
+ int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
+ int *rings = NULL;
+ int* nvbPeers = NULL;
+ struct ncclProxyConnector proxyConn;
+ int* pxnPeers = NULL;
+ int *topParentLocalRanks = NULL;
+ int p2pLevel = -1;
+
+ timers[TIMER_INIT_ALLGATHER] = clockNano();
+ // AllGather1 - begin
+ NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
+ NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
+ NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
+ __atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE);
+
+ comm->cuMemSupport = 1;
+ for (int i = 0; i < nranks; i++) {
+ if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
+ WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
+ i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
+ ret = ncclInvalidUsage;
+ goto fail;
+ }
+ if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
+ if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
+ if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+ WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+ ret = ncclInvalidUsage;
+ goto fail;
+ }
+ }
+ // AllGather1 - end
+ timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
+
+ // Check for MNNVL support
+ NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail);
+ if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) {
+ NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
+ }
+
+ do {
+ // Compute intra-process ranks
+ int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+ for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
+ for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
+
+ comm->nvlsRegSupport = 1;
+ for (int i = 0; i < nranks; i++) {
+ if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+ && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+ // Rank is in same process
+ if (intraProcRanks == 0) intraProcRank0 = i;
+ if (i == rank) intraProcRank = intraProcRanks;
+ intraProcRanks++;
+ if (intraProcRank0 == rank && rank != i) {
+ comm->peerInfo[i].comm->intraNext = comm->intraNext;
+ comm->intraNext = comm->peerInfo[i].comm;
+ }
+ }
+
+ if (comm->nvlsRegSupport) {
+ for (int j = i + 1; j < nranks; j++) {
+ if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash &&
+ comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) {
+ comm->nvlsRegSupport = 0;
+ break;
+ }
+ }
+ }
+ }
+
+ // Buffer Registration is not supported with MNNVL
+ if (comm->MNNVL) comm->nvlsRegSupport = 0;
+
+ TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+ rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+ if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+ WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ intraProcRank, intraProcRanks, intraProcRank0);
+ ret = ncclInternalError;
+ goto fail;
+ }
+ struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
+ assert(intraProcRank==0 ? comm==comm0 : true);
+ comm->intraComm0 = comm0;
+ comm->intraRank = intraProcRank;
+ comm->intraRanks = intraProcRanks;
+ comm->intraBarrierPhase = 0;
+ comm->intraBarrierCounter = 0;
+ comm->intraBarrierGate = 0;
+ } while(0);
+
+ timers[TIMER_INIT_TOPO] = clockNano();
+
+ // Dump XML if requested by user
+ const char* dumpXmlFile;
+ dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
+ if (dumpXmlFile) {
+ NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
+ }
+
+ // Topo detection / System graph creation
+ NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
+ // Compute paths between GPUs and NICs
+ NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
+ // Remove inaccessible GPUs and unused NICs
+ NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail);
+ // Recompute paths after trimming
+ NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
+ // Init search
+ NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail);
+ // Decide on comm's CPU architecture.
+ NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail);
+ // Print final topology
+ NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
+ timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO];
+
+ // Set Affinity to a CPU local the our GPU, so that all memory we allocate
+ // on the host is local.
+ NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail);
+ if (CPU_COUNT(&comm->cpuAffinity)) {
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+ }
+
+ // Determine local CollNet support
+ if (!collNetSupport(comm)) {
+ comm->config.collnetEnable = 0;
+ }
+
+ // Determine local Nvls support
+ NCCLCHECK(ncclNvlsInit(comm));
+
+ timers[TIMER_INIT_GRAPHS] = clockNano();
+ // Get rings and trees
+ memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
+ ringGraph->id = 0;
+ ringGraph->pattern = NCCL_TOPO_PATTERN_RING;
+ ringGraph->minChannels = 1;
+ ringGraph->maxChannels = MAXCHANNELS/2;
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail);
+
+ memset(treeGraph, 0, sizeof(struct ncclTopoGraph));
+ treeGraph->id = 1;
+ treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
+ treeGraph->minChannels = ringGraph->nChannels;
+ treeGraph->maxChannels = ringGraph->nChannels;
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail);
+
+ memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph));
+ collNetChainGraph->id = 2;
+ collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE;
+ collNetChainGraph->collNet = 1;
+ collNetChainGraph->minChannels = ringGraph->nChannels;
+ collNetChainGraph->maxChannels = ringGraph->nChannels;
+
+ memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
+ collNetDirectGraph->id = 4;
+ collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
+ collNetDirectGraph->collNet = 1;
+ collNetDirectGraph->minChannels = 1;
+ collNetDirectGraph->maxChannels = MAXCHANNELS;
+ if (comm->config.collnetEnable) {
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail);
+ }
+
+ memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph));
+ nvlsGraph->id = 3;
+ nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS;
+ nvlsGraph->minChannels = 1;
+ nvlsGraph->maxChannels = MAXCHANNELS;
+ if (comm->nvlsSupport) {
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
+ }
+ timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];
+
+ // Initialize num P2P LL buffers for this communicator
+ comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
+
+ if (comm->rank == ncclParamGraphDumpFileRank()) {
+ struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph };
+ NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
+ }
+
+ // Because timers[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather,
+ // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer.
+ timers[TIMER_INIT_CONNECT] = clockNano();
+ // AllGather3 - begin
+ NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
+
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
+ allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
+ allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
+ allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
+ allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
+ allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
+ allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
+ allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
+ }
+
+ allGather3Data[rank].cpuArch = comm->cpuArch;
+ allGather3Data[rank].cpuVendor = comm->cpuVendor;
+
+ comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+ NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
+
+ NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
+
+ // Determine nNodes, firstRanks, ...
+ NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodesFirstRank, comm->nRanks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodesTreePatterns, comm->nRanks), ret, fail);
+ INFO(NCCL_INIT,"nodes llllyyyy%d", comm->nNodes);
+ for (int r=0; r<nranks; r++) {
+ int node;
+ int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
+ for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
+ if (node == comm->nNodes) {
+ comm->nNodes++;
+ nodesFirstRank[node] = firstRank;
+ comm->nodesFirstRank[node] = nodesFirstRank[node];
+ // Record tree pattern of each node as they can be different depending on sm arch
+ nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
+ comm->nodesTreePatterns[node] = nodesTreePatterns[node];
+ }
+ comm->rankToNode[r] = node;
+
+ if (comm->cpuArch != allGather3Data[r].cpuArch &&
+ comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
+ comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
+ }
+ if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
+ comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
+ comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
+ }
+ }
+
+ // Alert the user to the presence of mixed CPUs. In the past this has caused
+ // locks in some collective routines. This may help debug issues in the future.
+ if (rank==0) {
+ if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
+ INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
+ }
+ if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
+ INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
+ }
+ }
+
+ // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
+ for (int r=0; r<comm->nRanks; r++) {
+ int node = comm->rankToNode[r];
+ comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
+ comm->nodeRanks[node].localRanks++;
+ }
+ // Allocate ranks arrays for each node
+ for (int n=0; n<comm->nNodes; n++) {
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
+ comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
+ comm->nodeRanks[n].localRanks = 0;
+ }
+ // And fill the ranks arrays
+ for (int r=0; r<comm->nRanks; r++) {
+ int node = comm->rankToNode[r];
+ comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
+ }
+ comm->node = comm->rankToNode[rank];
+ comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
+ comm->localRank = comm->rankToLocalRank[rank];
+ comm->localRanks = comm->nodeRanks[comm->node].localRanks;
+
+ TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
+ WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ ret = ncclInternalError;
+ goto fail;
+ }
+
+ INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
+ comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
+
+ nChannelsOrig = comm->nChannels;
+ NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->peerTopo, comm->nRanks), ret, fail);
+ for (int i=0; i<nranks; i++) {
+ allTopoRanks[i] = &allGather3Data[i].topoRanks;
+ comm->peerTopo[i] = allGather3Data[i].topoRanks;
+ // Make sure we align all ranks so that the tuning is consistent across ranks
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
+ graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
+ graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
+ graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
+ graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
+ graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
+ graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
+ }
+ comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
+ }
+ if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0;
+ if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
+
+ comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+ if (comm->nChannels < nChannelsOrig) {
+ // We started duplicating channels during Preset(), so we need to move the
+ // duplicated channels since we have removed some.
+ for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
+ }
+
+ // Determine CollNet support after all-gather now that we know nNodes and each node localRanks
+ if (comm->config.collnetEnable == 1) {
+ int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+ if (comm->nNodes < collNetNodeThreshold) {
+ INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
+ comm->config.collnetEnable = 0;
+ }
+ }
+ NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink));
+ comm->isOneRPN = (comm->maxLocalRanks == 1);
+
+ NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
+ // AllGather3 - end
+ timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT];
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
+
+ char line[1024];
+ line[0]='\0';
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclTree* tree = &comm->channels[c].tree;
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
+ c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
+ INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT, "Trees%s", line);
+
+ NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
+
+ // Compute nChannels per peer for p2p
+ NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
+
+ /* until now, all info of comm should be known. We can initialize shared resources and
+ * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
+ * all proxy operations. */
+ if (comm->sharedRes->owner == comm) {
+ comm->sharedRes->tpNLocalRanks = comm->localRanks;
+ comm->sharedRes->magic = comm->magic;
+ comm->sharedRes->tpNChannels = comm->nChannels;
+ comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
+ memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
+ }
+ NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
+ for (int i = 0; i < comm->localRanks; ++i) {
+ int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
+ topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank];
+ }
+ comm->topParentLocalRanks = topParentLocalRanks;
+
+ // Profiler plugin context has to be initialized before proxy thread
+ NCCLCHECK(ncclProfilerPluginInit(comm));
+
+ NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
+ // Launch proxy service thread, after this, the proxy calls can be used.
+ if (parent && parent->shareResources) {
+ comm->proxyState = parent->sharedRes->proxyState;
+ ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
+ } else {
+ NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+ }
+ NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
+
+ timers[TIMER_INIT_CONNECT] = clockNano();
+ do { // Build p2p schedule
+ int node = comm->node;
+ int nNodes = comm->nNodes;
+ int nRanks = comm->nRanks;
+ int local = comm->localRank;
+ int nLocals = comm->maxLocalRanks;
+ struct ncclNodeRanks* nodeRanks = comm->nodeRanks;
+ bool flat = false;
+ for (int node = 0; node < nNodes; node++) {
+ if (nodeRanks[node].localRanks != nLocals) {
+ flat = true;
+ nNodes = 1; node = 0;
+ nLocals = nRanks; local = rank;
+ break;
+ }
+ }
+ int nNodesPow2 = pow2Up(nNodes);
+ int nLocalsPow2 = pow2Up(nLocals);
+ comm->p2pSchedule = ncclMemoryStackAlloc<ncclComm::P2pSchedulePair>(&comm->memPermanent, nRanks);
+ comm->planner.peers = ncclMemoryStackAlloc<ncclKernelPlanner::Peer>(&comm->memPermanent, nRanks);
+ uint32_t nodeRound = 0;
+ uint32_t nodeDelta = 0;
+ int round = 0;
+ // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N.
+ // Since that formula only produces valid permutations when N is a pow of 2,
+ // we let N = pow2Up(n) and filter out results greater-eq to n.
+ // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8
+ do {
+ if (nodeDelta < nNodes) { // Filter nonsensical node deltas
+ int sendNode = (node + nodeDelta) % nNodes;
+ int recvNode = (node - nodeDelta + nNodes) % nNodes;
+ uint32_t localRound = 0;
+ uint32_t localDelta = 0;
+ do {
+ if (localDelta < nLocals) { // Filter nonsensical node-local deltas
+ int sendLocal = (local + localDelta) % nLocals;
+ int recvLocal = (local - localDelta + nLocals) % nLocals;
+ comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal];
+ comm->p2pSchedule[round].recvRank = flat ? recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal];
+ round += 1;
+ }
+ localRound += 1;
+ localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update
+ } while (localRound != nLocalsPow2);
+ }
+ nodeRound += 1;
+ nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update
+ } while (nodeRound != nNodesPow2);
+
+ if (round != nRanks) {
+ WARN("P2p schedule creation has bugs.");
+ ret = ncclInternalError;
+ goto fail;
+ }
+ } while (0);
+
+ //comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect();
+ comm->runtimeConn = 1;
+ if (comm->runtimeConn) {
+ for (int c=0; c<comm->nChannels; c++) {
+ NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+ }
+ // Attempt to setup NVLS, may silently fail and disable NVLS
+ NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+ // Check if we can setup CollNet
+ if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs);
+ } else {
+ for (int c=0; c<comm->nChannels; c++) {
+ NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+ }
+ NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
+
+ // Connect Trees
+ NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
+
+ // Connect PAT only for communicators with 1 GPU per node
+ if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
+
+ // Attempt to setup NVLS, may silently fail and disable NVLS
+ NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+ NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
+
+ // And NVLS trees if needed
+ NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
+
+ // Check if we can setup CollNet
+ if (comm->config.collnetEnable) {
+ ncclCollNetSetup(comm, parent, graphs);
+ NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
+ if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
+ NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
+ }
+ }
+
+ // Connect to local net proxy
+ NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
+ NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+
+ // Then to remote ones when using PXN
+ if (ncclPxnDisable(comm) == 0) {
+ int nranks;
+ NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
+ for (int r=0; r<nranks; r++) {
+ NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
+ NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+ }
+ }
+
+ if (ncclParamNvbPreconnect()) {
+ // Connect p2p when using NVB path
+ int nvbNpeers;
+ NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
+ for (int r=0; r<nvbNpeers; r++) {
+ int peer = nvbPeers[r];
+ int sendRound=0, recvRound=0;
+ while (comm->p2pSchedule[sendRound].sendRank != peer) sendRound++;
+ while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++;
+ uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound);
+ uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound);
+ for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+ int channelId;
+ channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c);
+ if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
+ comm->connectSend[peer] |= (1UL<<channelId);
+ }
+ channelId = ncclP2pChannelForPart(comm->p2pnChannels, recvBase, c);
+ if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
+ comm->connectRecv[peer] |= (1UL<<channelId);
+ }
+ }
+ }
+
+ NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
+ }
+ }
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
+
+ // Compute time models for algorithm and protocol combinations
+ NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
+
+ INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+
+ if (comm->intraRank == 0) { // Load ncclParamLaunchMode
+ const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
+ enum ncclLaunchMode mode, modeOld;
+ if (str && strcasecmp(str, "GROUP") == 0) {
+ mode = ncclLaunchModeGroup;
+ } else {
+ mode = ncclLaunchModeParallel;
+ }
+ // In theory we could be racing with other communicators not associated with
+ // this one if the user is connecting to multiple ncclUniqueId's concurrently.
+ modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
+ if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
+ INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
+ }
+ }
+
+ comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
+ comm->baseStride = 0;
+
+ // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
+ // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
+ NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
+
+ timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT];
+ /* Local intra-node barrier */
+ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
+
+ // We should have allocated all buffers, collective fifos, ... we can
+ // restore the affinity.
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+
+exit:
+ if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
+ * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
+ * properly cleaned up. */
+ if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
+ free(allTopoRanks);
+ free(nodesTreePatterns);
+ free(nodesFirstRank);
+ free(allGather3Data);
+ free(rings);
+ free(nvbPeers);
+ free(pxnPeers);
+ return ret;
+fail:
+ goto exit;
+}
+
+ncclResult_t updateTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
+ // We use 2 AllGathers
+ // 1. { peerInfo, comm, compCap}
+ // 2. { nChannels, graphInfo, topoRanks }
+ ncclResult_t ret = ncclSuccess;
+ int rank = comm->rank;
+ int nranks = comm->nRanks;
+ int nNodes = 1;
+ cpu_set_t affinitySave;
+ struct ncclTopoGraph* ringGraph = &comm->graphs[NCCL_ALGO_RING];
+ struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE];
+ struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
+ struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
+ struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
+ struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
+
+ struct graphInfo {
+ int pattern;
+ int nChannels;
+ int sameChannels;
+ float bwIntra;
+ float bwInter;
+ int typeIntra;
+ int typeInter;
+ int crossNic;
+ };
+
+ struct allGatherInfo {
+ struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
+ struct ncclTopoRanks topoRanks;
+ int cpuArch;
+ int cpuVendor;
+ int localRanks;
+ };
+
+ int nChannelsOrig;
+ struct allGatherInfo *allGather3Data = NULL;
+ struct ncclTopoRanks** allTopoRanks = NULL;
+ int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
+ int *rings = NULL;
+ int* nvbPeers = NULL;
+ struct ncclProxyConnector proxyConn;
+ int* pxnPeers = NULL;
+ int *topParentLocalRanks = NULL;
+ int p2pLevel = -1;
+ comm->initAlgoChannels[NCCL_ALGO_RING] = false;
+
+ timers[TIMER_INIT_ALLGATHER] = clockNano();
+ // AllGather1 - begin
+ NCCLCHECKGOTO(ncclRealloc(&comm->peerInfo,nranks, nranks+1), ret, fail); // Extra rank to represent CollNet root
+ // NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
+
+ NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
+ __atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE);
+
+ comm->cuMemSupport = 1;
+ for (int i = 0; i < nranks; i++) {
+ if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
+ WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
+ i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
+ ret = ncclInvalidUsage;
+ goto fail;
+ }
+ if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
+ if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
+ if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+ WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+ ret = ncclInvalidUsage;
+ goto fail;
+ }
+ }
+ // AllGather1 - end
+ timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
+
+ // Check for MNNVL support
+ NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail);
+ if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) {
+ NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
+ }
+
+ do {
+ // Compute intra-process ranks
+ int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+ for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
+ for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
+
+ comm->nvlsRegSupport = 1;
+ for (int i = 0; i < nranks; i++) {
+ if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+ && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+ // Rank is in same process
+ if (intraProcRanks == 0) intraProcRank0 = i;
+ if (i == rank) intraProcRank = intraProcRanks;
+ intraProcRanks++;
+ if (intraProcRank0 == rank && rank != i) {
+ comm->peerInfo[i].comm->intraNext = comm->intraNext;
+ comm->intraNext = comm->peerInfo[i].comm;
+ }
+ }
+
+ if (comm->nvlsRegSupport) {
+ for (int j = i + 1; j < nranks; j++) {
+ if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash &&
+ comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) {
+ comm->nvlsRegSupport = 0;
+ break;
+ }
+ }
+ }
+ }
+
+ // Buffer Registration is not supported with MNNVL
+ if (comm->MNNVL) comm->nvlsRegSupport = 0;
+
+ TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+ rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+ if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+ WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ intraProcRank, intraProcRanks, intraProcRank0);
+ ret = ncclInternalError;
+ goto fail;
+ }
+ struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
+ assert(intraProcRank==0 ? comm==comm0 : true);
+ comm->intraComm0 = comm0;
+ comm->intraRank = intraProcRank;
+ comm->intraRanks = intraProcRanks;
+ comm->intraBarrierPhase = 0;
+ comm->intraBarrierCounter = 0;
+ comm->intraBarrierGate = 0;
+ } while(0);
+
+ timers[TIMER_INIT_TOPO] = clockNano();
+ // Dump XML if requested by user
+ const char* dumpXmlFile;
+ dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
+ if (dumpXmlFile) {
+ NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
+ }
+ comm->topParentRanks = NULL;
+ if (comm->topParentRanks == NULL) {
+ NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks));
+ for (int i = 0; i < comm->nRanks; ++i)
+ comm->topParentRanks[i] = i;
+ }
+
+ // Topo detection / System graph creation
+ NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
+ // Compute paths between GPUs and NICs
+ NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
+ // Remove inaccessible GPUs and unused NICs
+ // NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail);
+ // Recompute paths after trimming
+ NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
+ // Init search
+ NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail);
+ // Decide on comm's CPU architecture.
+ NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail);
+ // Print final topology
+ NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
+ timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO];
+
+ // Set Affinity to a CPU local to our GPU, so that all memory we allocate
+ // on the host is local.
+ NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail);
+ if (CPU_COUNT(&comm->cpuAffinity)) {
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+ }
+
+ // Determine local CollNet support
+ if (!collNetSupport(comm)) {
+ comm->config.collnetEnable = 0;
+ }
+
+ // Determine local Nvls support
+ NCCLCHECK(ncclNvlsInit(comm));
+
+ timers[TIMER_INIT_GRAPHS] = clockNano();
+ // Get rings and trees
+ memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
+ ringGraph->id = 0;
+ ringGraph->pattern = NCCL_TOPO_PATTERN_RING;
+ ringGraph->minChannels = 1;
+ ringGraph->maxChannels = MAXCHANNELS/2;
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail);
+
+ memset(treeGraph, 0, sizeof(struct ncclTopoGraph));
+ treeGraph->id = 1;
+ treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
+ treeGraph->minChannels = ringGraph->nChannels;
+ treeGraph->maxChannels = ringGraph->nChannels;
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail);
+
+ memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph));
+ collNetChainGraph->id = 2;
+ collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE;
+ collNetChainGraph->collNet = 1;
+ collNetChainGraph->minChannels = ringGraph->nChannels;
+ collNetChainGraph->maxChannels = ringGraph->nChannels;
+
+ memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
+ collNetDirectGraph->id = 4;
+ collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
+ collNetDirectGraph->collNet = 1;
+ collNetDirectGraph->minChannels = 1;
+ collNetDirectGraph->maxChannels = MAXCHANNELS;
+ if (comm->config.collnetEnable) {
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail);
+ }
+
+ memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph));
+ nvlsGraph->id = 3;
+ nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS;
+ nvlsGraph->minChannels = 1;
+ nvlsGraph->maxChannels = MAXCHANNELS;
+ if (comm->nvlsSupport) {
+ NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
+ }
+ timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];
+
+ // Initialize num P2P LL buffers for this communicator
+ comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
+
+ if (comm->rank == ncclParamGraphDumpFileRank()) {
+ struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph };
+ NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
+ }
+
+ // Because timers[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather,
+ // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer.
+ timers[TIMER_INIT_CONNECT] = clockNano();
+ // AllGather3 - begin
+ NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
+
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
+ allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
+ allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
+ allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
+ allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
+ allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
+ allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
+ allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
+ }
+
+ allGather3Data[rank].cpuArch = comm->cpuArch;
+ allGather3Data[rank].cpuVendor = comm->cpuVendor;
+
+ comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+ NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
+
+ NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
+
+ // Determine nNodes, firstRanks, ...
+ NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodesFirstRank, comm->nRanks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodesTreePatterns, comm->nRanks), ret, fail);
+ INFO(NCCL_INIT,"nNODES llllyyyy%d", comm->nNodes);
+ comm->nNodes = 0;
+ for (int r=0; r<nranks; r++) {
+ int node;
+ int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
+ for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
+ if (node == comm->nNodes) {
+ comm->nNodes++;
+ nodesFirstRank[node] = firstRank;
+ comm->nodesFirstRank[node] = nodesFirstRank[node];
+ // Record tree pattern of each node as they can be different depending on sm arch
+ nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
+ comm->nodesTreePatterns[node] = nodesTreePatterns[node];
+ }
+ comm->rankToNode[r] = node;
+
+ if (comm->cpuArch != allGather3Data[r].cpuArch &&
+ comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
+ comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
+ }
+ if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
+ comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
+ comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
+ }
+ }
+
+ // Alert the user to the presence of mixed CPUs. In the past this has caused
+ // locks in some collective routines. This may help debug issues in the future.
+ if (rank==0) {
+ if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
+ INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
+ }
+ if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
+ INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
+ }
+ }
+
+ // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
+ for (int r=0; r<comm->nRanks; r++) {
+ int node = comm->rankToNode[r];
+ comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
+ comm->nodeRanks[node].localRanks++;
+ }
+ // Allocate ranks arrays for each node
+ for (int n=0; n<comm->nNodes; n++) {
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
+ comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
+ comm->nodeRanks[n].localRanks = 0;
+ }
+ // And fill the ranks arrays
+ for (int r=0; r<comm->nRanks; r++) {
+ int node = comm->rankToNode[r];
+ comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
+ }
+ comm->node = comm->rankToNode[rank];
+ comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
+ comm->localRank = comm->rankToLocalRank[rank];
+ comm->localRanks = comm->nodeRanks[comm->node].localRanks;
+
+ TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
+ WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ ret = ncclInternalError;
+ goto fail;
+ }
+
+ INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
+ comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
+
+ nChannelsOrig = comm->nChannels;
+ NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->peerTopo, comm->nRanks), ret, fail);
+ for (int i=0; i<nranks; i++) {
+ allTopoRanks[i] = &allGather3Data[i].topoRanks;
+ comm->peerTopo[i] = allGather3Data[i].topoRanks;
+ // Make sure we align all ranks so that the tuning is consistent across ranks
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
+ graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
+ graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
+ graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
+ graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
+ graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
+ graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
+ }
+ comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
+ }
+ if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0;
+ if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
+
+ comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+ if (comm->nChannels < nChannelsOrig) {
+ // We started duplicating channels during Preset(), so we need to move the
+ // duplicated channels since we have removed some.
+ for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
+ }
+
+ // Determine CollNet support after all-gather now that we know nNodes and each node localRanks
+ if (comm->config.collnetEnable == 1) {
+ int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+ if (comm->nNodes < collNetNodeThreshold) {
+ INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
+ comm->config.collnetEnable = 0;
+ }
+ }
+ NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink));
+ comm->isOneRPN = (comm->maxLocalRanks == 1);
+
+ NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
+ // AllGather3 - end
+ timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT];
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
+
+ char line[1024];
+ line[0]='\0';
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclTree* tree = &comm->channels[c].tree;
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
+ c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
+ INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT, "Trees%s", line);
+
+ NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
+
+ // Compute nChannels per peer for p2p
+ NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
+
+ /* until now, all info of comm should be known. We can initialize shared resources and
+ * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
+ * all proxy operations. */
+ if (comm->sharedRes->owner == comm) {
+ comm->sharedRes->tpNLocalRanks = comm->localRanks;
+ comm->sharedRes->magic = comm->magic;
+ comm->sharedRes->tpNChannels = comm->nChannels;
+ comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
+ memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
+ }
+ NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
+ for (int i = 0; i < comm->localRanks; ++i) {
+ int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
+ topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank];
+ }
+ comm->topParentLocalRanks = topParentLocalRanks;
+
+ // Profiler plugin context has to be initialized before proxy thread
+ NCCLCHECK(ncclProfilerPluginInit(comm));
+
+ NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
+ // Launch proxy service thread, after this, the proxy calls can be used.
+ // if (parent && parent->shareResources) {
+ // comm->proxyState = parent->sharedRes->proxyState;
+ // ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
+ // } else {
+ // NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+ // }
+ NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
+
+ timers[TIMER_INIT_CONNECT] = clockNano();
+ do { // Build p2p schedule
+ int node = comm->node;
+ int nNodes = comm->nNodes;
+ int nRanks = comm->nRanks;
+ int local = comm->localRank;
+ int nLocals = comm->maxLocalRanks;
+ struct ncclNodeRanks* nodeRanks = comm->nodeRanks;
+ bool flat = false;
+ for (int node = 0; node < nNodes; node++) {
+ if (nodeRanks[node].localRanks != nLocals) {
+ flat = true;
+ nNodes = 1; node = 0;
+ nLocals = nRanks; local = rank;
+ break;
+ }
+ }
+ int nNodesPow2 = pow2Up(nNodes);
+ int nLocalsPow2 = pow2Up(nLocals);
+ comm->p2pSchedule = ncclMemoryStackAlloc<ncclComm::P2pSchedulePair>(&comm->memPermanent, nRanks);
+ comm->planner.peers = ncclMemoryStackAlloc<ncclKernelPlanner::Peer>(&comm->memPermanent, nRanks);
+ uint32_t nodeRound = 0;
+ uint32_t nodeDelta = 0;
+ int round = 0;
+ // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N.
+ // Since that formula only produces valid permutations when N is a pow of 2,
+ // we let N = pow2Up(n) and filter out results greater-eq to n.
+ // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8
+ do {
+ if (nodeDelta < nNodes) { // Filter nonsensical node deltas
+ int sendNode = (node + nodeDelta) % nNodes;
+ int recvNode = (node - nodeDelta + nNodes) % nNodes;
+ uint32_t localRound = 0;
+ uint32_t localDelta = 0;
+ do {
+ if (localDelta < nLocals) { // Filter nonsensical node-local deltas
+ int sendLocal = (local + localDelta) % nLocals;
+ int recvLocal = (local - localDelta + nLocals) % nLocals;
+ comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal];
+ comm->p2pSchedule[round].recvRank = flat ? recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal];
+ round += 1;
+ }
+ localRound += 1;
+ localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update
+ } while (localRound != nLocalsPow2);
+ }
+ nodeRound += 1;
+ nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update
+ } while (nodeRound != nNodesPow2);
+
+ if (round != nRanks) {
+ WARN("P2p schedule creation has bugs.");
+ ret = ncclInternalError;
+ goto fail;
+ }
+ } while (0);
+
+ //comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect();
+ comm->runtimeConn = 1;
+ if (comm->runtimeConn) {
+ // for (int c=0; c<comm->nChannels; c++) {
+ // NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+ // }
+ for (int c = 0; c < comm->nChannels; c++) {
+ // (&comm->channels[c])->peers = NULL;
+ NCCLCHECKGOTO(setupChannelNew(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+ }
+ // Attempt to setup NVLS, may silently fail and disable NVLS
+ NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+ // Check if we can setup CollNet
+ if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs);
+ } else {
+ for (int c=0; c<comm->nChannels; c++) {
+ NCCLCHECKGOTO(setupChannelNew(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+ }
+ NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
+
+ // Connect Trees
+ NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
+
+ // Connect PAT only for communicators with 1 GPU per node
+ if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
+
+ // Attempt to setup NVLS, may silently fail and disable NVLS
+ NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
+ NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
+
+ // And NVLS trees if needed
+ NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
+
+ // Check if we can setup CollNet
+ if (comm->config.collnetEnable) {
+ ncclCollNetSetup(comm, parent, graphs);
+ NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
+ if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
+ NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
+ }
+ }
+
+ // Connect to local net proxy
+ NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
+ NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+
+ // Then to remote ones when using PXN
+ if (ncclPxnDisable(comm) == 0) {
+ int nranks;
+ NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
+ for (int r=0; r<nranks; r++) {
+ NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
+ NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
+ }
+ }
+
+ if (ncclParamNvbPreconnect()) {
+ // Connect p2p when using NVB path
+ int nvbNpeers;
+ NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
+ for (int r=0; r<nvbNpeers; r++) {
+ int peer = nvbPeers[r];
+ int sendRound=0, recvRound=0;
+ while (comm->p2pSchedule[sendRound].sendRank != peer) sendRound++;
+ while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++;
+ uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound);
+ uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound);
+ for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+ int channelId;
+ channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c);
+ if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
+ comm->connectSend[peer] |= (1UL<<channelId);
+ }
+ channelId = ncclP2pChannelForPart(comm->p2pnChannels, recvBase, c);
+ if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
+ comm->connectRecv[peer] |= (1UL<<channelId);
+ }
+ }
+ }
+
+ NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
+ }
+ }
+ // for (int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) {
+ // struct ncclProxyOps* ops = proxyOps + r;
+ // INFO(NCCL_INIT,"ncclProxyStart ops->pool %p ops->nextOps %d",ops->pool,ops->nextOps);
+
+ // ops->pool = NULL;
+ // ops->nextOps = 0;
+
+ // }
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
+
+ // Compute time models for algorithm and protocol combinations
+ NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
+
+ INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+
+ if (comm->intraRank == 0) { // Load ncclParamLaunchMode
+ const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
+ enum ncclLaunchMode mode, modeOld;
+ if (str && strcasecmp(str, "GROUP") == 0) {
+ mode = ncclLaunchModeGroup;
+ } else {
+ mode = ncclLaunchModeParallel;
+ }
+ // In theory we could be racing with other communicators not associated with
+ // this one if the user is connecting to multiple ncclUniqueId's concurrently.
+ modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
+ if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
+ INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
+ }
+ }
+
+ comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
+ comm->baseStride = 0;
+
+ // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
+ // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
+ NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
+
+ timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT];
+ /* Local intra-node barrier */
+ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
+ // comm->runtimeConn = 1;
+ // for (int c = 0; c < comm->nChannels; c++) {
+ // //NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings + c * nranks), ret, fail);
+ // //NCCLCHECKGOTO(setupChannel(peerComm, comm, c, rank, nranks, rings + c * nranks), ret, fail);
+ // NCCLCHECKGOTO(setupChannelNew(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+ // }
+
+ // for (int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) {
+ // struct ncclProxyOps* ops = proxyOps + r;
+ // INFO(NCCL_INIT,"ncclProxyStart ops->pool %p ops->nextOps %d",ops->pool,ops->nextOps);
+
+ // ops->pool = NULL;
+ // ops->nextOps = 0;
+
+ // }
+ // // // Setup NVLS
+ // // NCCLCHECKGOTO(ncclNvlsSetup(comm, NULL), ret, fail);
+ // // // Check if we can setup CollNet
+ // // if (comm->collNetSupport > 0) ncclCollNetSetup(comm, NULL, graphs);
+ // // Attempt to setup NVLS, may silently fail and disable NVLS
+ // NCCLCHECKGOTO(ncclNvlsSetup(comm, NULL), ret, fail);
+ // // Check if we can setup CollNet
+ // if (comm->config.collnetEnable) ncclCollNetSetup(comm, NULL, graphs);
+
+ // NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
+
+ // NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
+
+ // timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT];
+ // /* Local intra-node barrier */
+ // NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
+
+ // We should have allocated all buffers, collective fifos, ... we can
+ // restore the affinity.
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+
+exit:
+ // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ // /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
+ // * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
+ // * properly cleaned up. */
+ // if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
+ free(allTopoRanks);
+ free(nodesTreePatterns);
+ free(nodesFirstRank);
+ free(allGather3Data);
+ free(rings);
+ free(nvbPeers);
+ free(pxnPeers);
+ return ret;
+fail:
+ goto exit;
+}
+
+ncclResult_t initTransportsNewRank(struct ncclComm* comm, const struct ncclCommTrans* peerComm) {
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
@@ -702,32 +2244,36 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
struct ncclTopoRanks topoRanks;
int cpuArch;
int cpuVendor;
- int localRanks;
};
int nChannelsOrig;
- struct allGatherInfo *allGather3Data = NULL;
- struct ncclTopoRanks** allTopoRanks = NULL;
+ // struct allGatherInfo *allGather3Data = NULL;
+ struct ncclTopoRanks **allTopoRanks = NULL;
int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
int *rings = NULL;
- int* nvbPeers = NULL;
- struct ncclProxyConnector proxyConn;
- int* pxnPeers = NULL;
+ // int *nvbPeers = NULL;
+ // struct ncclProxyConnector proxyConn;
+ // int *pxnPeers = NULL;
int *topParentLocalRanks = NULL;
+ bool *nodeVis = nullptr, *firstRankVis = nullptr;
int p2pLevel = -1;
- timers[TIMER_INIT_ALLGATHER] = clockNano();
- // AllGather1 - begin
- NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
- NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
- NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
- __atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE);
-
+ // Original AllGather1 - begin
+ INFO(NCCL_INIT, "all rank %d in comm %p", nranks, comm);
+ INFO(NCCL_INIT,"initTransportsNewRank peerComm->nodesTreePatterns: %p",peerComm->nodesTreePatterns);
+ NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks + 1), ret, fail); // Extra rank to represent CollNet root
+ memcpy(comm->peerInfo, peerComm->peerInfo, (nranks - 1) * sizeof(*comm->peerInfo));
+ NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo + rank, comm->commHash), ret, fail);
+ INFO(NCCL_INIT,"peerComm->nodesTreePatterns111: %p",peerComm->nodesTreePatterns);
+ //memcpy(comm->peerInfo + nranks, peerComm->peerInfo + nranks - 1, sizeof(*comm->peerInfo));
+ //NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
+ comm->commHash = peerComm->commHash;
comm->cuMemSupport = 1;
+ // comm->nRanks = peerComm->nRanks+1;
for (int i = 0; i < nranks; i++) {
if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
- i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
+ i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
ret = ncclInvalidUsage;
goto fail;
}
@@ -739,10 +2285,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
goto fail;
}
}
- // AllGather1 - end
- timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
+ // Original AllGather1 - end
- // Check for MNNVL support
+ // MNNVL support
NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail);
if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) {
NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
@@ -799,19 +2344,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
comm->intraBarrierPhase = 0;
comm->intraBarrierCounter = 0;
comm->intraBarrierGate = 0;
- } while(0);
-
- timers[TIMER_INIT_TOPO] = clockNano();
-
- // Dump XML if requested by user
- const char* dumpXmlFile;
- dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
- if (dumpXmlFile) {
- NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
- }
-
+ } while (0);//next win
// Topo detection / System graph creation
+
+ //NCCLCHECKGOTO(ncclTopoGetSystem(peerComm, comm, &comm->topo), ret, fail);
+ //NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
+ //NCCLCHECKGOTO(ncclTopoGetSystemForNew(comm, &comm->topo), ret, fail);
// Compute paths between GPUs and NICs
NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
// Remove inaccessible GPUs and unused NICs
@@ -824,7 +2363,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail);
// Print final topology
NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
- timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO];
// Set Affinity to a CPU local the our GPU, so that all memory we allocate
// on the host is local.
@@ -838,11 +2376,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
if (!collNetSupport(comm)) {
comm->config.collnetEnable = 0;
}
+ INFO(NCCL_INIT,"peerComm->nodesTreePatterns222: %p",peerComm->nodesTreePatterns);
// Determine local Nvls support
NCCLCHECK(ncclNvlsInit(comm));
- timers[TIMER_INIT_GRAPHS] = clockNano();
// Get rings and trees
memset(ringGraph, 0, sizeof(struct ncclTopoGraph));
ringGraph->id = 0;
@@ -889,7 +2427,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail);
}
- timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS];
// Initialize num P2P LL buffers for this communicator
comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1;
@@ -899,66 +2436,96 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail);
}
- // Because timers[[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather,
- // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer.
- timers[TIMER_INIT_CONNECT] = clockNano();
- // AllGather3 - begin
- NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
-
- for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
- allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
- allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
- allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
- allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
- allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
- allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
- allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
- allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
- }
-
- allGather3Data[rank].cpuArch = comm->cpuArch;
- allGather3Data[rank].cpuVendor = comm->cpuVendor;
+ // Original AllGather3 - begin
+ struct ncclTopoRanks myTopoRanks;
+ struct ncclTopoRanks *peerTopoRanks;
comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
- NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
-
- NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
+ NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &myTopoRanks), ret, fail);
- // Determine nNodes, firstRanks, ...
+ // Determine nNodes, nodesFirstRank, rankToNode, cpuArch, cpuVendor
+ // TODO: determine nodesTreePatterns
NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodesFirstRank, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
- for (int r=0; r<nranks; r++) {
+ NCCLCHECKGOTO(ncclCalloc(&comm->nodesTreePatterns, nranks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->peerTopo, nranks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&nodeVis, comm->nRanks), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&firstRankVis, comm->nRanks), ret, fail);
+ if (comm->cpuArch != peerComm->cpuArch &&
+ comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
+ // If peerComm specifies a cpuArch, all ranks must share the same cpuArch;
+ // otherwise, the communicator has a mixed cpuArch.
+ comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
+ }
+ if (comm->cpuVendor != peerComm->cpuVendor &&
+ comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
+ // Same as for cpuArch.
+ comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
+ }
+ INFO(NCCL_INIT,"peerComm->nodesTreePatterns333: %p",peerComm->nodesTreePatterns);
+ // comm->nNodes = peerComm->nNodes;
+ // for (int r = 0; r < nranks; r++) {
+ // if (r == nranks - 1) {
+ // int firstRank = myTopoRanks.ringRecv[0];
+ // if (!firstRankVis[rank]) {
+ // nodesFirstRank[comm->nNodes++] = firstRank;
+ // }
+ // } else {
+ // int node = peerComm->rankToNode[r];
+ // comm->rankToNode[r] = node;
+ // if (!nodeVis[node]) {
+ // nodesFirstRank[node] = r;
+ // nodeVis[node] = true;
+ // firstRankVis[rank] = true;
+ // }
+ // }
+ // }
+ INFO(NCCL_INIT,"nNodes:%d,comm->nNodes:%d",nNodes,comm->nNodes);
+ comm->nNodes = peerComm->nNodes;
+ memcpy(nodesFirstRank, peerComm->nodesFirstRank, peerComm->nRanks * sizeof(int));//postset关键数据结构
+ INFO(NCCL_INIT,"peerComm->nodesTreePatterns444: %p",peerComm->nodesTreePatterns);
+ memcpy(comm->nodesFirstRank, nodesFirstRank, peerComm->nRanks * sizeof(int));//postset关键数据结构
+ INFO(NCCL_INIT,"peerComm->nodesTreePatterns666: %p",peerComm->nodesTreePatterns);
+ memcpy(nodesTreePatterns, peerComm->nodesTreePatterns, peerComm->nRanks * sizeof(int));//postset关键数据结构
+ memcpy(comm->nodesTreePatterns, nodesTreePatterns, peerComm->nRanks * sizeof(int));//postset关键数据结构
+ for (int r = 0; r < nranks; r++) {
int node;
- int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
- for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
- if (node == comm->nNodes) {
- comm->nNodes++;
- nodesFirstRank[node] = firstRank;
- // Record tree pattern of each node as they can be different depending on sm arch
- nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
+ // The new rank (e.g. rank 5) needs to be handled separately
+ if (r == comm->rank) {
+ // The new rank's topoRanks.ringRecv[0] should be itself or be determined by the topology
+ int firstRank = myTopoRanks.ringRecv[0];
+
+ // Check whether an existing node already has this firstRank
+ for (node = 0; node < comm->nNodes && comm->nodesFirstRank[node] != firstRank; node++);
+
+ // If this is a new node
+ if (node == comm->nNodes) {
+ // Record the new node's information
+ comm->nNodes++;
+ nodesFirstRank[node] = firstRank;
+ comm->nodesFirstRank[node] = firstRank;
+ INFO(NCCL_INIT,"nodesFirstRank node %d firstRank %d",node,firstRank);
+ nodesTreePatterns[node] = comm->graphs[NCCL_ALGO_TREE].pattern;
+ comm->nodesTreePatterns[node] = nodesTreePatterns[node];
+ }
+ } else {
+ // For existing ranks (e.g. ranks 0-4 coming from peerComm), reuse peerComm's mapping directly
+ node = peerComm->rankToNode[r];//就是复制24节点,一模一样//很关键的数据结构
}
+
comm->rankToNode[r] = node;
-
- if (comm->cpuArch != allGather3Data[r].cpuArch &&
- comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
- comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
- }
- if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
- comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
- comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
- }
}
+ //nodesTreePatterns[0] = 1;
// Alert the user to the presence of mixed CPUs. In the past this has caused
// locks in some collective routines. This may help debug issues in the future.
- if (rank==0) {
- if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
- INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
- }
- if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
- INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
- }
+ if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
+ INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
+ }
+ if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
+ INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
}
// Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
@@ -989,35 +2556,82 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
- rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
- comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ comm->localRank, comm->localRanks, comm->localRankToRank[0]);
ret = ncclInternalError;
goto fail;
}
INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
- comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
+ comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
nChannelsOrig = comm->nChannels;
NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
- for (int i=0; i<nranks; i++) {
- allTopoRanks[i] = &allGather3Data[i].topoRanks;
- // Make sure we align all ranks so that the tuning is consistent across ranks
- for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
- graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
- graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
- graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
- graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
- graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
- graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
- graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
- }
- comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
+ NCCLCHECKGOTO(ncclCalloc(&peerTopoRanks, comm->nRanks-1),ret,fail);
+ for (int i=0; i<comm->nRanks-1; i++) {
+ peerTopoRanks[i] = peerComm->peerTopo[i];
+ //peerTopoRanks[i] = myTopoRanks;
+ allTopoRanks[i] = &(peerTopoRanks[i]);//也是这个函数结束释放,所以不用拷贝
+ comm->peerTopo[i] = peerTopoRanks[i];
+ }
+
+ // for()
+ allTopoRanks[comm->nRanks-1] = &myTopoRanks;
+ comm->peerTopo[comm->nRanks-1] = myTopoRanks;
+ // for(int i=0; i<comm->nRanks; i++) {
+ // comm->peerTopo[i] = *allTopoRanks[i];
+ // }
+ for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
+ INFO(NCCL_INIT," before Algorithm %d: Pattern=%d, nChannels=%d, sameChannels=%d, "
+ "bwIntra=%.2f, bwInter=%.2f, typeIntra=%d, typeInter=%d, crossNic=%d\n",
+ a,
+ graphs[a]->pattern,
+ graphs[a]->nChannels,
+ graphs[a]->sameChannels,
+ graphs[a]->bwIntra,
+ graphs[a]->bwInter,
+ graphs[a]->typeIntra,
+ graphs[a]->typeInter,
+ graphs[a]->crossNic);
+ graphs[a]->nChannels = std::max(graphs[a]->nChannels, peerComm->graphs[a].nChannels); // only available in single node case
+ graphs[a]->sameChannels = std::max(graphs[a]->sameChannels, peerComm->graphs[a].sameChannels); // only available in single node case
+ graphs[a]->bwIntra = std::min(graphs[a]->bwIntra, peerComm->graphs[a].bwIntra);
+ graphs[a]->bwInter = std::min(graphs[a]->bwInter, peerComm->graphs[a].bwInter);
+ graphs[a]->typeIntra = std::max(graphs[a]->typeIntra, peerComm->graphs[a].typeIntra);
+ graphs[a]->typeInter = std::max(graphs[a]->typeInter, peerComm->graphs[a].typeInter);
+ graphs[a]->crossNic = std::max(graphs[a]->crossNic, peerComm->graphs[a].crossNic);
+ INFO(NCCL_INIT," Algorithm %d: Pattern=%d, nChannels=%d, sameChannels=%d, "
+ "bwIntra=%.2f, bwInter=%.2f, typeIntra=%d, typeInter=%d, crossNic=%d\n",
+ a,
+ graphs[a]->pattern,
+ graphs[a]->nChannels,
+ graphs[a]->sameChannels,
+ graphs[a]->bwIntra,
+ graphs[a]->bwInter,
+ graphs[a]->typeIntra,
+ graphs[a]->typeInter,
+ graphs[a]->crossNic);
}
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0;
if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
+ // Set allTopoRanks for single node case
+
+ comm->nChannels = 1;
+ // for (int r = 0; r < nranks; r++) {
+ // NCCLCHECKGOTO(ncclCalloc(&allTopoRanks[r], 1), ret, fail);
+ // for (int c = 0; c < comm->nChannels; c++) {
+ // allTopoRanks[r]->ringRecv[c] = 0;
+ // allTopoRanks[r]->ringSend[c] = nranks - 1;
+ // allTopoRanks[r]->ringPrev[c] = r - 1;
+ // allTopoRanks[r]->ringNext[c] = r + 1 < nranks ? r + 1 : -1;
+ // allTopoRanks[r]->treeToChild0[c] = 1;
+ // allTopoRanks[r]->treeToChild1[c] = 1;
+ // }
+ // }
+
comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
+ comm->nChannels = 1;
if (comm->nChannels < nChannelsOrig) {
// We started duplicating channels during Preset(), so we need to move the
// duplicated channels since we have removed some.
@@ -1031,14 +2645,26 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
comm->config.collnetEnable = 0;
}
+ // // As long as there is more than 1 rank on any node, we need to disable collnet reg
}
NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink));
comm->isOneRPN = (comm->maxLocalRanks == 1);
- NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
- NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
- // AllGather3 - end
- timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT];
+ NCCLCHECKGOTO(ncclCalloc(&rings, nranks* MAXCHANNELS), ret, fail);
+ //comm->nChannels = 4;
+ INFO(NCCL_INIT, "111Connecting rings %d", comm->nChannels);
+ //struct ncclChannel* channel1 = comm->channels + 0;
+ INFO(NCCL_INIT, "end Connecting channel Id prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
+ NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, nullptr), ret, fail);
+ NCCLCHECKGOTO(ncclCalloc(&comm->peerRings, comm->nRanks * MAXCHANNELS), ret, fail);
+ for (int r = 0; r < nranks; r++) {
+ for (int c = 0; c < MAXCHANNELS; c++) {
+ int src_idx = r * MAXCHANNELS + c;
+ int dst_idx = r * MAXCHANNELS + c;
+ comm->peerRings[dst_idx] = rings[src_idx];
+ }
+ }
+ // Original AllGather3 - end
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
@@ -1048,7 +2674,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
struct ncclTree* tree = &comm->channels[c].tree;
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
- INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
+ INFO(NCCL_GRAPH, "Ring1 %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
}
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);
@@ -1057,10 +2683,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
// Compute nChannels per peer for p2p
NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
-
+ INFO(NCCL_INIT, "end Connecting channel Id prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
/* until now, all info of comm should be known. We can initialize shared resources and
- * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
- * all proxy operations. */
+ * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before
+ * all proxy operations. */
if (comm->sharedRes->owner == comm) {
comm->sharedRes->tpNLocalRanks = comm->localRanks;
comm->sharedRes->magic = comm->magic;
@@ -1068,6 +2694,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
comm->sharedRes->tpP2pNChannels = comm->p2pnChannels;
memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks);
}
+ // INFO(NCCL_INIT, "end Connecting channel Id prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail);
for (int i = 0; i < comm->localRanks; ++i) {
int tpRank = comm->topParentRanks[comm->localRankToRank[i]];
@@ -1075,20 +2702,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
comm->topParentLocalRanks = topParentLocalRanks;
- // Profiler plugin context has to be initialized before proxy thread
- NCCLCHECK(ncclProfilerPluginInit(comm));
-
NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
// Launch proxy service thread, after this, the proxy calls can be used.
- if (parent && parent->shareResources) {
- comm->proxyState = parent->sharedRes->proxyState;
- ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
- } else {
- NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
- }
+ NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
-
- timers[TIMER_INIT_CONNECT] = clockNano();
+ INFO(NCCL_INIT, "end Connecting channel Id prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
do { // Build p2p schedule
int node = comm->node;
int nNodes = comm->nNodes;
@@ -1145,136 +2763,48 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
} while (0);
- comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect();
- if (comm->runtimeConn) {
- for (int c=0; c<comm->nChannels; c++) {
- NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
- }
- // Attempt to setup NVLS, may silently fail and disable NVLS
- NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
- // Check if we can setup CollNet
- if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs);
- } else {
- for (int c=0; c<comm->nChannels; c++) {
- NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
- }
- NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
-
- // Connect Trees
- NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
-
- // Connect PAT only for communicators with 1 GPU per node
- if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
-
- // Attempt to setup NVLS, may silently fail and disable NVLS
- NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
- NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
-
- // And NVLS trees if needed
- NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
-
- // Check if we can setup CollNet
- if (comm->config.collnetEnable) {
- ncclCollNetSetup(comm, parent, graphs);
- NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
- if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
- NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
- }
- }
-
- // Connect to local net proxy
- NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
- NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
-
- // Then to remote ones when using PXN
- if (ncclPxnDisable(comm) == 0) {
- int nranks;
- NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
- for (int r=0; r<nranks; r++) {
- NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
- NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
- }
- }
-
- if (ncclParamNvbPreconnect()) {
- // Connect p2p when using NVB path
- int nvbNpeers;
- NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
- for (int r=0; r<nvbNpeers; r++) {
- int peer = nvbPeers[r];
- int sendRound=0, recvRound=0;
- while (comm->p2pSchedule[sendRound].sendRank != peer) sendRound++;
- while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++;
- uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound);
- uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound);
- for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
- int channelId;
- channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c);
- if (comm->channels[channelId].peers[peer]->send[1].connected == 0) {
- comm->connectSend[peer] |= (1UL<<channelId);
- }
- channelId = ncclP2pChannelForPart(comm->p2pnChannels, recvBase, c);
- if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) {
- comm->connectRecv[peer] |= (1UL<<channelId);
- }
- }
- }
-
- NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
- }
- }
-
- TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
-
- // Compute time models for algorithm and protocol combinations
- NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
-
- INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
-
- if (comm->intraRank == 0) { // Load ncclParamLaunchMode
- const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
- enum ncclLaunchMode mode, modeOld;
- if (str && strcasecmp(str, "GROUP") == 0) {
- mode = ncclLaunchModeGroup;
- } else {
- mode = ncclLaunchModeParallel;
- }
- // In theory we could be racing with other communicators not associated with
- // this one if the user is connecting to multiple ncclUniqueId's concurrently.
- modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
- if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
- INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
- }
+ //comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect();
+ comm->runtimeConn = 1;
+ for (int c = 0; c < comm->nChannels; c++) {
+ //NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings + c * nranks), ret, fail);
+ //NCCLCHECKGOTO(setupChannel(peerComm, comm, c, rank, nranks, rings + c * nranks), ret, fail);
+ NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
+ }
+ // // Setup NVLS
+ // NCCLCHECKGOTO(ncclNvlsSetup(comm, NULL), ret, fail);
+ // // Check if we can setup CollNet
+ // if (comm->collNetSupport > 0) ncclCollNetSetup(comm, NULL, graphs);
+ // Attempt to setup NVLS, may silently fail and disable NVLS
+ NCCLCHECKGOTO(ncclNvlsSetup(comm, NULL), ret, fail);
+ // Check if we can setup CollNet
+ if (comm->config.collnetEnable) ncclCollNetSetup(comm, NULL, graphs);
+ INFO(NCCL_INIT, "end Connecting channel Id prev %d, next %d", (comm->channels + 0)->ring.prev, (comm->channels + 0)->ring.next);
+ // for (int c = 0; c < comm->nChannels; c++) {
+ // struct ncclChannel* channel = comm->channels + c;
+ // NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
+ // }
+
+ for (int i = 1; i < comm->nRanks; i++) {
+ int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
+ int sendPeer = (comm->rank + i) % comm->nRanks;
+ uint64_t recvMask = comm->connectRecv[recvPeer];
+ uint64_t sendMask = comm->connectSend[sendPeer];
+ //INFO(NCCL_INIT,"十六进制(小写): 0x%" PRIx64 "\n", recvMask);
+ INFO(NCCL_INIT, "first i %d:两个十六进制值: 0x%" PRIx64 " 0x%" PRIx64 "\n", i, recvMask, sendMask);
}
- comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
- comm->baseStride = 0;
-
- // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
- // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
- NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
-
- timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT];
- /* Local intra-node barrier */
- NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
-
- // We should have allocated all buffers, collective fifos, ... we can
- // restore the affinity.
- TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
-
exit:
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
/* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
- * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
- * properly cleaned up. */
- if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
- free(allTopoRanks);
- free(nodesTreePatterns);
- free(nodesFirstRank);
- free(allGather3Data);
- free(rings);
- free(nvbPeers);
- free(pxnPeers);
+ * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
+ * properly cleaned up. */
+ if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
+ if (allTopoRanks) free(allTopoRanks);
+ if (nodesTreePatterns) free(nodesTreePatterns);
+ if (nodesFirstRank) free(nodesFirstRank);
+ if (rings) free(rings);
+ if (nodeVis) free(nodeVis);
+ if (firstRankVis) free(firstRankVis);
return ret;
fail:
goto exit;
@@ -1426,7 +2956,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
eatHash(hacc, &job->color);
comm->commHash = digestHash(hacc);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName,
- comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
+ comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
@@ -1439,7 +2969,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
// obtain a unique hash using the first commId
comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
- comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
+ comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
NCCLCHECKGOTO(bootstrapInit(job->nId, (struct ncclBootstrapHandle*)job->commId, comm), res, fail);
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
@@ -1462,23 +2992,23 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
__atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d - Init COMPLETE", job->funcName,
- comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
+ comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
} else {
// the name for the replay tool is ncclCommInitRank for all the variations
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", job->funcName,
- comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
+ comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
}
sum_timers = 0.0;
for (int it = 1; it < TIMERS_INIT_COUNT; ++it)
sum_timers += (timers[it] / 1e9);
INFO(NCCL_INIT | NCCL_PROFILE,
- "Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
- "connections %.2f, rest %.2f)",
- job->funcName, comm->rank, comm->nRanks,
- timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
- timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
- timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
+ "Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
+ "connections %.2f, rest %.2f)",
+ job->funcName, comm->rank, comm->nRanks,
+ timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
+ timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
+ timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
exit:
if (job->newcomm) {
/* assign it to user pointer. */
@@ -1626,7 +3156,7 @@ static ncclResult_t copyCommConfig(ncclComm_t childComm, ncclComm_t parnet) {
return ncclSuccess;
}
-static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
+ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
ncclResult_t ret = ncclSuccess;
/* config must not be NULL in this function */
ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER;
@@ -1817,6 +3347,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
// copying into allocated memory guarantees that the memory is properly aligned for any objects, removing that issue
NCCLCHECKGOTO(ncclCalloc(&job->commId, nId), res, fail);
memcpy(job->commId, commId, nId * NCCL_UNIQUE_ID_BYTES);
+
commIdEnv = ncclGetEnv("NCCL_COMM_ID");
if (commIdEnv && myrank == 0) {
@@ -1859,7 +3390,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema,
NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev));
-
+ printf("ncclCommInitRank magic %" PRIu64 "\n", (*newcomm)->magic);
return ncclSuccess;
}
@@ -2121,7 +3652,7 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) {
ncclComm_t nextIntraComm = intracomm0;
/* this is the last call to ncclCommDestroy/Abort, we need to make sure all comms
- * in the process have been finalized before we free local resources. */
+ * in the process have been finalized before we free local resources. */
while (nextIntraComm) {
curIntraComm = nextIntraComm;
curRank = curIntraComm->rank;
@@ -2215,7 +3746,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
NCCLCHECK(setCommAbortFlags(comm,1));
comm->destroyFlag = 1;
/* init thread must be joined before we destroy the comm,
- * and we should ignore the init error here. */
+ * and we should ignore the init error here. */
(void)ncclCommEnsureReady(comm);
// once the comm is ready, we can access ranks etc
@@ -2352,12 +3883,17 @@ ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int exclude
NCCLCHECKGOTO(setCommAbortFlags(comm, 0), res, exit);
}
NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/true, shrinkFlags, /*color=*/0, /*key=*/comm->rank, excludeRanksList, excludeRanksCount, config, __func__), res, exit);
+ // INFO(NCCL_INIT,"ncclCommShrink magic %d", (*newcomm)->magic);
+ // printf("ncclCommShrink point %p\n", (*newcomm));
+ // printf("ncclCommShrink magic %" PRIu64 "\n", (*newcomm)->magic);
if (*newcomm) NVTX3_RANGE_ADD_PAYLOAD(CommShrink, NcclNvtxParamsCommShrinkSchema, NVTX3_PAYLOAD(comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, excludeRanksCount));
exit:
(void)ncclGroupErrCheck(res);
NCCLCHECK(ncclGroupEndInternal());
+ printf("ncclCommShrink magic %" PRIu64 "\n", (*newcomm)->magic);
+ // printf("ncclCommShrink point %p\n", (*newcomm));
return res;
}
@@ -2367,7 +3903,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
ncclResult_t res = ncclSuccess;
NCCLCHECK(ncclGroupStartInternal());
- NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/false, /*shrink mode=*/NCCL_SHRINK_DEFAULT, color, key, NULL, 0, config, __func__), res, exit);
+ NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/false, /*shrink mode=*/NCCL_SHRINK_DEFAULT, color, key, NULL, 0, config, __func__), res, exit);//
if (*newcomm)
NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key));
@@ -2394,8 +3930,8 @@ const char* ncclGetErrorString(ncclResult_t code) {
}
/* Returns a human-readable message of the last error that occurred.
- * comm is currently unused and can be set to NULL
- */
+* comm is currently unused and can be set to NULL
+*/
NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
const char* ncclGetLastError(ncclComm_t comm) {
return ncclLastError;
@@ -2436,7 +3972,6 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
NCCLCHECK(CommCheck(comm, "CommCuDevice", "comm"));
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
-
NCCLCHECK(ncclCommEnsureReady(comm));
*devid = comm->cudaDev;
new file mode 100644
@@ -0,0 +1,339 @@
+#include "lighthouse.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define LH_STATE_OK 0 // operation succeeded
+#define LH_STATE_EMPTY 1 // returned by loadLhState when EOF is hit on the first read
+#define LH_STATE_ERROR -1 // any failure (bad argument, I/O error, timeout)
+
+struct LhTxn { // one open, fcntl-locked handle on the lighthouse file
+ FILE* fp; // stream opened by txnBegin
+ int fd; // fileno(fp); the descriptor the lock is taken on
+ int writable; // non-zero when opened for writing; txnSave refuses otherwise
+};
+
+struct LhRank { // saved/loaded as one raw struct image by saveLhState/loadLhState
+ uint32_t rank_id; // rank index
+ union ncclSocketAddress addr; // socket address recorded for that rank
+};
+
+struct LhState { // contents of the lighthouse file, in the order saveLhState writes them
+ uint64_t version; // set to 1 by initialize(), incremented by updateVersion()
+ uint32_t nranks; // rank count; also set by setFirstRank()/setLastRank()
+ uint64_t magic;
+ // Each entry below is written and read as a whole struct LhRank.
+ struct LhRank first_rank;
+ struct LhRank last_rank;
+ struct LhRank new_rank; // copied over last_rank by updateLastRankAddr()
+};
+
+// Request a whole-file advisory lock of `type` on fd with F_SETLKW, i.e.
+// waiting for conflicting locks. Returns fcntl()'s result (-1 on failure).
+static int lockFd(int fd, short type) {
+  struct flock request;
+  memset(&request, 0, sizeof(request));
+  request.l_whence = SEEK_SET;
+  request.l_start = request.l_len = 0;  // offset 0, length 0: the entire file
+  request.l_type = type;
+  return fcntl(fd, F_SETLKW, &request);
+}
+
+// Release the whole-file advisory lock on fd with a non-waiting F_SETLK
+// request of type F_UNLCK. Returns fcntl()'s result (-1 on failure).
+static int unlockFd(int fd) {
+  struct flock request;
+  memset(&request, 0, sizeof(request));
+  request.l_whence = SEEK_SET;
+  request.l_start = request.l_len = 0;  // offset 0, length 0: the entire file
+  request.l_type = F_UNLCK;
+  return fcntl(fd, F_SETLK, &request);
+}
+
+// Write `state` to fp as six consecutive raw fields: version, nranks, magic,
+// first_rank, last_rank, new_rank. Stops at the first failed write, prints
+// which field failed to stderr and returns LH_STATE_ERROR; LH_STATE_OK otherwise.
+static int saveLhState(FILE* fp, const struct LhState* state) {
+  if (fp == NULL) {
+    fprintf(stderr,
+            "Invalid file pointer for saving lighthouse state\n");
+    return LH_STATE_ERROR;
+  }
+
+  // One row per field, in on-disk order, with the message for a failed write.
+  const struct {
+    const void* data;
+    size_t bytes;
+    const char* error;
+  } fields[] = {
+    {&state->version, sizeof(state->version), "Failed to write lighthouse state version\n"},
+    {&state->nranks, sizeof(state->nranks), "Failed to write lighthouse state nranks\n"},
+    {&state->magic, sizeof(state->magic), "Failed to write lighthouse state magic\n"},
+    {&state->first_rank, sizeof(struct LhRank), "Failed to write lighthouse state first_rank\n"},
+    {&state->last_rank, sizeof(struct LhRank), "Failed to write lighthouse state last_rank\n"},
+    {&state->new_rank, sizeof(struct LhRank), "Failed to write lighthouse state new_rank\n"},
+  };
+
+  for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
+    // A single item is requested, so fwrite returns 1 on success and 0 on failure.
+    if (fwrite(fields[i].data, fields[i].bytes, 1, fp) != 1) {
+      fprintf(stderr, "%s", fields[i].error);
+      return LH_STATE_ERROR;
+    }
+  }
+
+  return LH_STATE_OK;
+}
+
+// Read the six raw fields written by saveLhState (version, nranks, magic,
+// first_rank, last_rank, new_rank) from fp into `state`. Returns
+// LH_STATE_EMPTY with `state` zeroed if EOF is hit on the very first read,
+// LH_STATE_ERROR (message on stderr) on any other short read, else LH_STATE_OK.
+static int loadLhState(FILE* fp, struct LhState* state) {
+  if (fp == NULL) {
+    fprintf(stderr,
+            "Invalid file pointer for loading lighthouse state\n");
+    return LH_STATE_ERROR;
+  }
+
+  // One row per field, in on-disk order, with the message for a failed read.
+  const struct {
+    void* data;
+    size_t bytes;
+    const char* error;
+  } fields[] = {
+    {&state->version, sizeof(state->version), "Failed to read lighthouse state version\n"},
+    {&state->nranks, sizeof(state->nranks), "Failed to read lighthouse state nranks\n"},
+    {&state->magic, sizeof(state->magic), "Failed to read lighthouse state magic\n"},
+    {&state->first_rank, sizeof(struct LhRank), "Failed to read lighthouse state first_rank\n"},
+    {&state->last_rank, sizeof(struct LhRank), "Failed to read lighthouse state last_rank\n"},
+    {&state->new_rank, sizeof(struct LhRank), "Failed to read lighthouse state new_rank\n"},
+  };
+  const size_t nfields = sizeof(fields) / sizeof(fields[0]);
+
+  for (size_t i = 0; i < nfields; i++) {
+    if (fread(fields[i].data, fields[i].bytes, 1, fp) == 1) {
+      continue;
+    }
+    // EOF before a complete version field: treat the file as empty, not broken.
+    if (i == 0 && feof(fp)) {
+      memset(state, 0, sizeof(struct LhState));
+      return LH_STATE_EMPTY;
+    }
+    fprintf(stderr, "%s", fields[i].error);
+    return LH_STATE_ERROR;
+  }
+  return LH_STATE_OK;
+}
+
+// Poll the lighthouse file at `path` every 100 ms until its stored version
+// equals expected_version. Returns LH_STATE_OK on a match, LH_STATE_ERROR on
+// timeout (a negative timeout_ms never times out), or the txnBegin/txnLoad status.
+int txnWaitForVersion(const char* path, uint64_t expected_version, int timeout_ms) {
+  const int sleep_interval_us = 100 * 1000;
+  int waited_ms = 0;
+
+  while (1) {
+    struct LhTxn* lhTxn = NULL;
+    struct LhState* lhState = NULL;
+    int ret = txnBegin(path, 0, &lhTxn);
+    if (ret != LH_STATE_OK) {
+      fprintf(stderr, "lighthouse: txnBegin failed\n");
+      return ret;
+    }
+    ret = txnLoad(lhTxn, &lhState);
+    // txnEnd only unlocks/closes; the LhTxn malloc'ed by txnBegin is ours to free.
+    txnEnd(lhTxn);
+    free(lhTxn);
+    if (ret != LH_STATE_OK) {
+      fprintf(stderr, "lighthouse: txnLoad failed\n");
+      return ret;
+    }
+    uint64_t version = lhState->version;
+    free(lhState);
+    if (version == expected_version) {
+      return LH_STATE_OK;
+    }
+    if (timeout_ms >= 0 && waited_ms >= timeout_ms) {
+      fprintf(stderr, "lighthouse: timeout waiting for version %llu\n",
+              (unsigned long long)expected_version);
+      return LH_STATE_ERROR;
+    }
+    usleep(sleep_interval_us);
+    waited_ms += sleep_interval_us / 1000;
+  }
+}
+
+// Open `path` and take a whole-file fcntl lock on it: exclusive when `write`
+// is non-zero (retrying with "w+b" if "r+b" fails), shared otherwise. On success
+// *out gets a malloc'ed LhTxn; end it with txnEnd() and then free() it.
+int txnBegin(const char* path, int write, struct LhTxn** out) {
+  if (!path || !out) return LH_STATE_ERROR;
+  struct LhTxn* txn = (struct LhTxn*)malloc(sizeof(*txn));
+  if (!txn) return LH_STATE_ERROR;
+  txn->writable = write;
+  txn->fp = fopen(path, write ? "r+b" : "rb");
+  if (!txn->fp && write) {
+    txn->fp = fopen(path, "w+b");  // fallback when the file cannot be opened for update
+  }
+  if (!txn->fp) {
+    free(txn);  // previously leaked on open failure
+    return LH_STATE_ERROR;
+  }
+  txn->fd = fileno(txn->fp);
+  if (lockFd(txn->fd, write ? F_WRLCK : F_RDLCK) < 0) {
+    fclose(txn->fp);
+    free(txn);  // previously leaked on lock failure
+    return LH_STATE_ERROR;
+  }
+  *out = txn;
+  return LH_STATE_OK;
+}
+
+// Read the state from the start of txn's file into a malloc'ed LhState stored
+// in *out (caller frees). An empty file yields a zeroed state and LH_STATE_OK.
+int txnLoad(struct LhTxn* txn, struct LhState** out) {
+  if (!txn || !txn->fp || !out) return LH_STATE_ERROR;
+  rewind(txn->fp);
+
+  struct LhState* state = (struct LhState*)malloc(sizeof(struct LhState));
+  if (!state) return LH_STATE_ERROR;
+  if (loadLhState(txn->fp, state) == LH_STATE_ERROR) {
+    free(state);  // previously leaked when the file could not be read
+    return LH_STATE_ERROR;
+  }
+  *out = state;
+  return LH_STATE_OK;
+}
+
+// Rewrite the file behind `txn` with `state` and force it to disk. Fails with
+// errno = EPERM on a read-only txn; write/flush/sync errors also return ERROR.
+int txnSave(struct LhTxn* txn, const struct LhState* state) {
+  if (!txn->writable) {
+    errno = EPERM;
+    return LH_STATE_ERROR;
+  }
+
+  rewind(txn->fp);
+  // The save only counts once it is flushed and synced; report any failure.
+  if (saveLhState(txn->fp, state) < 0 || fflush(txn->fp) != 0 || fsync(txn->fd) != 0) {
+    return LH_STATE_ERROR;
+  }
+
+  return LH_STATE_OK;
+}
+
+// Drop the lock and close the stream held by txn, then zero the struct.
+// Does not free txn itself. Returns LH_STATE_ERROR only if unlocking failed.
+int txnEnd(struct LhTxn* txn) {
+  int status = LH_STATE_OK;
+  FILE* stream = txn->fp;
+  if (stream != NULL) {
+    // Unlock explicitly before closing so a failure can be reported.
+    status = (unlockFd(txn->fd) < 0) ? LH_STATE_ERROR : LH_STATE_OK;
+    fclose(stream);
+  }
+  memset(txn, 0, sizeof(*txn));
+  return status;
+}
+
+// Reset `state` to version 1 for `nranks` ranks: rank 0 and rank nranks-1
+// (addresses taken from src_addrs) become first_rank and last_rank.
+// Returns LH_STATE_ERROR instead of indexing src_addrs[-1] when nranks < 1.
+int initialize(struct LhState* state, const union ncclSocketAddress* src_addrs, int nranks, uint64_t magic) {
+  if (!state || !src_addrs || nranks < 1) return LH_STATE_ERROR;
+  memset(state, 0, sizeof(struct LhState));
+  state->version = 1;
+  state->nranks = nranks;
+  state->magic = magic;
+  state->first_rank.rank_id = 0;
+  memcpy(&state->first_rank.addr, &src_addrs[0], sizeof(union ncclSocketAddress));
+  state->last_rank.rank_id = nranks - 1;
+  memcpy(&state->last_rank.addr, &src_addrs[nranks - 1], sizeof(union ncclSocketAddress));
+  return LH_STATE_OK;
+}
+
+// Store `magic` in the lighthouse state.
+void setMagic(struct LhState* state, uint64_t magic) {
+  state->magic = magic;
+}
+
+// Record `rank` and its address in first_rank and set the total rank count.
+int setFirstRank(struct LhState* state, const union ncclSocketAddress* firstRankNcclAddr, uint32_t rank, uint32_t nranks) {
+  memcpy(&state->first_rank.addr, firstRankNcclAddr, sizeof(union ncclSocketAddress));
+  state->first_rank.rank_id = rank;
+  state->nranks = nranks;
+  return LH_STATE_OK;
+}
+
+// Record `rank` and its address in last_rank and set the total rank count.
+int setLastRank(struct LhState* state, const union ncclSocketAddress* lastRankNcclAddr, uint32_t rank, uint32_t nranks) {
+  memcpy(&state->last_rank.addr, lastRankNcclAddr, sizeof(union ncclSocketAddress));
+  state->last_rank.rank_id = rank;
+  state->nranks = nranks;
+  return LH_STATE_OK;
+}
+
+// Record `rank` and its address in new_rank; nranks is left untouched.
+int setNewRank(struct LhState* state, const union ncclSocketAddress* newRankNcclAddr, uint32_t rank) {
+  memcpy(&state->new_rank.addr, newRankNcclAddr, sizeof(union ncclSocketAddress));
+  state->new_rank.rank_id = rank;
+  return LH_STATE_OK;
+}
+
+// Copy new_rank (rank id and address) over last_rank.
+void updateLastRankAddr(struct LhState* state) {
+  state->last_rank = state->new_rank;
+}
+
+// Advance the state's version counter by one.
+void updateVersion(struct LhState* state) {
+  state->version += 1;
+}
+
+// Copy the state's magic value into *magic.
+void getMagic(const struct LhState* state, uint64_t* magic) {
+  *magic = state->magic;
+}
+
+// Copy the state's version counter into *version.
+void getVersion(const struct LhState* state, uint64_t* version) {
+  *version = state->version;
+}
+
+// Copy first_rank's address into *nextAddr; always returns LH_STATE_OK.
+int queryNextRankAddrNew(const struct LhState* state, union ncclSocketAddress* nextAddr) {
+  memcpy(nextAddr, &state->first_rank.addr, sizeof(union ncclSocketAddress));
+  return LH_STATE_OK;
+}
+
+// Copy new_rank's address into *nextAddr; always returns LH_STATE_OK.
+int queryNextRankAddrLast(const struct LhState* state, union ncclSocketAddress* nextAddr) {
+  memcpy(nextAddr, &state->new_rank.addr, sizeof(union ncclSocketAddress));
+  return LH_STATE_OK;
+}
+
+// Print one rank entry. The address is read through the union's `sin` member
+// (presumably sockaddr_in), so port/address are only meaningful for IPv4.
+static void printLhRank(const char* label, const struct LhRank* rank) {
+  printf(" %s Rank ID: %u\n", label, rank->rank_id);
+  printf(" Address Family: %u\n", rank->addr.sin.sin_family);
+  printf(" Port: %u\n", ntohs(rank->addr.sin.sin_port));
+  printf(" Address: %x\n", rank->addr.sin.sin_addr.s_addr);
+}
+// Print every field of the lighthouse state to stdout.
+void printLhState(const struct LhState* state) {
+  printf("Lighthouse State:\n");
+  // uint64_t is not `unsigned long` on every ABI; cast to match the format.
+  printf(" Version: %llu\n", (unsigned long long)state->version);
+  printf(" Nranks: %u\n", state->nranks);
+  printf(" Magic: %llu\n", (unsigned long long)state->magic);
+  printLhRank("First", &state->first_rank);
+  printLhRank("Last", &state->last_rank);
+  printLhRank("New", &state->new_rank);
+}
\ No newline at end of file
@@ -101,17 +101,17 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void
goto fail;
}
- if (create) {
- *(int*)(hptr + shmSize) = refcount;
- } else {
- int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize));
- if (remref == 0) {
- /* the last peer has completed attachment, it should unlink the shm mem file. */
- if (unlink(shmPath) != 0) {
- INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno);
- }
- }
- }
+ // if (create) {
+ // *(int*)(hptr + shmSize) = refcount;
+ // } else {
+ // int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize));
+ // if (remref == 0) {
+ // /* the last peer has completed attachment, it should unlink the shm mem file. */
+ // if (unlink(shmPath) != 0) {
+ // INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno);
+ // }
+ // }
+ // }
if (devShmPtr) {
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
@@ -4,981 +4,985 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "socket.h"
-#include "utils.h"
-#include <stdlib.h>
-
-#include <unistd.h>
-#include <ifaddrs.h>
-#include <net/if.h>
-#include "param.h"
-#include <time.h>
-
-NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34);
-NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100);
-static void msleep(unsigned int time_msec) {
- const long c_1e6 = 1e6;
- struct timespec tv = (struct timespec){
- .tv_sec = time_msec / 1000,
- .tv_nsec = (time_msec % 1000) * c_1e6,
- };
- nanosleep(&tv, NULL);
-}
-
-static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
- int bytes = 0;
- *closed = 0;
- char* data = (char*)ptr;
- char line[SOCKET_NAME_MAXLEN+1];
- do {
- if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
- if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
- if (op == NCCL_SOCKET_RECV && bytes == 0) {
- *closed = 1;
- return ncclSuccess;
- }
- if (bytes == -1) {
- if ((op == NCCL_SOCKET_SEND && errno == EPIPE) || (op == NCCL_SOCKET_RECV && errno == ECONNRESET)) {
- *closed = 1;
- return ncclSuccess;
- }
- if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
- WARN("socketProgressOpt: Call to %s %s failed : %s", (op == NCCL_SOCKET_RECV ? "recv from" : "send to"),
- ncclSocketToString(&sock->addr, line), strerror(errno));
- return ncclRemoteError;
- } else {
- bytes = 0;
- }
- }
- (*offset) += bytes;
- if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) {
- INFO(NCCL_NET, "socketProgressOpt: abort called");
- return ncclInternalError;
- }
- } while (sock->asyncFlag == 0 && bytes > 0 && (*offset) < size);
- return ncclSuccess;
-}
-
-static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* pclosed = NULL) {
- int closed;
- NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
- if (closed) {
- if (pclosed) {
- *pclosed = closed;
- return ncclSuccess;
- } else {
- char line[SOCKET_NAME_MAXLEN+1];
- WARN("socketProgress: Connection closed by remote peer %s",
- ncclSocketToString(&sock->addr, line, /*numericHostForm*/0));
- return ncclRemoteError;
- }
- }
- return ncclSuccess;
-}
-
-static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
- while (*offset < size)
- NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
- return ncclSuccess;
-}
-
-/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
- *
- * Output: "IPv4/IPv6 address<port>"
- */
-const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
- const struct sockaddr *saddr;
- char host[NI_MAXHOST], service[NI_MAXSERV];
- int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
- if (buf == NULL || addr == NULL) goto fail;
- saddr = &addr->sa;
- if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) goto fail;
- /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
- * (When not set, this will still happen in case the node's name cannot be determined.)
- */
- if (getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag)) goto fail;
- sprintf(buf, "%s<%s>", host, service);
- return buf;
-fail:
- if (buf)
- buf[0] = '\0';
- return buf;
-}
-
-static uint16_t socketToPort(union ncclSocketAddress *addr) {
- struct sockaddr *saddr = &addr->sa;
- return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
-}
-
-/* Allow the user to force the IPv4/IPv6 interface selection */
-static int envSocketFamily(void) {
- int family = -1; // Family selection is not forced, will use first one found
- const char* env = ncclGetEnv("NCCL_SOCKET_FAMILY");
- if (env == NULL)
- return family;
-
- INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
-
- if (strcmp(env, "AF_INET") == 0)
- family = AF_INET; // IPv4
- else if (strcmp(env, "AF_INET6") == 0)
- family = AF_INET6; // IPv6
- return family;
-}
-
-static ncclResult_t findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family,
- int maxIfNameSize, int maxIfs, int* found) {
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
-#endif
- struct netIf userIfs[MAX_IFS];
- bool searchNot = prefixList && prefixList[0] == '^';
- if (searchNot) prefixList++;
- bool searchExact = prefixList && prefixList[0] == '=';
- if (searchExact) prefixList++;
- int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
-
- *found = 0;
- struct ifaddrs *interfaces, *interface;
- SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
- for (interface = interfaces; interface && *found < maxIfs; interface = interface->ifa_next) {
- if (interface->ifa_addr == NULL) continue;
-
- /* We only support IPv4 & IPv6 */
- int family = interface->ifa_addr->sa_family;
- if (family != AF_INET && family != AF_INET6)
- continue;
-
- TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line));
-
- /* Allow the caller to force the socket family type */
- if (sock_family != -1 && family != sock_family)
- continue;
-
- /* We also need to skip IPv6 loopback interfaces */
- if (family == AF_INET6) {
- struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
- if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
- }
-
- // check against user specified interfaces
- if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
- continue;
- }
-
- // Check that this interface has not already been saved
- // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
- bool duplicate = false;
- for (int i = 0; i < *found; i++) {
- if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
- }
-
- if (!duplicate) {
- // Store the interface name
- strncpy(names + (*found)*maxIfNameSize, interface->ifa_name, maxIfNameSize);
- // Store the IP address
- int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
- memset(addrs + *found, '\0', sizeof(*addrs));
- memcpy(addrs + *found, interface->ifa_addr, salen);
- (*found)++;
- }
- }
-
- freeifaddrs(interfaces);
- return ncclSuccess;
-}
-
-static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) {
- /* Check family first */
- int family = local_if.ifa_addr->sa_family;
- if (family != remote->sa.sa_family) {
- return false;
- }
-
- if (family == AF_INET) {
- struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
- struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
- struct sockaddr_in& remote_addr = remote->sin;
- struct in_addr local_subnet, remote_subnet;
- local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
- remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
- return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
- } else if (family == AF_INET6) {
- struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
- struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
- struct sockaddr_in6& remote_addr = remote->sin6;
- struct in6_addr& local_in6 = local_addr->sin6_addr;
- struct in6_addr& mask_in6 = mask->sin6_addr;
- struct in6_addr& remote_in6 = remote_addr.sin6_addr;
- bool same = true;
- int len = 16; //IPv6 address is 16 unsigned char
- for (int c = 0; c < len; c++) { //Network byte order is big-endian
- char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
- char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
- if (c1 ^ c2) {
- same = false;
- break;
- }
- }
- // At last, we need to compare scope id
- // Two Link-type addresses can have the same subnet address even though they are not in the same scope
- // For Global type, this field is 0, so a comparison wouldn't matter
- same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
- return same;
- } else {
- INFO(NCCL_NET, "Net : Unsupported address family type");
- return false;
- }
-}
-
-ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
- union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found) {
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
- char line_a[SOCKET_NAME_MAXLEN+1];
-#endif
- *found = 0;
- struct ifaddrs *interfaces, *interface;
- SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
- for (interface = interfaces; interface && !*found; interface = interface->ifa_next) {
- if (interface->ifa_addr == NULL) continue;
-
- /* We only support IPv4 & IPv6 */
- int family = interface->ifa_addr->sa_family;
- if (family != AF_INET && family != AF_INET6)
- continue;
-
- // check against user specified interfaces
- if (!matchSubnet(*interface, remoteAddr)) {
- continue;
- }
-
- // Store the local IP address
- int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
- memcpy(localAddr, interface->ifa_addr, salen);
-
- // Store the interface name
- strncpy(ifName, interface->ifa_name, ifNameMaxSize);
-
- TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s",
- interface->ifa_name, ncclSocketToString(localAddr, line), ncclSocketToString(remoteAddr, line_a));
- *found = 1;
- }
-
- freeifaddrs(interfaces);
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
- if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
- WARN("Net : string is null");
- return ncclInvalidArgument;
- }
-
- bool ipv6 = ip_port_pair[0] == '[';
- /* Construct the sockaddress structure */
- if (!ipv6) {
- struct netIf ni;
- // parse <ip_or_hostname>:<port> string, expect one pair
- if (parseStringList(ip_port_pair, &ni, 1) != 1) {
- WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
- return ncclInvalidArgument;
- }
-
- struct addrinfo hints, *p;
- int rv;
- memset(&hints, 0, sizeof(hints));
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_STREAM;
-
- if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
- WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
- return ncclInvalidArgument;
- }
-
- // use the first
- if (p->ai_family == AF_INET) {
- struct sockaddr_in& sin = ua->sin;
- memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
- sin.sin_family = AF_INET; // IPv4
- //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
- sin.sin_port = htons(ni.port); // port
- } else if (p->ai_family == AF_INET6) {
- struct sockaddr_in6& sin6 = ua->sin6;
- memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
- sin6.sin6_family = AF_INET6; // IPv6
- sin6.sin6_port = htons(ni.port); // port
- sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
- sin6.sin6_scope_id = 0; // should be global scope, set to 0
- } else {
- WARN("Net : unsupported IP family");
- freeaddrinfo(p);
- return ncclInvalidArgument;
- }
-
- freeaddrinfo(p); // all done with this structure
-
- } else {
- int i, j = -1, len = strlen(ip_port_pair);
- for (i = 1; i < len; i++) {
- if (ip_port_pair[i] == '%') j = i;
- if (ip_port_pair[i] == ']') break;
- }
- if (i == len) {
- WARN("Net : No valid [IPv6]:port pair found");
- return ncclInvalidArgument;
- }
- bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
-
- char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
- memset(ip_str, '\0', sizeof(ip_str));
- memset(port_str, '\0', sizeof(port_str));
- memset(if_name, '\0', sizeof(if_name));
- strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
- strncpy(port_str, ip_port_pair+i+2, len-i-1);
- int port = atoi(port_str);
- if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
-
- struct sockaddr_in6& sin6 = ua->sin6;
- sin6.sin6_family = AF_INET6; // IPv6
- inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
- sin6.sin6_port = htons(port); // port
- sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
- sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
- int* nIfs) {
- static int shownIfName = 0;
- // Allow user to force the INET socket family selection
- int sock_family = envSocketFamily();
- // User specified interface
- const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME");
- *nIfs = 0;
- if (env && strlen(env) > 1) {
- INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
- // Specified by user : find or fail
- if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
- NCCLCHECK(findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
- } else {
- // Try to automatically pick the right one
- // Start with IB
- NCCLCHECK(findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
- // else see if we can get some hint from COMM ID
- if (*nIfs == 0) {
- const char* commId = ncclGetEnv("NCCL_COMM_ID");
- if (commId && strlen(commId) > 1) {
- INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
- // Try to find interface that is in the same subnet as the IP in comm id
- union ncclSocketAddress idAddr;
- NCCLCHECK(ncclSocketGetAddrFromString(&idAddr, commId));
- NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs));
- }
- }
- // Then look for anything else (but not docker or lo)
- if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
- // Finally look for docker, then lo.
- if (*nIfs == 0) NCCLCHECK(findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
- if (*nIfs == 0) NCCLCHECK(findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
- if (sock == NULL) {
- WARN("ncclSocketListen: pass NULL socket");
- return ncclInvalidArgument;
- }
- if (sock->fd == -1) {
- WARN("ncclSocketListen: file descriptor is -1");
- return ncclInvalidArgument;
- }
-
- if (socketToPort(&sock->addr)) {
- // Port is forced by env. Make sure we get the port.
- int opt = 1;
- SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
-#if defined(SO_REUSEPORT)
- SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
-#endif
- }
-
- // addr port should be 0 (Any port)
- SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind");
-
- /* Get the assigned Port */
- socklen_t size = sock->salen;
- SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname");
-
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
- TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line));
-#endif
-
- /* Put the socket in listen mode
- * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
- */
- SYSCHECK(listen(sock->fd, 16384), "listen");
- sock->state = ncclSocketStateReady;
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) {
- if (sock == NULL) {
- WARN("ncclSocketGetAddr: pass NULL socket");
- return ncclInvalidArgument;
- }
- if (sock->state != ncclSocketStateReady) return ncclInternalError;
- memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress));
- return ncclSuccess;
-}
-
-static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
- socklen_t socklen = sizeof(union ncclSocketAddress);
- sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
- if (sock->fd != -1) {
- sock->state = ncclSocketStateAccepted;
- } else if (errno == ENETDOWN || errno == EPROTO || errno == ENOPROTOOPT || errno == EHOSTDOWN ||
- errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH ||
- errno == EINTR) {
- /* per accept's man page, for linux sockets, the following errors might be already pending errors
- * and should be considered as EAGAIN. To avoid infinite loop in case of errors, we use the retry count*/
- if (++sock->errorRetries == ncclParamRetryCnt()) {
- WARN("socketTryAccept: exceeded error retry count after %d attempts, %s", sock->errorRetries, strerror(errno));
- return ncclSystemError;
- }
- INFO(NCCL_NET|NCCL_INIT, "Call to accept returned %s, retrying", strerror(errno));
- } else if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
- WARN("socketTryAccept: Accept failed: %s", strerror(errno));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1);
-NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1);
-
-static ncclResult_t socketSetFlags(struct ncclSocket* sock) {
- const int one = 1;
- /* Set socket as non-blocking if async or if we need to be able to abort */
- if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
- int flags;
- SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl");
- SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
- }
- SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt TCP NODELAY");
- // setsockopt should not fail even if the sizes are too large, do not change the default if unset by the user (=-1)
- int rcvBuf = ncclParamSocketMaxRecvBuff(), sndBuf = ncclParamSocketMaxSendBuff();
- if (sndBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF");
- if (rcvBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF");
- return ncclSuccess;
-}
-
-static void socketResetAccept(struct ncclSocket* sock) {
- char line[SOCKET_NAME_MAXLEN+1];
- INFO(NCCL_NET|NCCL_INIT, "socketFinalizeAccept: didn't receive a valid magic from %s",
- ncclSocketToString(&sock->addr, line));
- // Ignore spurious connection and accept again
- (void)close(sock->fd);
- sock->fd = -1;
- sock->state = ncclSocketStateAccepting;
- sock->finalizeCounter = 0;
-}
-
-static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
- uint64_t magic;
- enum ncclSocketType type;
- int received;
- char line[SOCKET_NAME_MAXLEN+1];
- // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do)
- NCCLCHECK(socketSetFlags(sock));
-
- if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) {
- if (sock->asyncFlag == 0) {
- received = 0;
- if (socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received) != ncclSuccess) {
- socketResetAccept(sock);
- return ncclSuccess;
- }
- } else {
- int closed = 0;
- received = sock->finalizeCounter;
- NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received, &closed));
- sock->finalizeCounter = received;
- if (received < sizeof(magic)) {
- if (closed) {
- socketResetAccept(sock);
- }
- return ncclSuccess;
- }
- memcpy(&magic, sock->finalizeBuffer, sizeof(magic));
- }
- if (magic != sock->magic) {
- socketResetAccept(sock);
- return ncclSuccess;
- }
- }
- if (sock->asyncFlag == 0) {
- received = 0;
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
- } else {
- received = sock->finalizeCounter - sizeof(magic);
- NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(type), &received));
- sock->finalizeCounter = received + sizeof(magic);
- if (received < sizeof(type)) return ncclSuccess;
- memcpy(&type, sock->finalizeBuffer, sizeof(type));
- }
- if (type != sock->type) {
- WARN("socketFinalizeAccept from %s: wrong type %d != %d", ncclSocketToString(&sock->addr, line), type, sock->type);
- sock->state = ncclSocketStateError;
- close(sock->fd);
- sock->fd = -1;
- return ncclInternalError;
- } else {
- sock->state = ncclSocketStateReady;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t socketResetFd(struct ncclSocket* sock) {
- ncclResult_t ret = ncclSuccess;
- int fd = -1;
- SYSCHECKGOTO(fd = socket(sock->addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, cleanup);
- // if sock->fd is valid, close it and reuse its number
- if (sock->fd != -1) {
- SYSCHECKGOTO(dup2(fd, sock->fd), "dup2", ret, cleanup);
- SYSCHECKGOTO(close(fd), "close", ret, cleanup);
- } else {
- sock->fd = fd;
- }
- NCCLCHECKGOTO(socketSetFlags(sock), ret, exit);
-exit:
- return ret;
-cleanup:
- // cleanup fd, leave sock->fd untouched
- if (fd != -1) {
- (void)close(fd);
- }
- goto exit;
-}
-
-static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) {
- char line[SOCKET_NAME_MAXLEN+1];
- if (errCode == 0) {
- sock->state = ncclSocketStateConnected;
- } else if (errCode == EINPROGRESS) {
- sock->state = ncclSocketStateConnectPolling;
- } else if (errCode == EINTR || errCode == EWOULDBLOCK || errCode == EAGAIN || errCode == ETIMEDOUT ||
- errCode == EHOSTUNREACH || errCode == ECONNREFUSED) {
- if (sock->customRetry == 0) {
- if (sock->errorRetries++ == ncclParamRetryCnt()) {
- sock->state = ncclSocketStateError;
- WARN("%s: connect to %s returned %s, exceeded error retry count after %d attempts",
- funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), sock->errorRetries);
- return ncclRemoteError;
- }
- unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut();
- INFO(NCCL_NET|NCCL_INIT, "%s: connect to %s returned %s, retrying (%d/%ld) after sleep for %u msec",
- funcName, ncclSocketToString(&sock->addr, line), strerror(errCode),
- sock->errorRetries, ncclParamRetryCnt(), sleepTime);
- msleep(sleepTime);
- }
- NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */
- sock->state = ncclSocketStateConnecting;
- } else {
- sock->state = ncclSocketStateError;
- WARN("%s: connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
- /* blocking/non-blocking connect() is determined by asyncFlag. */
- int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
- return socketConnectCheck(sock, (ret == -1) ? errno : 0, __func__);
-}
-
-static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
- struct pollfd pfd;
- int timeout = 1, ret;
- socklen_t rlen = sizeof(int);
- char line[SOCKET_NAME_MAXLEN+1];
-
- memset(&pfd, 0, sizeof(struct pollfd));
- pfd.fd = sock->fd;
- pfd.events = POLLOUT;
- ret = poll(&pfd, 1, timeout);
-
- if (ret == 0 || (ret < 0 && errno == EINTR)) {
- return ncclSuccess;
- } else if (ret < 0) {
- WARN("socketPollConnect to %s failed with error %s", ncclSocketToString(&sock->addr, line), strerror(errno));
- return ncclSystemError;
- }
-
- /* check socket status */
- SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
- return socketConnectCheck(sock, ret, __func__);
-}
-
-ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
- if (sock == NULL) {
- WARN("ncclSocketPollConnect: pass NULL socket");
- return ncclInvalidArgument;
- }
- NCCLCHECK(socketPollConnect(sock));
- return ncclSuccess;
-}
-
-static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) {
- int sent;
- if (sock->asyncFlag == 0) {
- sent = 0;
- NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
- sent = 0;
- NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
- } else {
- if (sock->finalizeCounter < sizeof(sock->magic)) {
- sent = sock->finalizeCounter;
- NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
- sock->finalizeCounter = sent;
- if (sent < sizeof(sock->magic)) return ncclSuccess;
- }
- sent = sock->finalizeCounter - sizeof(sock->magic);
- NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
- sock->finalizeCounter = sent + sizeof(sock->magic);
- if (sent < sizeof(sock->type)) return ncclSuccess;
- }
- sock->state = ncclSocketStateReady;
- return ncclSuccess;
-}
-
-static ncclResult_t socketProgressState(struct ncclSocket* sock) {
- if (sock->state == ncclSocketStateAccepting) {
- NCCLCHECK(socketTryAccept(sock));
- }
- if (sock->state == ncclSocketStateAccepted) {
- NCCLCHECK(socketFinalizeAccept(sock));
- }
- if (sock->state == ncclSocketStateConnecting) {
- NCCLCHECK(socketStartConnect(sock));
- }
- if (sock->state == ncclSocketStateConnectPolling) {
- NCCLCHECK(socketPollConnect(sock));
- }
- if (sock->state == ncclSocketStateConnected) {
- NCCLCHECK(socketFinalizeConnect(sock));
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
- if (sock == NULL) {
- *running = 0;
- return ncclSuccess;
- }
- if (sock->state == ncclSocketStateError || sock->state == ncclSocketStateClosed) {
- WARN("ncclSocketReady: unexpected socket state %d", sock->state);
- return ncclRemoteError;
- }
- *running = (sock->state == ncclSocketStateReady) ? 1 : 0;
- if (*running == 0) {
- NCCLCHECK(socketProgressState(sock));
- *running = (sock->state == ncclSocketStateReady) ? 1 : 0;
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
-#endif
-
- if (sock == NULL) {
- WARN("ncclSocketConnect: pass NULL socket");
- return ncclInvalidArgument;
- }
- if (sock->fd == -1) {
- WARN("ncclSocketConnect: file descriptor is -1");
- return ncclInvalidArgument;
- }
-
- if (sock->state != ncclSocketStateInitialized) {
- WARN("ncclSocketConnect: wrong socket state %d", sock->state);
- if (sock->state == ncclSocketStateError) return ncclRemoteError;
- return ncclInternalError;
- }
- TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
-
- sock->state = ncclSocketStateConnecting;
- sock->finalizeCounter = 0;
- do {
- NCCLCHECK(socketProgressState(sock));
- } while (sock->asyncFlag == 0 &&
- (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) &&
- (sock->state == ncclSocketStateConnecting ||
- sock->state == ncclSocketStateConnectPolling ||
- sock->state == ncclSocketStateConnected));
-
- if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
-
- switch (sock->state) {
- case ncclSocketStateConnecting:
- case ncclSocketStateConnectPolling:
- case ncclSocketStateConnected:
- case ncclSocketStateReady:
- return ncclSuccess;
- case ncclSocketStateError:
- return ncclSystemError;
- default:
- WARN("ncclSocketConnect: wrong socket state %d", sock->state);
- return ncclInternalError;
- }
-}
-
-ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) {
- ncclResult_t ret = ncclSuccess;
-
- if (listenSock == NULL || sock == NULL) {
- WARN("ncclSocketAccept: pass NULL socket");
- ret = ncclInvalidArgument;
- goto exit;
- }
- if (listenSock->state != ncclSocketStateReady) {
- WARN("ncclSocketAccept: wrong socket state %d", listenSock->state);
- if (listenSock->state == ncclSocketStateError)
- ret = ncclSystemError;
- else
- ret = ncclInternalError;
- goto exit;
- }
-
- if (sock->acceptFd == -1) {
- memcpy(sock, listenSock, sizeof(struct ncclSocket));
- sock->acceptFd = listenSock->fd;
- sock->state = ncclSocketStateAccepting;
- sock->finalizeCounter = 0;
- }
-
- do {
- NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
- } while (sock->asyncFlag == 0 &&
- (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) &&
- (sock->state == ncclSocketStateAccepting ||
- sock->state == ncclSocketStateAccepted));
-
- if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
-
- switch (sock->state) {
- case ncclSocketStateAccepting:
- case ncclSocketStateAccepted:
- case ncclSocketStateReady:
- ret = ncclSuccess;
- break;
- case ncclSocketStateError:
- ret = ncclSystemError;
- break;
- default:
- WARN("ncclSocketAccept: wrong socket state %d", sock->state);
- ret = ncclInternalError;
- break;
- }
-
-exit:
- return ret;
-}
-
-ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry) {
- ncclResult_t ret = ncclSuccess;
-
- if (sock == NULL) goto exit;
- sock->errorRetries = 0;
- sock->abortFlag = abortFlag;
- sock->asyncFlag = asyncFlag;
- sock->state = ncclSocketStateInitialized;
- sock->magic = magic;
- sock->type = type;
- sock->fd = -1;
- sock->acceptFd = -1;
- sock->customRetry = customRetry;
-
- if (addr) {
- /* IPv4/IPv6 support */
- int family;
- memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
- family = sock->addr.sa.sa_family;
- if (family != AF_INET && family != AF_INET6) {
- char line[SOCKET_NAME_MAXLEN+1];
- WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
- ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
- ret = ncclInternalError;
- goto exit;
- }
- sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
- // in case of error, we close the fd before returning as it's unclear if the caller has to use ncclSocketClose for cleanup
- NCCLCHECKGOTO(socketResetFd(sock), ret, fail);
- } else {
- memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
- }
-exit:
- return ret;
-fail:
- if (sock->fd != -1) {
- close(sock->fd);
- sock->fd = -1;
- }
- goto exit;
-}
-
-ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed) {
- if (sock == NULL) {
- WARN("ncclSocketProgress: pass NULL socket");
- return ncclInvalidArgument;
- }
- NCCLCHECK(socketProgress(op, sock, ptr, size, offset, closed));
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
- if (sock == NULL) {
- WARN("ncclSocketWait: pass NULL socket");
- return ncclInvalidArgument;
- }
- NCCLCHECK(socketWait(op, sock, ptr, size, offset));
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
- int offset = 0;
- if (sock == NULL) {
- WARN("ncclSocketSend: pass NULL socket");
- return ncclInvalidArgument;
- }
- if (sock->state != ncclSocketStateReady) {
- WARN("ncclSocketSend: socket state (%d) is not ready", sock->state);
- return ncclInternalError;
- }
- NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
- int offset = 0;
- if (sock == NULL) {
- WARN("ncclSocketRecv: pass NULL socket");
- return ncclInvalidArgument;
- }
- if (sock->state != ncclSocketStateReady && sock->state != ncclSocketStateTerminating) {
- WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
- return ncclInternalError;
- }
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize) {
- int sendOffset = 0, recvOffset = 0;
- if (sendSock == NULL || recvSock == NULL) {
- WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock);
- return ncclInternalError;
- }
- if (sendSock->state != ncclSocketStateReady ||
- (recvSock->state != ncclSocketStateReady && recvSock->state != ncclSocketStateTerminating)) {
- WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state);
- return ncclInternalError;
- }
- while (sendOffset < sendSize || recvOffset < recvSize) {
- if (sendOffset < sendSize) NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sendSock, sendPtr, sendSize, &sendOffset));
- if (recvOffset < recvSize) NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, recvSock, recvPtr, recvSize, &recvOffset));
- }
- return ncclSuccess;
-}
-
-
-// Receive or detect connection closed
-ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
- int offset = 0;
- if (sock == NULL) {
- WARN("ncclSocketTryRecv: pass NULL socket");
- return ncclInvalidArgument;
- }
- *closed = 0;
- // Block until connection closes or nbytes received
- if (blocking) {
- while (offset < size) {
- NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
- if (*closed) return ncclSuccess;
- }
- } else {
- NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
- if (*closed) return ncclSuccess;
-
- // If any bytes were received, block waiting for the rest
- if (offset > 0) {
- while (offset < size) {
- NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
- if (*closed) return ncclSuccess;
- }
- // No bytes were received, return ncclInProgress
- } else {
- return ncclInProgress;
- }
- }
- return ncclSuccess;
-}
-
-// Make it possible to close just one part of a socket.
-ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
- if (sock != NULL) {
- if (sock->fd >= 0) {
- SYSCHECK(shutdown(sock->fd, how), "shutdown");
- }
- sock->state = ncclSocketStateTerminating;
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) {
- if (sock != NULL) {
- if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) {
- if (wait) {
- char data;
- int closed = 0;
- do {
- int offset = 0;
- if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break;
- } while (closed == 0);
- }
- /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected
- * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
- * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
- * connection close here. */
- (void)shutdown(sock->fd, SHUT_RDWR);
- (void)close(sock->fd);
- }
- sock->state = ncclSocketStateClosed;
- sock->fd = -1;
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) {
- if (sock == NULL) {
- WARN("ncclSocketGetFd: pass NULL socket");
- return ncclInvalidArgument;
- }
- if (fd) *fd = sock->fd;
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) {
- if (sock == NULL) {
- WARN("ncclSocketGetFd: pass NULL socket");
- return ncclInvalidArgument;
- }
- sock->fd = fd;
- return ncclSuccess;
-}
+ #include "socket.h"
+ #include "utils.h"
+ #include <stdlib.h>
+
+ #include <unistd.h>
+ #include <ifaddrs.h>
+ #include <net/if.h>
+ #include "param.h"
+ #include <time.h>
+
+ // Retry budget: socketTryAccept() and socketConnectCheck() give up once sock->errorRetries reaches this count.
+ NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34);
+ // Back-off unit in milliseconds: socketConnectCheck() sleeps (errorRetries * this value) before retrying.
+ NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100);
+ // Sleep for time_msec milliseconds. The nanosleep() result is ignored, so an
+ // interrupted sleep is not resumed.
+ static void msleep(unsigned int time_msec) {
+   const long nsecPerMsec = 1000000L;
+   struct timespec delay;
+   delay.tv_sec = time_msec / 1000;
+   delay.tv_nsec = (time_msec % 1000) * nsecPerMsec;
+   nanosleep(&delay, NULL);
+ }
+
+ // Push one send or receive of the `size`-byte buffer `ptr` forward, starting at byte *offset.
+ //   op      NCCL_SOCKET_RECV or NCCL_SOCKET_SEND
+ //   offset  in/out: bytes already transferred; advanced by whatever this call moves
+ //   block   non-zero drops MSG_DONTWAIT so the underlying recv()/send() may block
+ //   closed  out: set to 1 when the peer closed the connection (recv() returned 0, send() hit EPIPE,
+ //           or recv() hit ECONNRESET); ncclSuccess is returned in that case
+ // Returns ncclRemoteError on any other hard socket error and ncclInternalError when the abort flag is set.
+ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
+ int bytes = 0;
+ *closed = 0;
+ char* data = (char*)ptr;
+ char line[SOCKET_NAME_MAXLEN+1];
+ do {
+ if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+ if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
+ // A zero-byte recv() is reported as a closed connection.
+ if (op == NCCL_SOCKET_RECV && bytes == 0) {
+ *closed = 1;
+ return ncclSuccess;
+ }
+ if (bytes == -1) {
+ if ((op == NCCL_SOCKET_SEND && errno == EPIPE) || (op == NCCL_SOCKET_RECV && errno == ECONNRESET)) {
+ *closed = 1;
+ return ncclSuccess;
+ }
+ if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
+ WARN("socketProgressOpt: Call to %s %s failed : %s", (op == NCCL_SOCKET_RECV ? "recv from" : "send to"),
+ ncclSocketToString(&sock->addr, line), strerror(errno));
+ return ncclRemoteError;
+ } else {
+ // EINTR / EWOULDBLOCK / EAGAIN: nothing moved this round, not an error.
+ bytes = 0;
+ }
+ }
+ (*offset) += bytes;
+ if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) {
+ INFO(NCCL_NET, "socketProgressOpt: abort called");
+ return ncclInternalError;
+ }
+ // Only non-async sockets keep looping, and only while the last call moved data and bytes remain.
+ } while (sock->asyncFlag == 0 && bytes > 0 && (*offset) < size);
+ return ncclSuccess;
+ }
+
+ // Non-blocking progress step on top of socketProgressOpt().
+ // When the peer has closed the connection: if pclosed is provided it receives the
+ // closed flag and ncclSuccess is returned; otherwise the closure is reported as
+ // ncclRemoteError. *pclosed is left untouched when the connection is still open.
+ static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* pclosed = NULL) {
+   int peerClosed;
+   NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &peerClosed));
+   if (!peerClosed) return ncclSuccess;
+
+   if (pclosed != NULL) {
+     *pclosed = peerClosed;
+     return ncclSuccess;
+   }
+
+   char line[SOCKET_NAME_MAXLEN+1];
+   WARN("socketProgress: Connection closed by remote peer %s",
+        ncclSocketToString(&sock->addr, line, /*numericHostForm*/0));
+   return ncclRemoteError;
+ }
+
+ // Keep calling socketProgress() until all `size` bytes have been transferred
+ // (*offset reaches size) or an error is returned.
+ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+   for (;;) {
+     if (*offset >= size) break;
+     NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
+   }
+   return ncclSuccess;
+ }
+
+ /* Render a (union ncclSocketAddress *) as text via getnameinfo().
+  *
+  * Output written to buf: "IPv4/IPv6 address<port>". On any failure (NULL addr,
+  * unsupported family, getnameinfo() error) buf is set to the empty string.
+  * Returns buf (which is NULL when the caller passed NULL).
+  */
+ const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
+   char host[NI_MAXHOST], service[NI_MAXSERV];
+   /* NI_NUMERICHOST: return the numeric form of the host. When not set, the numeric
+    * form is still returned if the node's name cannot be determined. */
+   const int niFlags = numericHostForm ? (NI_NUMERICSERV | NI_NUMERICHOST) : NI_NUMERICSERV;
+
+   bool ok = (buf != NULL && addr != NULL);
+   if (ok) {
+     const int family = addr->sa.sa_family;
+     ok = (family == AF_INET || family == AF_INET6);
+   }
+   if (ok) {
+     ok = (getnameinfo(&addr->sa, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, niFlags) == 0);
+   }
+
+   if (ok) {
+     sprintf(buf, "%s<%s>", host, service);
+   } else if (buf != NULL) {
+     buf[0] = '\0';
+   }
+   return buf;
+ }
+
+ // Return the port of `addr` in host byte order. Any family other than AF_INET
+ // is read through the IPv6 view of the union.
+ static uint16_t socketToPort(union ncclSocketAddress *addr) {
+   const bool isIpv4 = (addr->sa.sa_family == AF_INET);
+   const uint16_t netOrderPort = isIpv4 ? addr->sin.sin_port : addr->sin6.sin6_port;
+   return ntohs(netOrderPort);
+ }
+
+ /* Read NCCL_SOCKET_FAMILY so the user can force IPv4 or IPv6 interface selection.
+  * Returns AF_INET for "AF_INET", AF_INET6 for "AF_INET6", and -1 (no forced
+  * family, first one found is used) when the variable is unset or has any other value. */
+ static int envSocketFamily(void) {
+   const char* requested = ncclGetEnv("NCCL_SOCKET_FAMILY");
+   if (requested == NULL) return -1;
+
+   INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", requested);
+
+   if (strcmp(requested, "AF_INET") == 0) return AF_INET;    // IPv4
+   if (strcmp(requested, "AF_INET6") == 0) return AF_INET6;  // IPv6
+   return -1;
+ }
+
+ // Enumerate local IPv4/IPv6 interfaces whose names match prefixList and record up to maxIfs of them.
+ //   prefixList     interface name list handed to parseStringList(); a leading '^' inverts the
+ //                  match and a leading '=' (after the optional '^') requests exact matching
+ //   names          out: maxIfs slots of maxIfNameSize bytes each, one NUL-terminated name per slot
+ //   addrs          out: one address per recorded interface (zero-filled beyond the sockaddr)
+ //   sock_family    AF_INET / AF_INET6 to restrict the family, or -1 for no restriction
+ //   found          out: number of interfaces recorded
+ // IPv6 loopback addresses are skipped, and only the first address of each interface name is kept.
+ static ncclResult_t findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family,
+     int maxIfNameSize, int maxIfs, int* found) {
+ #ifdef ENABLE_TRACE
+   char line[SOCKET_NAME_MAXLEN+1];
+ #endif
+   struct netIf userIfs[MAX_IFS];
+   bool searchNot = prefixList && prefixList[0] == '^';
+   if (searchNot) prefixList++;
+   bool searchExact = prefixList && prefixList[0] == '=';
+   if (searchExact) prefixList++;
+   int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
+
+   *found = 0;
+   struct ifaddrs *interfaces, *interface;
+   SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
+   for (interface = interfaces; interface && *found < maxIfs; interface = interface->ifa_next) {
+     if (interface->ifa_addr == NULL) continue;
+
+     /* We only support IPv4 & IPv6 */
+     int family = interface->ifa_addr->sa_family;
+     if (family != AF_INET && family != AF_INET6)
+       continue;
+
+     TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line));
+
+     /* Allow the caller to force the socket family type */
+     if (sock_family != -1 && family != sock_family)
+       continue;
+
+     /* We also need to skip IPv6 loopback interfaces */
+     if (family == AF_INET6) {
+       struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
+       if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+     }
+
+     // check against user specified interfaces
+     if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
+       continue;
+     }
+
+     // Check that this interface has not already been saved
+     // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
+     bool duplicate = false;
+     for (int i = 0; i < *found; i++) {
+       if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
+     }
+
+     if (!duplicate) {
+       // Store the interface name. strncpy() does not terminate the destination when the
+       // source fills the slot, and the slot is later read with strcmp(), so terminate it explicitly.
+       char* slot = names + (*found)*maxIfNameSize;
+       strncpy(slot, interface->ifa_name, maxIfNameSize);
+       if (maxIfNameSize > 0) slot[maxIfNameSize-1] = '\0';
+       // Store the IP address
+       int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+       memset(addrs + *found, '\0', sizeof(*addrs));
+       memcpy(addrs + *found, interface->ifa_addr, salen);
+       (*found)++;
+     }
+   }
+
+   freeifaddrs(interfaces);
+   return ncclSuccess;
+ }
+
+ // Return true when `remote` belongs to the same subnet as the local interface entry local_if.
+ // Both addresses are masked with the interface's netmask and compared; for IPv6 the scope ids
+ // must match as well. Entries without an address or netmask, mismatching families and
+ // unsupported families all yield false.
+ static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) {
+   // getifaddrs() entries may carry no address or no netmask; such entries cannot be compared.
+   if (local_if.ifa_addr == NULL || local_if.ifa_netmask == NULL || remote == NULL) {
+     return false;
+   }
+
+   /* Check family first */
+   int family = local_if.ifa_addr->sa_family;
+   if (family != remote->sa.sa_family) {
+     return false;
+   }
+
+   if (family == AF_INET) {
+     struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
+     struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
+     struct sockaddr_in& remote_addr = remote->sin;
+     struct in_addr local_subnet, remote_subnet;
+     local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
+     remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
+     return local_subnet.s_addr == remote_subnet.s_addr;
+   } else if (family == AF_INET6) {
+     struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
+     struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
+     struct sockaddr_in6& remote_addr = remote->sin6;
+     struct in6_addr& local_in6 = local_addr->sin6_addr;
+     struct in6_addr& mask_in6 = mask->sin6_addr;
+     struct in6_addr& remote_in6 = remote_addr.sin6_addr;
+     bool same = true;
+     const int len = 16; // an IPv6 address is 16 bytes
+     for (int c = 0; c < len; c++) { // Network byte order is big-endian
+       // Compare raw bytes as unsigned to stay independent of plain char signedness.
+       unsigned char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
+       unsigned char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
+       if (c1 != c2) {
+         same = false;
+         break;
+       }
+     }
+     // At last, we need to compare scope id
+     // Two Link-type addresses can have the same subnet address even though they are not in the same scope
+     // For Global type, this field is 0, so a comparison wouldn't matter
+     same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
+     return same;
+   } else {
+     INFO(NCCL_NET, "Net : Unsupported address family type");
+     return false;
+   }
+ }
+
+ // Find the first local IPv4/IPv6 interface that is in the same subnet as remoteAddr.
+ //   ifName         out: NUL-terminated interface name (buffer of ifNameMaxSize bytes)
+ //   localAddr      out: address of the matching interface (zero-filled beyond the sockaddr)
+ //   remoteAddr     address to match against
+ //   found          out: 1 when an interface was found, 0 otherwise
+ // Returns ncclSuccess whether or not a match exists; only getifaddrs() failure is an error.
+ ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
+     union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found) {
+ #ifdef ENABLE_TRACE
+   char line[SOCKET_NAME_MAXLEN+1];
+   char line_a[SOCKET_NAME_MAXLEN+1];
+ #endif
+   *found = 0;
+   struct ifaddrs *interfaces, *interface;
+   SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
+   for (interface = interfaces; interface && !*found; interface = interface->ifa_next) {
+     if (interface->ifa_addr == NULL) continue;
+
+     /* We only support IPv4 & IPv6 */
+     int family = interface->ifa_addr->sa_family;
+     if (family != AF_INET && family != AF_INET6)
+       continue;
+
+     // skip interfaces that are not in the remote address' subnet
+     if (!matchSubnet(*interface, remoteAddr)) {
+       continue;
+     }
+
+     // Store the local IP address; clear the union first so the bytes past an IPv4
+     // sockaddr are not left uninitialized (same as findInterfaces()).
+     int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+     memset(localAddr, '\0', sizeof(*localAddr));
+     memcpy(localAddr, interface->ifa_addr, salen);
+
+     // Store the interface name; strncpy() alone does not guarantee NUL termination.
+     strncpy(ifName, interface->ifa_name, ifNameMaxSize);
+     if (ifNameMaxSize > 0) ifName[ifNameMaxSize-1] = '\0';
+
+     TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s",
+         interface->ifa_name, ncclSocketToString(localAddr, line), ncclSocketToString(remoteAddr, line_a));
+     *found = 1;
+   }
+
+   freeifaddrs(interfaces);
+   return ncclSuccess;
+ }
+
+ // Parse "<IPv4_or_hostname>:<port>" or "[<IPv6>[%<ifname>]]:<port>" into *ua.
+ //   ua            out: filled as sockaddr_in or sockaddr_in6 depending on the input
+ //   ip_port_pair  text to parse; a leading '[' selects the IPv6 literal form
+ // Hostnames are resolved with getaddrinfo() and the first result is used. For an IPv6
+ // literal with "%<ifname>", the scope id is the index of that interface.
+ // Returns ncclInvalidArgument when the string is missing, malformed, has over-long
+ // components, or cannot be resolved/converted.
+ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
+   if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
+     WARN("Net : string is null");
+     return ncclInvalidArgument;
+   }
+
+   bool ipv6 = ip_port_pair[0] == '[';
+   /* Construct the sockaddress structure */
+   if (!ipv6) {
+     struct netIf ni;
+     // parse <ip_or_hostname>:<port> string, expect one pair
+     if (parseStringList(ip_port_pair, &ni, 1) != 1) {
+       WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
+       return ncclInvalidArgument;
+     }
+
+     struct addrinfo hints, *p;
+     int rv;
+     memset(&hints, 0, sizeof(hints));
+     hints.ai_family = AF_UNSPEC;
+     hints.ai_socktype = SOCK_STREAM;
+
+     if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
+       WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
+       return ncclInvalidArgument;
+     }
+
+     // use the first
+     if (p->ai_family == AF_INET) {
+       struct sockaddr_in& sin = ua->sin;
+       memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
+       sin.sin_family = AF_INET;                        // IPv4
+       sin.sin_port = htons(ni.port);                   // port
+     } else if (p->ai_family == AF_INET6) {
+       struct sockaddr_in6& sin6 = ua->sin6;
+       memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
+       sin6.sin6_family = AF_INET6;                     // IPv6
+       sin6.sin6_port = htons(ni.port);                 // port
+       sin6.sin6_flowinfo = 0;                          // needed by IPv6, but possibly obsolete
+       sin6.sin6_scope_id = 0;                          // should be global scope, set to 0
+     } else {
+       WARN("Net : unsupported IP family");
+       freeaddrinfo(p);
+       return ncclInvalidArgument;
+     }
+
+     freeaddrinfo(p); // all done with this structure
+
+   } else {
+     // Locate the closing ']' (index i) and the optional '%' scope separator (index j).
+     int i, j = -1, len = strlen(ip_port_pair);
+     for (i = 1; i < len; i++) {
+       if (ip_port_pair[i] == '%') j = i;
+       if (ip_port_pair[i] == ']') break;
+     }
+     if (i == len) {
+       WARN("Net : No valid [IPv6]:port pair found");
+       return ncclInvalidArgument;
+     }
+     bool global_scope = (j == -1); // If no % found, global scope; otherwise, link scope
+
+     char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
+     memset(ip_str, '\0', sizeof(ip_str));
+     memset(port_str, '\0', sizeof(port_str));
+     memset(if_name, '\0', sizeof(if_name));
+
+     // Component lengths: address is between '[' and '%' or ']'; the port follows "]:".
+     int ipLen = (global_scope ? i : j) - 1;
+     int portLen = len - i - 2;
+     if (portLen < 0) portLen = 0; // nothing after ']' : port parses as 0, as before
+     int ifLen = global_scope ? 0 : i - j - 1;
+     // The string may come from the environment: refuse components that would not fit
+     // (with their terminating NUL) instead of overflowing the stack buffers.
+     if (ipLen >= (int)sizeof(ip_str) || portLen >= (int)sizeof(port_str) || ifLen >= (int)sizeof(if_name)) {
+       WARN("Net : No valid [IPv6]:port pair found");
+       return ncclInvalidArgument;
+     }
+     memcpy(ip_str, ip_port_pair+1, ipLen);
+     if (portLen > 0) memcpy(port_str, ip_port_pair+i+2, portLen);
+     int port = atoi(port_str);
+     if (ifLen > 0) memcpy(if_name, ip_port_pair+j+1, ifLen); // If not global scope, we need the intf name
+
+     struct sockaddr_in6& sin6 = ua->sin6;
+     sin6.sin6_family = AF_INET6;                       // IPv6
+     // inet_pton() returns 1 only on success; otherwise sin6_addr would be left unset.
+     if (inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)) != 1) {
+       WARN("Net : invalid IPv6 address in [IPv6]:port pair");
+       return ncclInvalidArgument;
+     }
+     sin6.sin6_port = htons(port);                      // port
+     sin6.sin6_flowinfo = 0;                            // needed by IPv6, but possibly obsolete
+     sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name);  // 0 if global scope; intf index if link scope
+   }
+   return ncclSuccess;
+ }
+
+ // Select the network interfaces to use and report them through ifNames/ifAddrs/nIfs.
+ // When NCCL_SOCKET_IFNAME is set (longer than one character) only that list is searched.
+ // Otherwise the search order is: "ib" interfaces, an interface in the same subnet as
+ // NCCL_COMM_ID (if set), anything except docker/lo, then docker, then lo.
+ // NCCL_SOCKET_FAMILY may restrict the address family for every findInterfaces() search.
+ ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
+     int* nIfs) {
+   static int shownIfName = 0;
+   // Allow user to force the INET socket family selection
+   const int forcedFamily = envSocketFamily();
+   const char* userIfList = ncclGetEnv("NCCL_SOCKET_IFNAME");
+   *nIfs = 0;
+
+   // Specified by user : find or fail
+   if (userIfList != NULL && strlen(userIfList) > 1) {
+     INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", userIfList);
+     if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", userIfList);
+     NCCLCHECK(findInterfaces(userIfList, ifNames, ifAddrs, forcedFamily, ifNameMaxSize, maxIfs, nIfs));
+     return ncclSuccess;
+   }
+
+   // Automatic selection, starting with IB
+   NCCLCHECK(findInterfaces("ib", ifNames, ifAddrs, forcedFamily, ifNameMaxSize, maxIfs, nIfs));
+
+   // Next, try to take a hint from the communicator id
+   if (*nIfs == 0) {
+     const char* commId = ncclGetEnv("NCCL_COMM_ID");
+     if (commId != NULL && strlen(commId) > 1) {
+       INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
+       // Look for an interface in the same subnet as the IP in comm id
+       union ncclSocketAddress idAddr;
+       NCCLCHECK(ncclSocketGetAddrFromString(&idAddr, commId));
+       NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs));
+     }
+   }
+
+   // Remaining fallbacks, tried in order until one yields an interface:
+   // anything but docker/lo, then docker, then lo.
+   static const char* const fallbackLists[] = { "^docker,lo", "docker", "lo" };
+   for (int k = 0; k < 3 && *nIfs == 0; k++) {
+     NCCLCHECK(findInterfaces(fallbackLists[k], ifNames, ifAddrs, forcedFamily, ifNameMaxSize, maxIfs, nIfs));
+   }
+   return ncclSuccess;
+ }
+
+ // Bind `sock` to sock->addr, read back the address actually assigned, and put the
+ // socket in listen mode. On success the socket state becomes ncclSocketStateReady.
+ // Returns ncclInvalidArgument for a NULL socket or one without a file descriptor.
+ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
+   if (sock == NULL) {
+     WARN("ncclSocketListen: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (sock->fd == -1) {
+     WARN("ncclSocketListen: file descriptor is -1");
+     return ncclInvalidArgument;
+   }
+
+   const bool portForced = (socketToPort(&sock->addr) != 0);
+   if (portForced) {
+     // Port is forced by env. Make sure we get the port.
+     int reuse = 1;
+     SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)), "setsockopt");
+ #if defined(SO_REUSEPORT)
+     SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, &reuse, sizeof(reuse)), "setsockopt");
+ #endif
+   }
+
+   // addr port should be 0 (Any port)
+   SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind");
+
+   /* Get the assigned Port */
+   socklen_t boundLen = sock->salen;
+   SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &boundLen), "getsockname");
+
+ #ifdef ENABLE_TRACE
+   char line[SOCKET_NAME_MAXLEN+1];
+   TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line));
+ #endif
+
+   /* Put the socket in listen mode
+    * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+    */
+   SYSCHECK(listen(sock->fd, 16384), "listen");
+   sock->state = ncclSocketStateReady;
+   return ncclSuccess;
+ }
+
+ // Copy the address stored in `sock` into *addr.
+ // Returns ncclInvalidArgument for a NULL socket and ncclInternalError when the
+ // socket is not in the ncclSocketStateReady state.
+ ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) {
+   if (sock == NULL) {
+     WARN("ncclSocketGetAddr: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (sock->state == ncclSocketStateReady) {
+     memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress));
+     return ncclSuccess;
+   }
+   return ncclInternalError;
+ }
+
+ // Try once to accept a connection on sock->acceptFd.
+ // On success sock->fd holds the new descriptor, sock->addr the peer address and the state
+ // becomes ncclSocketStateAccepted. EAGAIN/EWOULDBLOCK leave the state unchanged so the
+ // caller can try again. The transient network errors listed below (and EINTR) are retried
+ // until sock->errorRetries reaches ncclParamRetryCnt(); any other error, or an exhausted
+ // retry budget, returns ncclSystemError.
+ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
+   socklen_t socklen = sizeof(union ncclSocketAddress);
+   sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
+   if (sock->fd != -1) {
+     sock->state = ncclSocketStateAccepted;
+     return ncclSuccess;
+   }
+
+   // Capture errno right away: the parameter lookup and logging calls below may overwrite it.
+   const int acceptErrno = errno;
+   if (acceptErrno == ENETDOWN || acceptErrno == EPROTO || acceptErrno == ENOPROTOOPT || acceptErrno == EHOSTDOWN ||
+       acceptErrno == ENONET || acceptErrno == EHOSTUNREACH || acceptErrno == EOPNOTSUPP || acceptErrno == ENETUNREACH ||
+       acceptErrno == EINTR) {
+     /* per accept's man page, for linux sockets, the following errors might be already pending errors
+      * and should be considered as EAGAIN. To avoid infinite loop in case of errors, we use the retry count*/
+     if (++sock->errorRetries == ncclParamRetryCnt()) {
+       WARN("socketTryAccept: exceeded error retry count after %d attempts, %s", sock->errorRetries, strerror(acceptErrno));
+       return ncclSystemError;
+     }
+     INFO(NCCL_NET|NCCL_INIT, "Call to accept returned %s, retrying", strerror(acceptErrno));
+   } else if (acceptErrno != EAGAIN && acceptErrno != EWOULDBLOCK) {
+     WARN("socketTryAccept: Accept failed: %s", strerror(acceptErrno));
+     return ncclSystemError;
+   }
+   return ncclSuccess;
+ }
+
+ // Socket buffer sizes applied by socketSetFlags() through SO_RCVBUF / SO_SNDBUF.
+ // A value <= 0 (the default is -1) leaves the corresponding socket option untouched.
+ NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1);
+ NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1);
+
+ // Apply the per-socket options to sock->fd:
+ //  - O_NONBLOCK when the socket is async or has an abort flag (and fd is valid),
+ //  - TCP_NODELAY always,
+ //  - SO_SNDBUF / SO_RCVBUF only when the matching parameter is > 0.
+ static ncclResult_t socketSetFlags(struct ncclSocket* sock) {
+   const int enable = 1;
+   /* Set socket as non-blocking if async or if we need to be able to abort */
+   const bool wantNonBlocking = (sock->asyncFlag || sock->abortFlag);
+   if (wantNonBlocking && sock->fd >= 0) {
+     int curFlags;
+     SYSCHECK(curFlags = fcntl(sock->fd, F_GETFL), "fcntl");
+     SYSCHECK(fcntl(sock->fd, F_SETFL, curFlags | O_NONBLOCK), "fcntl");
+   }
+   SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&enable, sizeof(int)), "setsockopt TCP NODELAY");
+
+   // setsockopt should not fail even if the sizes are too large, do not change the default if unset by the user (=-1)
+   int rcvBuf = ncclParamSocketMaxRecvBuff();
+   int sndBuf = ncclParamSocketMaxSendBuff();
+   if (sndBuf > 0) {
+     SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF");
+   }
+   if (rcvBuf > 0) {
+     SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF");
+   }
+   return ncclSuccess;
+ }
+
+ // Drop a connection that failed the handshake and return the socket to the
+ // accepting state: the accepted fd is closed, fd is reset to -1 and the
+ // handshake byte counter restarts from 0.
+ static void socketResetAccept(struct ncclSocket* sock) {
+   char peer[SOCKET_NAME_MAXLEN+1];
+   INFO(NCCL_NET|NCCL_INIT, "socketFinalizeAccept: didn't receive a valid magic from %s",
+        ncclSocketToString(&sock->addr, peer));
+   // Ignore spurious connection and accept again
+   const int staleFd = sock->fd;
+   sock->finalizeCounter = 0;
+   sock->state = ncclSocketStateAccepting;
+   sock->fd = -1;
+   (void)close(staleFd);
+ }
+
+ // Complete the accept-side handshake on a freshly accepted connection: receive the 64-bit
+ // magic followed by the socket type and compare both with the values stored in `sock`.
+ //  - Blocking sockets (asyncFlag == 0) wait for each field.
+ //  - Async sockets receive into sock->finalizeBuffer and use sock->finalizeCounter to resume
+ //    across calls; ncclSuccess is returned while data is still missing.
+ // A connection that closes early or presents the wrong magic is dropped and the socket goes
+ // back to accepting (ncclSuccess). A wrong type closes the fd, sets ncclSocketStateError and
+ // returns ncclInternalError. On success the state becomes ncclSocketStateReady.
+ static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
+   uint64_t magic;
+   enum ncclSocketType type;
+   int received;
+   char line[SOCKET_NAME_MAXLEN+1];
+   // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do)
+   NCCLCHECK(socketSetFlags(sock));
+
+   if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) {
+     if (sock->asyncFlag == 0) {
+       received = 0;
+       if (socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received) != ncclSuccess) {
+         socketResetAccept(sock);
+         return ncclSuccess;
+       }
+     } else {
+       int closed = 0;
+       received = sock->finalizeCounter;
+       NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received, &closed));
+       sock->finalizeCounter = received;
+       if (received < sizeof(magic)) {
+         if (closed) {
+           socketResetAccept(sock);
+         }
+         return ncclSuccess;
+       }
+       memcpy(&magic, sock->finalizeBuffer, sizeof(magic));
+     }
+     // The magic is the only check that the peer is the expected one: a mismatch must be
+     // treated as a spurious connection, never adopted as the socket's own magic.
+     if (magic != sock->magic) {
+       socketResetAccept(sock);
+       return ncclSuccess;
+     }
+   }
+   if (sock->asyncFlag == 0) {
+     received = 0;
+     NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
+   } else {
+     // finalizeCounter counts magic + type bytes; convert it to an offset within the type field.
+     received = sock->finalizeCounter - sizeof(magic);
+     NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(type), &received));
+     sock->finalizeCounter = received + sizeof(magic);
+     if (received < sizeof(type)) return ncclSuccess;
+     memcpy(&type, sock->finalizeBuffer, sizeof(type));
+   }
+   if (type != sock->type) {
+     WARN("socketFinalizeAccept from %s: wrong type %d != %d", ncclSocketToString(&sock->addr, line), type, sock->type);
+     sock->state = ncclSocketStateError;
+     (void)close(sock->fd);
+     sock->fd = -1;
+     return ncclInternalError;
+   } else {
+     sock->state = ncclSocketStateReady;
+   }
+   return ncclSuccess;
+ }
+
+ // Give `sock` a fresh TCP socket of its address family and re-apply the socket flags.
+ // When sock->fd is already valid the new socket is dup2()'ed onto it, so the descriptor
+ // number is preserved; otherwise the new descriptor is stored in sock->fd.
+ // Returns the result recorded in `ret` by the failing SYSCHECKGOTO/NCCLCHECKGOTO, or ncclSuccess.
+ static ncclResult_t socketResetFd(struct ncclSocket* sock) {
+ ncclResult_t ret = ncclSuccess;
+ int fd = -1;
+ SYSCHECKGOTO(fd = socket(sock->addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, cleanup);
+ // if sock->fd is valid, close it and reuse its number
+ if (sock->fd != -1) {
+ SYSCHECKGOTO(dup2(fd, sock->fd), "dup2", ret, cleanup);
+ // NOTE(review): if this close() fails, the cleanup path closes the same fd a second time — confirm this is acceptable.
+ SYSCHECKGOTO(close(fd), "close", ret, cleanup);
+ } else {
+ sock->fd = fd;
+ }
+ // A failure here jumps straight to exit: sock->fd keeps the new socket.
+ NCCLCHECKGOTO(socketSetFlags(sock), ret, exit);
+ exit:
+ return ret;
+ cleanup:
+ // cleanup fd, leave sock->fd untouched
+ if (fd != -1) {
+ (void)close(fd);
+ }
+ goto exit;
+ }
+
+ // Translate the outcome of a connect attempt (errCode, 0 meaning success) into the next
+ // socket state.
+ //   0            -> ncclSocketStateConnected
+ //   EINPROGRESS  -> ncclSocketStateConnectPolling
+ //   retryable errors (EINTR, EWOULDBLOCK, EAGAIN, ETIMEDOUT, EHOSTUNREACH, ECONNREFUSED)
+ //                -> socket is recreated and the state returns to ncclSocketStateConnecting.
+ //                   Unless sock->customRetry is set, attempts are counted against
+ //                   ncclParamRetryCnt() and each retry sleeps errorRetries * ncclParamRetryTimeOut() msec.
+ //   anything else -> ncclSocketStateError, ncclSystemError
+ // funcName is only used in log messages.
+ static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) {
+   char line[SOCKET_NAME_MAXLEN+1];
+
+   if (errCode == 0) {
+     sock->state = ncclSocketStateConnected;
+     return ncclSuccess;
+   }
+   if (errCode == EINPROGRESS) {
+     sock->state = ncclSocketStateConnectPolling;
+     return ncclSuccess;
+   }
+
+   const bool retryable = (errCode == EINTR || errCode == EWOULDBLOCK || errCode == EAGAIN || errCode == ETIMEDOUT ||
+                           errCode == EHOSTUNREACH || errCode == ECONNREFUSED);
+   if (!retryable) {
+     sock->state = ncclSocketStateError;
+     WARN("%s: connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode));
+     return ncclSystemError;
+   }
+
+   if (sock->customRetry == 0) {
+     if (sock->errorRetries++ == ncclParamRetryCnt()) {
+       sock->state = ncclSocketStateError;
+       WARN("%s: connect to %s returned %s, exceeded error retry count after %d attempts",
+            funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), sock->errorRetries);
+       return ncclRemoteError;
+     }
+     unsigned int backoffMsec = sock->errorRetries * ncclParamRetryTimeOut();
+     INFO(NCCL_NET|NCCL_INIT, "%s: connect to %s returned %s, retrying (%d/%ld) after sleep for %u msec",
+          funcName, ncclSocketToString(&sock->addr, line), strerror(errCode),
+          sock->errorRetries, ncclParamRetryCnt(), backoffMsec);
+     msleep(backoffMsec);
+   }
+   NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */
+   sock->state = ncclSocketStateConnecting;
+   return ncclSuccess;
+ }
+
+ // Issue connect() on sock->fd and let socketConnectCheck() pick the next state.
+ // Whether connect() blocks is determined by the flags set on the fd (asyncFlag).
+ static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
+   int errCode = 0;
+   if (connect(sock->fd, &sock->addr.sa, sock->salen) == -1) errCode = errno;
+   return socketConnectCheck(sock, errCode, __func__);
+ }
+
+ // Poll (1 ms timeout) for completion of an in-progress connect().
+ // Returns ncclSuccess without changing state when nothing is ready yet or poll() was
+ // interrupted. Once the fd is reported, the pending SO_ERROR value is handed to
+ // socketConnectCheck() to decide the next state.
+ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
+   char line[SOCKET_NAME_MAXLEN+1];
+   const int pollTimeoutMsec = 1;
+
+   struct pollfd pfd;
+   memset(&pfd, 0, sizeof(struct pollfd));
+   pfd.fd = sock->fd;
+   pfd.events = POLLOUT;
+
+   int status = poll(&pfd, 1, pollTimeoutMsec);
+   if (status < 0) {
+     if (errno == EINTR) return ncclSuccess;
+     WARN("socketPollConnect to %s failed with error %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+     return ncclSystemError;
+   }
+   if (status == 0) return ncclSuccess;
+
+   /* check socket status */
+   socklen_t optLen = sizeof(int);
+   SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&status, &optLen), "getsockopt");
+   return socketConnectCheck(sock, status, __func__);
+ }
+
+ // Public wrapper around socketPollConnect(); rejects a NULL socket with ncclInvalidArgument.
+ ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
+   if (sock != NULL) {
+     NCCLCHECK(socketPollConnect(sock));
+     return ncclSuccess;
+   }
+   WARN("ncclSocketPollConnect: pass NULL socket");
+   return ncclInvalidArgument;
+ }
+
+ // Complete the connect-side handshake: send sock->magic followed by sock->type.
+ // Blocking sockets (asyncFlag == 0) send both fields in full. Async sockets send what they
+ // can and use sock->finalizeCounter (bytes of magic + type sent so far) to resume on the
+ // next call, returning ncclSuccess while bytes remain.
+ // The state becomes ncclSocketStateReady once both fields are fully sent.
+ static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) {
+ int sent;
+ if (sock->asyncFlag == 0) {
+ sent = 0;
+ NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
+ sent = 0;
+ NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+ } else {
+ // Phase 1: the magic is still (partially) unsent.
+ if (sock->finalizeCounter < sizeof(sock->magic)) {
+ sent = sock->finalizeCounter;
+ NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
+ sock->finalizeCounter = sent;
+ if (sent < sizeof(sock->magic)) return ncclSuccess;
+ }
+ // Phase 2: convert the overall counter into an offset within the type field.
+ sent = sock->finalizeCounter - sizeof(sock->magic);
+ NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+ sock->finalizeCounter = sent + sizeof(sock->magic);
+ if (sent < sizeof(sock->type)) return ncclSuccess;
+ }
+ sock->state = ncclSocketStateReady;
+ return ncclSuccess;
+ }
+
+ // Run one pass of the connection state machine for `sock`.
+ // The checks are sequential rather than exclusive: when a step moves the socket to the next
+ // state, the following check picks it up in the same call (accepting -> accepted,
+ // connecting -> polling -> connected). The first failing step's error is returned.
+ static ncclResult_t socketProgressState(struct ncclSocket* sock) {
+ if (sock->state == ncclSocketStateAccepting) {
+ NCCLCHECK(socketTryAccept(sock));
+ }
+ if (sock->state == ncclSocketStateAccepted) {
+ NCCLCHECK(socketFinalizeAccept(sock));
+ }
+ if (sock->state == ncclSocketStateConnecting) {
+ NCCLCHECK(socketStartConnect(sock));
+ }
+ if (sock->state == ncclSocketStateConnectPolling) {
+ NCCLCHECK(socketPollConnect(sock));
+ }
+ if (sock->state == ncclSocketStateConnected) {
+ NCCLCHECK(socketFinalizeConnect(sock));
+ }
+ return ncclSuccess;
+ }
+
+ // Report through *running whether `sock` is in the ready state (1) or not (0).
+ // A socket that is not ready yet gets one socketProgressState() pass before the final
+ // check. A NULL socket reports 0 with ncclSuccess; a socket in the error or closed
+ // state returns ncclRemoteError.
+ ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
+   if (sock == NULL) {
+     *running = 0;
+     return ncclSuccess;
+   }
+   if (sock->state == ncclSocketStateError || sock->state == ncclSocketStateClosed) {
+     WARN("ncclSocketReady: unexpected socket state %d", sock->state);
+     return ncclRemoteError;
+   }
+   if (sock->state != ncclSocketStateReady) {
+     // Not ready yet: report 0 in case the progress step fails, then try to advance.
+     *running = 0;
+     NCCLCHECK(socketProgressState(sock));
+   }
+   *running = (sock->state == ncclSocketStateReady) ? 1 : 0;
+   return ncclSuccess;
+ }
+
+ // Start connecting an initialized socket to sock->addr.
+ // Blocking sockets (asyncFlag == 0) are driven until they leave the connecting states or
+ // the abort flag is raised; async sockets get a single progress pass and are completed
+ // later by the caller. Returns ncclInvalidArgument for a NULL socket or missing fd,
+ // ncclInternalError/ncclRemoteError for a wrong initial state, ncclInternalError on abort,
+ // and ncclSystemError when the socket ends in the error state.
+ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
+ #ifdef ENABLE_TRACE
+   char line[SOCKET_NAME_MAXLEN+1];
+ #endif
+
+   if (sock == NULL) {
+     WARN("ncclSocketConnect: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (sock->fd == -1) {
+     WARN("ncclSocketConnect: file descriptor is -1");
+     return ncclInvalidArgument;
+   }
+
+   if (sock->state != ncclSocketStateInitialized) {
+     WARN("ncclSocketConnect: wrong socket state %d", sock->state);
+     return (sock->state == ncclSocketStateError) ? ncclRemoteError : ncclInternalError;
+   }
+   TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
+
+   sock->state = ncclSocketStateConnecting;
+   sock->finalizeCounter = 0;
+   for (;;) {
+     NCCLCHECK(socketProgressState(sock));
+     if (sock->asyncFlag != 0) break;
+     if (sock->abortFlag != NULL && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) != 0) break;
+     const bool stillConnecting = (sock->state == ncclSocketStateConnecting ||
+                                   sock->state == ncclSocketStateConnectPolling ||
+                                   sock->state == ncclSocketStateConnected);
+     if (!stillConnecting) break;
+   }
+
+   if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
+
+   if (sock->state == ncclSocketStateConnecting ||
+       sock->state == ncclSocketStateConnectPolling ||
+       sock->state == ncclSocketStateConnected ||
+       sock->state == ncclSocketStateReady) {
+     return ncclSuccess;
+   }
+   if (sock->state == ncclSocketStateError) {
+     return ncclSystemError;
+   }
+   WARN("ncclSocketConnect: wrong socket state %d", sock->state);
+   return ncclInternalError;
+ }
+
+ // Accept a connection from listenSock into sock.
+ // On the first call for a given sock (sock->acceptFd == -1) the listening socket is copied
+ // into sock, its fd is remembered as acceptFd and the state is set to accepting. Blocking
+ // sockets (asyncFlag == 0) are then driven until they leave the accepting/accepted states
+ // or the abort flag is raised; async sockets get one progress pass per call.
+ // Returns ncclInvalidArgument for NULL arguments, ncclSystemError/ncclInternalError when
+ // listenSock is not ready, ncclInternalError on abort, and ncclSystemError when sock ends
+ // in the error state.
+ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) {
+ ncclResult_t ret = ncclSuccess;
+
+ if (listenSock == NULL || sock == NULL) {
+ WARN("ncclSocketAccept: pass NULL socket");
+ ret = ncclInvalidArgument;
+ goto exit;
+ }
+ if (listenSock->state != ncclSocketStateReady) {
+ WARN("ncclSocketAccept: wrong socket state %d", listenSock->state);
+ if (listenSock->state == ncclSocketStateError)
+ ret = ncclSystemError;
+ else
+ ret = ncclInternalError;
+ goto exit;
+ }
+
+ // First call only: inherit every setting from the listening socket. acceptFd is set
+ // afterwards, so repeated (async) calls skip this and keep the handshake progress.
+ if (sock->acceptFd == -1) {
+ memcpy(sock, listenSock, sizeof(struct ncclSocket));
+ sock->acceptFd = listenSock->fd;
+ sock->state = ncclSocketStateAccepting;
+ sock->finalizeCounter = 0;
+ }
+
+ do {
+ NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
+ } while (sock->asyncFlag == 0 &&
+ (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) &&
+ (sock->state == ncclSocketStateAccepting ||
+ sock->state == ncclSocketStateAccepted));
+
+ if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
+
+ switch (sock->state) {
+ // Still in progress (async) or done: both are reported as success.
+ case ncclSocketStateAccepting:
+ case ncclSocketStateAccepted:
+ case ncclSocketStateReady:
+ ret = ncclSuccess;
+ break;
+ case ncclSocketStateError:
+ ret = ncclSystemError;
+ break;
+ default:
+ WARN("ncclSocketAccept: wrong socket state %d", sock->state);
+ ret = ncclInternalError;
+ break;
+ }
+
+ exit:
+ return ret;
+ }
+
+ // Initialize `sock` with the given magic, type, abort flag, async flag and retry mode.
+ // The state is set to ncclSocketStateInitialized, fd and acceptFd to -1 and the error
+ // retry counter to 0. When addr is provided it must be AF_INET or AF_INET6: it is copied
+ // into the socket, salen is derived from the family and a socket is created through
+ // socketResetFd(). Without addr the stored address is zeroed and no fd is created.
+ // A NULL sock is a no-op that returns ncclSuccess. Returns ncclInternalError for an
+ // unsupported address family, or the error from socketResetFd() (fd closed in that case).
+ ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry) {
+ ncclResult_t ret = ncclSuccess;
+
+ if (sock == NULL) goto exit;
+ sock->errorRetries = 0;
+ sock->abortFlag = abortFlag;
+ sock->asyncFlag = asyncFlag;
+ sock->state = ncclSocketStateInitialized;
+ sock->magic = magic;
+ sock->type = type;
+ sock->fd = -1;
+ sock->acceptFd = -1;
+ sock->customRetry = customRetry;
+
+ if (addr) {
+ /* IPv4/IPv6 support */
+ int family;
+ memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
+ family = sock->addr.sa.sa_family;
+ if (family != AF_INET && family != AF_INET6) {
+ char line[SOCKET_NAME_MAXLEN+1];
+ WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
+ ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+ ret = ncclInternalError;
+ goto exit;
+ }
+ sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ // in case of error, we close the fd before returning as it's unclear if the caller has to use ncclSocketClose for cleanup
+ NCCLCHECKGOTO(socketResetFd(sock), ret, fail);
+ } else {
+ memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
+ }
+ exit:
+ return ret;
+ fail:
+ if (sock->fd != -1) {
+ close(sock->fd);
+ sock->fd = -1;
+ }
+ goto exit;
+ }
+
+ // NULL-checked public entry point for socketProgress(): forwards op, buffer,
+ // size, offset and the optional closed indicator unchanged.
+ // Returns ncclInvalidArgument when sock is NULL.
+ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed) {
+   if (sock != NULL) {
+     NCCLCHECK(socketProgress(op, sock, ptr, size, offset, closed));
+     return ncclSuccess;
+   }
+   WARN("ncclSocketProgress: pass NULL socket");
+   return ncclInvalidArgument;
+ }
+
+ // NULL-checked public entry point for socketWait(): forwards all arguments
+ // unchanged. Returns ncclInvalidArgument when sock is NULL.
+ ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+   if (sock != NULL) {
+     NCCLCHECK(socketWait(op, sock, ptr, size, offset));
+     return ncclSuccess;
+   }
+   WARN("ncclSocketWait: pass NULL socket");
+   return ncclInvalidArgument;
+ }
+
+ // Send size bytes from ptr on sock via socketWait(NCCL_SOCKET_SEND, ...),
+ // starting from offset 0.
+ // Returns ncclInvalidArgument for a NULL socket and ncclInternalError when the
+ // socket is not in ncclSocketStateReady.
+ ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
+   if (!sock) {
+     WARN("ncclSocketSend: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (sock->state != ncclSocketStateReady) {
+     WARN("ncclSocketSend: socket state (%d) is not ready", sock->state);
+     return ncclInternalError;
+   }
+
+   int sent = 0;
+   NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &sent));
+   return ncclSuccess;
+ }
+
+ // Receive size bytes into ptr from sock via socketWait(NCCL_SOCKET_RECV, ...),
+ // starting from offset 0.
+ // Receiving is allowed in both the Ready and the Terminating state; any other
+ // state yields ncclInternalError. A NULL socket yields ncclInvalidArgument.
+ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
+   if (!sock) {
+     WARN("ncclSocketRecv: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+
+   const bool canReceive = (sock->state == ncclSocketStateReady) || (sock->state == ncclSocketStateTerminating);
+   if (!canReceive) {
+     WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
+     return ncclInternalError;
+   }
+
+   int received = 0;
+   NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &received));
+   return ncclSuccess;
+ }
+
+ // Interleave a send on sendSock and a receive on recvSock, alternating one
+ // socketProgress() call per direction until both transfers are complete.
+ // sendSock must be Ready; recvSock must be Ready or Terminating.
+ // Returns ncclInternalError for NULL sockets or an unusable socket state.
+ ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize) {
+   if (!sendSock || !recvSock) {
+     WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock);
+     return ncclInternalError;
+   }
+
+   const bool sendUsable = (sendSock->state == ncclSocketStateReady);
+   const bool recvUsable = (recvSock->state == ncclSocketStateReady) || (recvSock->state == ncclSocketStateTerminating);
+   if (!sendUsable || !recvUsable) {
+     WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state);
+     return ncclInternalError;
+   }
+
+   int sent = 0;
+   int received = 0;
+   for (;;) {
+     const bool sendPending = sent < sendSize;
+     const bool recvPending = received < recvSize;
+     if (!sendPending && !recvPending) break;
+     if (sendPending) {
+       NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sendSock, sendPtr, sendSize, &sent));
+     }
+     if (recvPending) {
+       NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, recvSock, recvPtr, recvSize, &received));
+     }
+   }
+   return ncclSuccess;
+ }
+
+ // Receive size bytes into ptr, or detect that the connection was closed.
+ //
+ // *closed is cleared on entry and set by socketProgressOpt() when the peer has
+ // closed the connection, in which case ncclSuccess is returned immediately.
+ // With blocking == true the call loops until all bytes have arrived. With
+ // blocking == false a single attempt is made first: if it yields no bytes the
+ // call returns ncclInProgress, otherwise it keeps going until the message is
+ // complete.
+ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
+   if (!sock) {
+     WARN("ncclSocketTryRecv: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+
+   int received = 0;
+   *closed = 0;
+
+   if (!blocking) {
+     // Single non-blocking probe.
+     NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &received, 0, closed));
+     if (*closed) return ncclSuccess;
+     // Nothing arrived yet: let the caller retry later.
+     if (received <= 0) return ncclInProgress;
+   }
+
+   // Either blocking was requested or a message has started arriving:
+   // wait for the remainder (or for the connection to close).
+   while (received < size) {
+     NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &received, 0, closed));
+     if (*closed) return ncclSuccess;
+   }
+   return ncclSuccess;
+ }
+
+ // Shut down one direction (or both) of a socket without closing it.
+ // `how` is passed straight to shutdown(2). The socket is then marked
+ // ncclSocketStateTerminating, even when it has no open fd.
+ // A NULL sock is accepted and is a no-op.
+ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
+   if (sock == NULL) return ncclSuccess;
+
+   if (sock->fd >= 0) {
+     SYSCHECK(shutdown(sock->fd, how), "shutdown");
+   }
+   sock->state = ncclSocketStateTerminating;
+   return ncclSuccess;
+ }
+
+ // Close sock and mark it ncclSocketStateClosed with fd == -1.
+ //
+ // When wait is true, single bytes are received and discarded until the peer
+ // closes the connection or ncclSocketProgress() reports an error. The fd is
+ // only touched when the state is a valid enum value and the fd is open.
+ // Always returns ncclSuccess; a NULL sock is a no-op.
+ ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) {
+   if (sock == NULL) return ncclSuccess;
+
+   const bool stateValid = sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum;
+   if (stateValid && sock->fd >= 0) {
+     if (wait) {
+       char sink;
+       int peerClosed = 0;
+       while (peerClosed == 0) {
+         int offset = 0;
+         if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sink, sizeof(char), &offset, &peerClosed) != ncclSuccess) break;
+       }
+     }
+     /* shutdown() is what sends the FIN packet to the proxy thread. Unlike
+      * close(), it is not affected by the fd refcount: close() will not really
+      * close the fd (and send FIN) while a duplicate exists (e.g. after fork()).
+      * Calling shutdown() first therefore guarantees a graceful connection
+      * close here. */
+     (void)shutdown(sock->fd, SHUT_RDWR);
+     (void)close(sock->fd);
+   }
+
+   sock->fd = -1;
+   sock->state = ncclSocketStateClosed;
+   return ncclSuccess;
+ }
+
+ // Copy the socket's file descriptor into *fd. A NULL fd pointer is tolerated
+ // and simply skips the copy. Returns ncclInvalidArgument when sock is NULL.
+ ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) {
+   if (!sock) {
+     WARN("ncclSocketGetFd: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   if (fd != NULL) {
+     *fd = sock->fd;
+   }
+   return ncclSuccess;
+ }
+
+ // Overwrite the socket's file descriptor with fd. No validation of fd is
+ // performed and the previous descriptor is not closed.
+ // Returns ncclInvalidArgument when sock is NULL.
+ ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) {
+   if (sock == NULL) {
+     // Fixed copy-paste error: the message used to name ncclSocketGetFd,
+     // which pointed log readers at the wrong call site.
+     WARN("ncclSocketSetFd: pass NULL socket");
+     return ncclInvalidArgument;
+   }
+   sock->fd = fd;
+   return ncclSuccess;
+ }
+
\ No newline at end of file
@@ -1,8 +1,8 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
+* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+*
+* See LICENSE.txt for license information
+************************************************************************/
#ifndef NCCL_H_
#define NCCL_H_
@@ -37,16 +37,27 @@ typedef struct ncclWindow* ncclWindow_t;
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+#define NCCL_NEW_RANK_INFO_BYTES (10 * 1024 * 1024) /* 10 MiB; presumably the capacity of the buffer behind ncclNewRankInfo.internal -- TODO confirm at the allocation site */
+typedef struct alignas(16) { /* NOTE(review): `struct alignas(16)` is C++11 syntax -- confirm this public header is never compiled as C */
+char* internal; /* opaque payload pointer; allocation, size and ownership are not defined here -- TODO document */
+} ncclNewRankInfo;
+
+#define NCCL_COMM_INFO_BYTES (10 * 1024 * 1024) /* 10 MiB; same value as the array bound of the disabled inline member below */
+typedef struct alignas(16) { /* NOTE(review): same C-compatibility caveat as ncclNewRankInfo above */
+ //char internal[NCCL_COMM_INFO_BYTES];
+ char* internal; /* pointer used in place of the disabled inline array; presumably refers to NCCL_COMM_INFO_BYTES bytes -- TODO confirm */
+} ncclCommInfo;
+
/* Error type */
typedef enum { ncclSuccess = 0,
- ncclUnhandledCudaError = 1,
- ncclSystemError = 2,
- ncclInternalError = 3,
- ncclInvalidArgument = 4,
- ncclInvalidUsage = 5,
- ncclRemoteError = 6,
- ncclInProgress = 7,
- ncclNumResults = 8 } ncclResult_t;
+ ncclUnhandledCudaError = 1,
+ ncclSystemError = 2,
+ ncclInternalError = 3,
+ ncclInvalidArgument = 4,
+ ncclInvalidUsage = 5,
+ ncclRemoteError = 6,
+ ncclInProgress = 7,
+ ncclNumResults = 8 } ncclResult_t;
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
@@ -66,45 +77,45 @@ typedef enum { ncclSuccess = 0,
#define NCCL_SHRINK_ABORT 0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */
/* Communicator configuration. Users can assign value to attributes to specify the
- * behavior of a communicator. */
+* behavior of a communicator. */
typedef struct ncclConfig_v22700 {
- /* attributes that users should never touch. */
- size_t size;
- unsigned int magic;
- unsigned int version;
- /* attributes that users are able to customize. */
- int blocking;
- int cgaClusterSize;
- int minCTAs;
- int maxCTAs;
- const char *netName;
- int splitShare;
- int trafficClass;
- const char *commName;
- int collnetEnable;
- int CTAPolicy;
- int shrinkShare;
- int nvlsCTAs;
+/* attributes that users should never touch. */
+size_t size;
+unsigned int magic;
+unsigned int version;
+/* attributes that users are able to customize. */
+int blocking;
+int cgaClusterSize;
+int minCTAs;
+int maxCTAs;
+const char *netName;
+int splitShare;
+int trafficClass;
+const char *commName;
+int collnetEnable;
+int CTAPolicy;
+int shrinkShare;
+int nvlsCTAs;
} ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created.
- * Not initialized config will result in NCCL error. */
+* Not initialized config will result in NCCL error. */
#define NCCL_CONFIG_INITIALIZER { \
- sizeof(ncclConfig_t), /* size */ \
- 0xcafebeef, /* magic */ \
- NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
- NCCL_CONFIG_UNDEF_INT, /* blocking */ \
- NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
- NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
- NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
- NCCL_CONFIG_UNDEF_PTR, /* netName */ \
- NCCL_CONFIG_UNDEF_INT, /* splitShare */ \
- NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \
- NCCL_CONFIG_UNDEF_PTR, /* commName */ \
- NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \
- NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \
- NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \
- NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \
+sizeof(ncclConfig_t), /* size */ \
+0xcafebeef, /* magic */ \
+NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
+NCCL_CONFIG_UNDEF_INT, /* blocking */ \
+NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
+NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
+NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
+NCCL_CONFIG_UNDEF_PTR, /* netName */ \
+NCCL_CONFIG_UNDEF_INT, /* splitShare */ \
+NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \
+NCCL_CONFIG_UNDEF_PTR, /* commName */ \
+NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \
+NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \
+NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \
+NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \
}
/* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */
@@ -116,17 +127,17 @@ typedef struct ncclSimInfo_v22200 {
} ncclSimInfo_t;
/* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created.
- * Not initialized simInfo will result in NCCL error. */
+* Not initialized simInfo will result in NCCL error. */
#define NCCL_SIM_INFO_INITIALIZER { \
- sizeof(ncclSimInfo_t), /* size */ \
- 0x74685283, /* magic */ \
- NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
- NCCL_UNDEF_FLOAT /* estimated time */ \
+sizeof(ncclSimInfo_t), /* size */ \
+0x74685283, /* magic */ \
+NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
+NCCL_UNDEF_FLOAT /* estimated time */ \
}
/* NCCL malloc and free function for all types of NCCL optimizations
- * (e.g. user buffer registration). The actual allocated size might
- * be larger than requested due to granularity requirement. */
+* (e.g. user buffer registration). The actual allocated size might
+* be larger than requested due to granularity requirement. */
ncclResult_t ncclMemAlloc(void** ptr, size_t size);
ncclResult_t pncclMemAlloc(void** ptr, size_t size);
@@ -134,46 +145,57 @@ ncclResult_t ncclMemFree(void *ptr);
ncclResult_t pncclMemFree(void *ptr);
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
- * This integer is coded with the MAJOR, MINOR and PATCH level of the
- * NCCL library
- */
+* This integer is coded with the MAJOR, MINOR and PATCH level of the
+* NCCL library
+*/
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
- * called once and the Id should be distributed to all ranks in the
- * communicator before calling ncclCommInitRank. */
+* called once and the Id should be distributed to all ranks in the
+* communicator before calling ncclCommInitRank. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/* Create a new communicator (multi thread/process version) with a configuration
- * set by users. */
+* set by users. */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
/* Creates a new communicator (multi thread/process version).
- * rank must be between 0 and nranks-1 and unique within a communicator clique.
- * Each rank is associated to a CUDA device, which has to be set before calling
- * ncclCommInitRank.
- * ncclCommInitRank implicitly syncronizes with other ranks, so it must be
- * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+* rank must be between 0 and nranks-1 and unique within a communicator clique.
+* Each rank is associated to a CUDA device, which has to be set before calling
+* ncclCommInitRank.
+* ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+* called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ncclResult_t ncclCommInitNewRank(ncclComm_t* comm, int nRanks);
+ncclResult_t ncclCommAddNewRank(ncclComm_t comm);
+ncclResult_t ncclCommExportInfo(ncclComm_t comm, ncclUniqueId* commId, ncclCommInfo* commInfo);
+ncclResult_t ncclCommSetupNewRank(ncclComm_t comm);
+
+ncclResult_t ncclBootstrapBroadcast(ncclComm_t comm, int root, void* buffer, size_t size);
+
/* Creates a clique of communicators (single process version).
- * This is a convenience function to create a single-process communicator clique.
- * Returns an array of ndev newly initialized communicators in comm.
- * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
- * If devlist is NULL, the first ndev CUDA devices are used.
- * Order of devlist defines user-order of processors within the communicator. */
+* This is a convenience function to create a single-process communicator clique.
+* Returns an array of ndev newly initialized communicators in comm.
+* comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+* If devlist is NULL, the first ndev CUDA devices are used.
+* Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+/* Restore a specific rank from a GPU reset and clear hung operations on the other ranks. */
+ncclResult_t ncclRestoreRank(ncclComm_t comm, int rank);
+ncclResult_t pncclRestoreRank(ncclComm_t comm, int rank);
+
/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
- * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
- * when the communicator is globally quiescent and related resources are freed; then,
- * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
- * itself) without blocking. */
+* and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+* when the communicator is globally quiescent and related resources are freed; then,
+* calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
+* itself) without blocking. */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
ncclResult_t pncclCommFinalize(ncclComm_t comm);
@@ -182,32 +204,34 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);
/* Frees resources associated with communicator object and aborts any operations
- * that might still be running on the device. */
+* that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);
+ncclResult_t ncclRemoveRank(ncclComm_t comm, int rank);
+
/* Creates one or more communicators from an existing one.
- * Ranks with the same color will end up in the same communicator.
- * Within the new communicator, key will be used to order ranks.
- * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
- * and will therefore return a NULL communicator.
- * If config is NULL, the new communicator will inherit the original communicator's
- * configuration*/
+* Ranks with the same color will end up in the same communicator.
+* Within the new communicator, key will be used to order ranks.
+* NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
+* and will therefore return a NULL communicator.
+* If config is NULL, the new communicator will inherit the original communicator's
+* configuration*/
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
/* Shrink existing communicator.
- * Ranks in excludeRanksList will be removed form the existing communicator.
- * Within the new communicator, ranks will be re-ordered to fill the gap of removed ones.
- * If config is NULL, the new communicator will inherit the original communicator's configuration
- * The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.*/
+* Ranks in excludeRanksList will be removed from the existing communicator.
+* Within the new communicator, ranks will be re-ordered to fill the gap of removed ones.
+* If config is NULL, the new communicator will inherit the original communicator's configuration
+* The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.*/
ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
ncclResult_t pncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
/* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
- * Allows to use more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
- * The number of ncclUniqueIds and their order must be the same for every rank.
- */
+* Allows to use more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
+* The number of ncclUniqueIds and their order must be the same for every rank.
+*/
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
@@ -258,153 +282,153 @@ ncclResult_t pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum = 0,
- ncclProd = 1,
- ncclMax = 2,
- ncclMin = 3,
- ncclAvg = 4,
- /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
+ ncclProd = 1,
+ ncclMax = 2,
+ ncclMin = 3,
+ ncclAvg = 4,
+ /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
* serves as the least possible value for dynamic ncclRedOp_t's
* as constructed by ncclRedOpCreate*** functions. */
- ncclNumOps = 5,
- /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
+ ncclNumOps = 5,
+ /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
* It is defined to be the largest signed value (since compilers
* are permitted to use signed enums) that won't grow
* sizeof(ncclRedOp_t) when compared to previous NCCL versions to
* maintain ABI compatibility. */
- ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
- } ncclRedOp_t;
+ ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
+ } ncclRedOp_t;
/* Data types */
typedef enum { ncclInt8 = 0, ncclChar = 0,
- ncclUint8 = 1,
- ncclInt32 = 2, ncclInt = 2,
- ncclUint32 = 3,
- ncclInt64 = 4,
- ncclUint64 = 5,
- ncclFloat16 = 6, ncclHalf = 6,
- ncclFloat32 = 7, ncclFloat = 7,
- ncclFloat64 = 8, ncclDouble = 8,
- ncclBfloat16 = 9,
- ncclFloat8e4m3 = 10,
- ncclFloat8e5m2 = 11,
- ncclNumTypes = 12
+ ncclUint8 = 1,
+ ncclInt32 = 2, ncclInt = 2,
+ ncclUint32 = 3,
+ ncclInt64 = 4,
+ ncclUint64 = 5,
+ ncclFloat16 = 6, ncclHalf = 6,
+ ncclFloat32 = 7, ncclFloat = 7,
+ ncclFloat64 = 8, ncclDouble = 8,
+ ncclBfloat16 = 9,
+ ncclFloat8e4m3 = 10,
+ ncclFloat8e5m2 = 11,
+ ncclNumTypes = 12
} ncclDataType_t;
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum {
- /* ncclScalarDevice: The scalar is in device-visible memory and will be
- * dereferenced while the collective is running. */
- ncclScalarDevice = 0,
+/* ncclScalarDevice: The scalar is in device-visible memory and will be
+* dereferenced while the collective is running. */
+ncclScalarDevice = 0,
- /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
- * dereferenced before the ncclRedOpCreate***() function returns. */
- ncclScalarHostImmediate = 1
+/* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
+* dereferenced before the ncclRedOpCreate***() function returns. */
+ncclScalarHostImmediate = 1
} ncclScalarResidence_t;
/*
- * ncclRedOpCreatePreMulSum
- *
- * Creates a new reduction operator which pre-multiplies input values by a given
- * scalar locally before reducing them with peer values via summation. For use
- * only with collectives launched against *comm* and *datatype*. The
- * *residence* argument indicates how/when the memory pointed to by *scalar*
- * will be dereferenced. Upon return, the newly created operator's handle
- * is stored in *op*.
- */
+* ncclRedOpCreatePreMulSum
+*
+* Creates a new reduction operator which pre-multiplies input values by a given
+* scalar locally before reducing them with peer values via summation. For use
+* only with collectives launched against *comm* and *datatype*. The
+* *residence* argument indicates how/when the memory pointed to by *scalar*
+* will be dereferenced. Upon return, the newly created operator's handle
+* is stored in *op*.
+*/
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
/*
- * ncclRedOpDestroy
- *
- * Destroys the reduction operator *op*. The operator must have been created by
- * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
- * destroyed as soon as the last NCCL function which is given that operator returns.
- */
+* ncclRedOpDestroy
+*
+* Destroys the reduction operator *op*. The operator must have been created by
+* ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
+* destroyed as soon as the last NCCL function which is given that operator returns.
+*/
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*
- * Collective communication operations
- *
- * Collective communication operations must be called separately for each
- * communicator in a communicator clique.
- *
- * They return when operations have been enqueued on the CUDA stream.
- *
- * Since they may perform inter-CPU synchronization, each call has to be done
- * from a different thread or process, or need to use Group Semantics (see
- * below).
- */
+* Collective communication operations
+*
+* Collective communication operations must be called separately for each
+* communicator in a communicator clique.
+*
+* They return when operations have been enqueued on the CUDA stream.
+*
+* Since they may perform inter-CPU synchronization, each call has to be done
+* from a different thread or process, or need to use Group Semantics (see
+* below).
+*/
/*
- * Reduce
- *
- * Reduces data arrays of length count in sendbuff into recvbuff using op
- * operation.
- * recvbuff may be NULL on all calls except for root device.
- * root is the rank (not the CUDA device) where data will reside after the
- * operation is complete.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
+* Reduce
+*
+* Reduces data arrays of length count in sendbuff into recvbuff using op
+* operation.
+* recvbuff may be NULL on all calls except for root device.
+* root is the rank (not the CUDA device) where data will reside after the
+* operation is complete.
+*
+* In-place operation will happen if sendbuff == recvbuff.
+*/
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
/*
- * (deprecated) Broadcast (in-place)
- *
- * Copies count values from root to all other devices.
- * root is the rank (not the CUDA device) where data resides before the
- * operation is started.
- *
- * This operation is implicitely in place.
- */
+* (deprecated) Broadcast (in-place)
+*
+* Copies count values from root to all other devices.
+* root is the rank (not the CUDA device) where data resides before the
+* operation is started.
+*
+* This operation is implicitly in place.
+*/
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
/*
- * Broadcast
- *
- * Copies count values from root to all other devices.
- * root is the rank (not the CUDA device) where data resides before the
- * operation is started.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
+* Broadcast
+*
+* Copies count values from root to all other devices.
+* root is the rank (not the CUDA device) where data resides before the
+* operation is started.
+*
+* In-place operation will happen if sendbuff == recvbuff.
+*/
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
/*
- * All-Reduce
- *
- * Reduces data arrays of length count in sendbuff using op operation, and
- * leaves identical copies of result on each recvbuff.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
+* All-Reduce
+*
+* Reduces data arrays of length count in sendbuff using op operation, and
+* leaves identical copies of result on each recvbuff.
+*
+* In-place operation will happen if sendbuff == recvbuff.
+*/
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
/*
- * Reduce-Scatter
- *
- * Reduces data in sendbuff using op operation and leaves reduced result
- * scattered over the devices so that recvbuff on rank i will contain the i-th
- * block of the result.
- * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
- * should have a size of at least nranks*recvcount elements.
- *
- * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
- */
+* Reduce-Scatter
+*
+* Reduces data in sendbuff using op operation and leaves reduced result
+* scattered over the devices so that recvbuff on rank i will contain the i-th
+* block of the result.
+* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+* should have a size of at least nranks*recvcount elements.
+*
+* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+*/
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream);
@@ -413,101 +437,101 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
cudaStream_t stream);
/*
- * All-Gather
- *
- * Each device gathers sendcount values from other GPUs into recvbuff,
- * receiving data from rank i at offset i*sendcount.
- * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
- * should have a size of at least nranks*sendcount elements.
- *
- * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
- */
+* All-Gather
+*
+* Each device gathers sendcount values from other GPUs into recvbuff,
+* receiving data from rank i at offset i*sendcount.
+* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+* should have a size of at least nranks*sendcount elements.
+*
+* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+*/
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
/*
- * Send
- *
- * Send data from sendbuff to rank peer.
- *
- * Rank peer needs to call ncclRecv with the same datatype and the same count from this
- * rank.
- *
- * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
- * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
- * ncclGroupEnd section.
- */
+* Send
+*
+* Send data from sendbuff to rank peer.
+*
+* Rank peer needs to call ncclRecv with the same datatype and the same count from this
+* rank.
+*
+* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+* ncclGroupEnd section.
+*/
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
/*
- * Receive
- *
- * Receive data from rank peer into recvbuff.
- *
- * Rank peer needs to call ncclSend with the same datatype and the same count to this
- * rank.
- *
- * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
- * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
- * ncclGroupEnd section.
- */
+* Receive
+*
+* Receive data from rank peer into recvbuff.
+*
+* Rank peer needs to call ncclSend with the same datatype and the same count to this
+* rank.
+*
+* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+* ncclGroupEnd section.
+*/
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
/*
- * Group semantics
- *
- * When managing multiple GPUs from a single thread, and since NCCL collective
- * calls may perform inter-CPU synchronization, we need to "group" calls for
- * different ranks/devices into a single call.
- *
- * Grouping NCCL calls as being part of the same collective operation is done
- * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
- * collective calls until the ncclGroupEnd call, which will wait for all calls
- * to be complete. Note that for collective communication, ncclGroupEnd only
- * guarantees that the operations are enqueued on the streams, not that
- * the operation is effectively done.
- *
- * Both collective communication and ncclCommInitRank can be used in conjunction
- * of ncclGroupStart/ncclGroupEnd, but not together.
- *
- * Group semantics also allow to fuse multiple operations on the same device
- * to improve performance (for aggregated collective calls), or to permit
- * concurrent progress of multiple send/receive operations.
- */
+* Group semantics
+*
+* When managing multiple GPUs from a single thread, and since NCCL collective
+* calls may perform inter-CPU synchronization, we need to "group" calls for
+* different ranks/devices into a single call.
+*
+* Grouping NCCL calls as being part of the same collective operation is done
+* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+* collective calls until the ncclGroupEnd call, which will wait for all calls
+* to be complete. Note that for collective communication, ncclGroupEnd only
+* guarantees that the operations are enqueued on the streams, not that
+* the operation is effectively done.
+*
+* Both collective communication and ncclCommInitRank can be used in conjunction
+* with ncclGroupStart/ncclGroupEnd, but not together.
+*
+* Group semantics also allow to fuse multiple operations on the same device
+* to improve performance (for aggregated collective calls), or to permit
+* concurrent progress of multiple send/receive operations.
+*/
/*
- * Group Start
- *
- * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
- * a single NCCL operation. Nothing will be started on the CUDA stream until
- * ncclGroupEnd.
- */
+* Group Start
+*
+* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
+* a single NCCL operation. Nothing will be started on the CUDA stream until
+* ncclGroupEnd.
+*/
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();
/*
- * Group End
- *
- * End a group call. Start a fused NCCL operation consisting of all calls since
- * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
- * need to be called after ncclGroupEnd.
- */
+* Group End
+*
+* End a group call. Start a fused NCCL operation consisting of all calls since
+* ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
+* need to be called after ncclGroupEnd.
+*/
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
/*
- * Group Simulate End
- *
- * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
- */
+* Group Simulate End
+*
+* Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
+*/
ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
new file mode 100644
@@ -0,0 +1,673 @@
+#include "alloc.h"
+#include "bootstrap.h"
+#include "channel.h"
+#include "checks.h"
+#include "coll_net.h"
+#include "enqueue.h"
+#include "graph.h"
+#include "graph/topo.h"
+#include "group.h"
+#include "nccl.h"
+#include "nccl_common.h"
+#include "serialize.h"
+#include "transport.h"
+#include "tuner.h"
+#include "lighthouse.h"
+#include <cassert>
+#include <cstdlib>
+#include <scale.h>
+
+#include "argcheck.h"
+#include "bootstrap.h"
+#include "channel.h"
+#include "coll_net.h"
+#include "enqueue.h"
+#include "gdrwrap.h"
+#include "graph.h"
+#include "graph/topo.h"
+#include "group.h"
+#include "nccl.h"
+#include "net.h"
+#include "nvmlwrap.h"
+#include "param.h"
+#include "transport.h"
+#include "tuner.h"
+#include <assert.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#define DEFAULT_LL_BUFFSIZE \
+ (NCCL_LL_LINES_PER_THREAD * NCCL_LL_MAX_NTHREADS * NCCL_STEPS * \
+ sizeof(union ncclLLFifoLine)) /* fallback for buffSizes[0] in computeBuffSizes() */
+#define DEFAULT_LL128_BUFFSIZE \
+ (NCCL_LL128_ELEMS_PER_THREAD * NCCL_LL128_MAX_NTHREADS * NCCL_STEPS * \
+ sizeof(uint64_t)) /* fallback for buffSizes[1] in computeBuffSizes() */
+#define DEFAULT_BUFFSIZE (1 << 22) /* 4 MiB; fallback for buffSizes[2] in computeBuffSizes() */
+#define BOOTSTRAP_TAG_ADD_RANK (0x1 << 27) /* tag that broadcast() passes to bootstrapSend/bootstrapRecv */
+
+NCCL_PARAM(BuffSize1, "BUFFSIZE", -2); /* -2 is the sentinel computeBuffSizes() treats as "use the DEFAULT_* value" */
+NCCL_PARAM(LlBuffSize1, "LL_BUFFSIZE", -2);
+NCCL_PARAM(Ll128BuffSize1, "LL128_BUFFSIZE", -2);
+
+NCCL_PARAM(P2pNetChunkSize1, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 KiB; selected when comm->nNodes > 1 */
+NCCL_PARAM(P2pPciChunkSize1, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 KiB; selected when neither other case applies */
+NCCL_PARAM(P2pNvlChunkSize1, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 KiB; selected when nNodes <= 1 and comm->isAllNvlink is set */
+
+struct ncclCommAddRankAsyncJob { // Argument bundle for the async jobs this file launches through ncclAsyncLaunch.
+ struct ncclAsyncJob base; // Passed to ncclAsyncLaunch as &job->base; the job functions cast that pointer back to this struct.
+ ncclComm_t comm; // Communicator the job works on; assigned by the launching API function.
+ // The three members below are never assigned or read by live code in this
+ // file; only commented-out code mentions them.
+ ncclCommTrans *newRankCommTrans;
+ // NOTE(review): presumably left over from an earlier design - confirm before removing.
+ struct ncclCommInfoInternal *peerInfo;
+ ncclUniqueId *commId;
+};
+// Pointer to the i-th ncclBootstrapHandle in a buffer that starts at h and is
+// laid out as consecutive slots of NCCL_UNIQUE_ID_BYTES bytes each.
+// Both arguments are parenthesized so expressions such as (base + 1) or
+// (n - 1) expand with the intended precedence.
+#define BOOTSTRAP_HANDLE(h, i) \
+  ((struct ncclBootstrapHandle *)((char *)(h) + (i) * NCCL_UNIQUE_ID_BYTES))
+
+// Prints one socket endpoint on stdout as "<addrLabel>: <ip>" followed by
+// "<portLabel>: <port>". Supports AF_INET and AF_INET6; any other address
+// family, or an inet_ntop failure, is reported on stderr and nothing is
+// printed for that endpoint.
+static void printSocketEndpoint(const struct sockaddr_storage *endpoint,
+                                const char *addrLabel, const char *portLabel) {
+  char ipStr[INET6_ADDRSTRLEN];
+  const void *rawAddr = NULL;
+  int port = 0;
+
+  if (endpoint->ss_family == AF_INET) {
+    const struct sockaddr_in *v4 = (const struct sockaddr_in *)endpoint;
+    rawAddr = &v4->sin_addr;
+    port = ntohs(v4->sin_port);
+  } else if (endpoint->ss_family == AF_INET6) {
+    const struct sockaddr_in6 *v6 = (const struct sockaddr_in6 *)endpoint;
+    rawAddr = &v6->sin6_addr;
+    port = ntohs(v6->sin6_port);
+  } else {
+    fprintf(stderr, "print_socket_info: unsupported address family %d\n",
+            (int)endpoint->ss_family);
+    return;
+  }
+
+  if (inet_ntop(endpoint->ss_family, rawAddr, ipStr, sizeof(ipStr)) == NULL) {
+    perror("inet_ntop failed");
+    return;
+  }
+  printf("%s: %s\n", addrLabel, ipStr);
+  printf("%s: %d\n", portLabel, port);
+}
+
+// Debug helper: dumps the local and the peer address/port of socket fd to
+// stdout.
+//
+// The address buffers are sockaddr_storage and the length argument is reset
+// before each query: getsockname/getpeername treat it as a value-result
+// parameter, so reusing a length that an IPv6 socket raised above
+// sizeof(sockaddr_in) would let the second call write past an IPv4-sized
+// buffer.
+//
+// If getsockname fails nothing is printed; if getpeername fails (e.g. the
+// socket is not connected) only the local endpoint is printed. Both failures
+// are reported through perror.
+void print_socket_info(int fd) {
+  struct sockaddr_storage localAddr;
+  struct sockaddr_storage peerAddr;
+  socklen_t addrLen;
+
+  // Local endpoint.
+  addrLen = sizeof(localAddr);
+  if (getsockname(fd, (struct sockaddr *)&localAddr, &addrLen) == -1) {
+    perror("getsockname failed");
+    return;
+  }
+  printSocketEndpoint(&localAddr, "本地地址", "本地端口");
+
+  // Remote endpoint.
+  addrLen = sizeof(peerAddr);
+  if (getpeername(fd, (struct sockaddr *)&peerAddr, &addrLen) == -1) {
+    perror("getpeername failed");
+    return;
+  }
+  printSocketEndpoint(&peerAddr, "远程地址", "远程端口");
+}
+// Debug helper: reports on stdout whether descriptor fd is in blocking or
+// non-blocking mode. If the flags cannot be read, the error is reported via
+// perror and nothing is printed on stdout.
+void check_fd_blocking(int fd) {
+  // Fetch the file status flags; fcntl fails for e.g. an invalid descriptor.
+  const int statusFlags = fcntl(fd, F_GETFL);
+  if (statusFlags == -1) {
+    perror("fcntl(F_GETFL) failed");
+    return;
+  }
+
+  // The descriptor is blocking exactly when O_NONBLOCK is clear.
+  const bool isBlocking = (statusFlags & O_NONBLOCK) == 0;
+  if (isBlocking) {
+    printf("fd = %d: 阻塞模式(Blocking)\n", fd);
+    return;
+  }
+  printf("fd = %d: 非阻塞模式(Non-Blocking)\n", fd);
+}
+
+#define TIMER_INIT_TOTAL 0 // TIMER_INIT_* are slot indices into the uint64_t timers[TIMERS_INIT_COUNT] arrays in this file.
+#define TIMER_INIT_KERNELS 1 // Measured around ncclInitKernelsForDevice in ncclCommInitNewRankFunc.
+#define TIMER_INIT_BOOTSTRAP 2 // Ring connection time plus the bootstrapInitNew call.
+#define TIMER_INIT_ALLGATHER 3 // Slots 3-6 are never written in this file; the array is handed to initTransportsRank/updateTransportsRank.
+#define TIMER_INIT_TOPO 4
+#define TIMER_INIT_GRAPHS 5
+#define TIMER_INIT_CONNECT 6
+#define TIMER_INIT_ALLOC 7 // Measured around commAlloc in ncclCommInitNewRankFunc.
+#define TIMERS_INIT_COUNT 8 // Number of slots; used as the array length.
+
+// Async job body executed by the process that joins an existing communicator
+// as its new last rank (index comm->nRanks - 1).
+//
+// Steps, in order:
+//   1. Inside one lighthouse transaction: read the communicator magic, create
+//      the bootstrap listen socket with that magic, publish this rank's listen
+//      address, look up the address of the next rank and bump the version.
+//   2. Connect the bootstrap ring sockets, then initialize the kernels for the
+//      CUDA device and receive the communicator hash over the ring send socket.
+//   3. Allocate the communicator, finish bootstrap, set up the transports and
+//      load the tuner plugin.
+//
+// job_ must be the `base` member of a ncclCommAddRankAsyncJob whose comm
+// already has cudaDev, nRanks and abortFlag set (ncclCommInitNewRank does so).
+//
+// Returns ncclSuccess or the first error encountered; the same value is stored
+// in comm->initState. Every failure goes through the `fail` label so that
+// initState is always updated.
+//
+// NOTE(review): lhTxn/lhState are released only on the failure path, exactly
+// as before. Whether txnEnd already releases them is not visible in this file;
+// confirm the ownership rules in lighthouse.h.
+static ncclResult_t ncclCommInitNewRankFunc(struct ncclAsyncJob *job_) {
+  struct ncclCommAddRankAsyncJob *job = (struct ncclCommAddRankAsyncJob *)job_;
+  ncclComm_t comm = job->comm;
+  ncclResult_t result = ncclSuccess;
+  uint64_t timers[TIMERS_INIT_COUNT] = {0};
+  uint64_t bootstrapTime = 0;
+  uint64_t magic = 0;
+  unsigned long long commIdHash = 0;
+  struct bootstrapState *state = NULL;
+  struct LhTxn *lhTxn = NULL;
+  struct LhState *lhState = NULL;
+  // Both addresses are only needed while this function runs, so they live on
+  // the stack instead of in an unchecked, never-freed malloc.
+  union ncclSocketAddress newRankAddr;
+  union ncclSocketAddress nextRankAddr;
+  size_t maxLocalSizeBytes = 0;
+  int archMajor = 0;
+  int archMinor = 0;
+  int cudaArch = 0;
+  int maxSharedMem = 0;
+  int cudaDev = comm->cudaDev;
+  int nRanks = comm->nRanks;
+  int myRank = nRanks - 1;
+
+  timers[TIMER_INIT_TOTAL] = clockNano();
+  timers[TIMER_INIT_BOOTSTRAP] = clockNano();
+  NCCLCHECKGOTO(ncclCalloc(&state, 1), result, fail);
+  comm->bootstrap = state;
+  state->abortFlag = comm->abortFlag;
+
+  // --- Lighthouse transaction -------------------------------------------
+  if (txnBegin(LH_STATE_PATH, 1, &lhTxn) != 0) {
+    fprintf(stderr, "lighthouse: txnBegin failed");
+    result = ncclInternalError;
+    goto fail;
+  }
+  if (txnLoad(lhTxn, &lhState) != 0) {
+    fprintf(stderr, "lighthouse: txnLoad failed");
+    result = ncclInternalError;
+    goto fail;
+  }
+  // The magic has to be known before the listen socket is created; the
+  // previous code passed an uninitialized value to createListenSocket.
+  getMagic(lhState, &magic);
+  state->magic = comm->magic = magic;
+  NCCLCHECKGOTO(createListenSocket(comm, comm->magic,
+                                   &STATE_LISTEN(state, socket), &newRankAddr,
+                                   ncclSocketTypeBootstrap),
+                result, fail);
+  setNewRank(lhState, &newRankAddr, myRank);
+  if (queryNextRankAddrNew(lhState, &nextRankAddr) != 0) {
+    fprintf(stderr, "lighthouse: queryNextRankAddrNew failed");
+    result = ncclInternalError;
+    goto fail;
+  }
+  updateVersion(lhState);
+  printLhState(lhState);
+  if (txnSave(lhTxn, lhState) != 0) {
+    fprintf(stderr, "lighthouse: txnSave failed");
+    result = ncclInternalError;
+    goto fail;
+  }
+  if (txnEnd(lhTxn) != 0) {
+    fprintf(stderr, "lighthouse: txnEnd failed");
+    result = ncclInternalError;
+    goto fail;
+  }
+
+  // --- Bootstrap ring ------------------------------------------------------
+  INFO(NCCL_INIT, "Rank %d is waiting for connection from prev rank %d...\n", myRank, myRank - 1);
+  INFO(NCCL_INIT, "Rank %d is connecting to next rank %d...\n", myRank, 0);
+  NCCLCHECKGOTO(socketRingConnect(&nextRankAddr,
+                                  &STATE_RING(state, socket.send),
+                                  &STATE_LISTEN(state, socket),
+                                  &STATE_RING(state, socket.recv), comm->magic,
+                                  state->abortFlag),
+                result, fail);
+  INFO(NCCL_INIT, "Rank %d is connected...\n", myRank - 1);
+  INFO(NCCL_INIT, "Rank %d is connected...\n", 0);
+  timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
+
+  // --- Device and kernel setup ---------------------------------------------
+  CUDACHECKGOTO(cudaSetDevice(cudaDev), result, fail);
+  CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor,
+                                       cudaDevAttrComputeCapabilityMajor,
+                                       cudaDev),
+                result, fail);
+  CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor,
+                                       cudaDevAttrComputeCapabilityMinor,
+                                       cudaDev),
+                result, fail);
+  CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem,
+                                       cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                                       cudaDev),
+                result, fail);
+  cudaArch = 100 * archMajor + 10 * archMinor;
+
+  timers[TIMER_INIT_KERNELS] = clockNano();
+  NCCLCHECKGOTO(
+      ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes),
+      result, fail);
+  // Set the maximum kernel stack size of all kernels to avoid
+  // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
+  if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
+    TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zu", maxLocalSizeBytes);
+    CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes));
+  }
+  timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS];
+
+  // The communicator hash arrives over the ring send socket; a failed receive
+  // must abort the init instead of continuing with an unset hash.
+  NCCLCHECKGOTO(ncclSocketRecv(&STATE_RING(state, socket.send),
+                               &(comm->commHash), sizeof(uint64_t)),
+                result, fail);
+  state->nranks = comm->nRanks;
+  commIdHash = comm->commHash;
+  comm->rank = myRank;
+  INFO(NCCL_INIT,
+       "%s comm %p rank %d nRanks %d cudaDev %d nvmlDev %d busId %lx commId "
+       "0x%llx - Init START",
+       __func__, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev,
+       comm->busId, commIdHash);
+
+  // --- Communicator allocation, bootstrap and transports --------------------
+  timers[TIMER_INIT_ALLOC] = clockNano();
+  NCCLCHECKGOTO(commAlloc(comm, NULL, nRanks, myRank), result, fail);
+  timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
+
+  bootstrapTime = clockNano();
+  NCCLCHECKGOTO(bootstrapInitNew(comm, true), result, fail);
+  bootstrapTime = clockNano() - bootstrapTime;
+  timers[TIMER_INIT_BOOTSTRAP] = timers[TIMER_INIT_BOOTSTRAP] + bootstrapTime;
+  comm->cudaArch = cudaArch;
+
+  NCCLCHECKGOTO(initTransportsRank(comm, nullptr, timers), result, fail);
+
+  NCCLCHECKGOTO(ncclTunerPluginLoad(comm), result, fail);
+  if (comm->tuner) {
+    NCCLCHECKGOTO(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog,
+                                    &comm->tunerContext),
+                  result, fail);
+  }
+  comm->initState = ncclSuccess;
+  timers[TIMER_INIT_TOTAL] = clockNano() - timers[TIMER_INIT_TOTAL];
+
+  {
+    // "rest" is whatever part of the total is not covered by a named timer.
+    // Computed in floating point so that it cannot wrap around.
+    uint64_t accounted = 0;
+    for (int t = TIMER_INIT_KERNELS; t < TIMERS_INIT_COUNT; t++) {
+      accounted += timers[t];
+    }
+    double restSeconds =
+        ((double)timers[TIMER_INIT_TOTAL] - (double)accounted) / 1e9;
+    INFO(NCCL_INIT | NCCL_PROFILE,
+         "Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc "
+         "%.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
+         "connections %.2f, rest %.2f)",
+         "ncclCommInitNewRankFunc", comm->rank, comm->nRanks,
+         timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9,
+         timers[TIMER_INIT_ALLOC] / 1e9, timers[TIMER_INIT_BOOTSTRAP] / 1e9,
+         timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
+         timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9,
+         restSeconds);
+  }
+
+exit:
+  return result;
+fail:
+  comm->initState = result;
+  free(lhTxn);
+  free(lhState);
+  goto exit;
+}
+
+// Public entry point for a process that joins an existing communicator as a
+// new rank.
+//
+// newcomm  out: receives the freshly allocated communicator; set to NULL on
+//          failure. Must not be NULL.
+// nRanks   value stored in comm->nRanks before the job is launched; the job
+//          uses nRanks - 1 as this process's rank index, so it must be >= 1.
+//
+// Allocates the communicator and its abort flags for the current CUDA device,
+// applies the default configuration and launches ncclCommInitNewRankFunc
+// through ncclAsyncLaunch.
+//
+// Returns the result filtered through ncclGroupErrCheck; ncclInvalidArgument
+// for a NULL newcomm or nRanks < 1.
+NCCL_API(ncclResult_t, ncclCommInitNewRank, ncclComm_t *comm,
+         int nRanks);
+ncclResult_t ncclCommInitNewRank(ncclComm_t *newcomm,
+                                 int nRanks) {
+  ncclResult_t result = ncclSuccess;
+  int cudaDev = -1;
+  ncclComm_t comm = NULL;
+  struct ncclCommAddRankAsyncJob *job = NULL;
+  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+
+  // Load the CUDA driver and dlsym hooks (can fail on old drivers)
+  (void)ncclCudaLibraryInit();
+
+  // Reject bad arguments before anything is allocated; previously a NULL
+  // newcomm was dereferenced and a non-positive nRanks produced a negative
+  // rank index in the job.
+  if (newcomm == NULL) {
+    WARN("ncclCommInitNewRank: newcomm argument is NULL");
+    result = ncclInvalidArgument;
+    goto fail;
+  }
+  if (nRanks < 1) {
+    WARN("ncclCommInitNewRank: invalid number of ranks %d", nRanks);
+    result = ncclInvalidArgument;
+    goto fail;
+  }
+
+  CUDACHECKGOTO(cudaGetDevice(&cudaDev), result, fail);
+  // first call ncclInit, this will setup the environment
+  NCCLCHECKGOTO(ncclInit(), result, fail);
+
+  // Make sure the CUDA runtime is initialized.
+  CUDACHECKGOTO(cudaFree(NULL), result, fail);
+
+  NCCLCHECKGOTO(ncclCalloc(&comm, 1), result, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->abortFlag, 1), result, fail);
+  NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->abortFlagDev, 1), result, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->abortFlagRefCount, 1), result, fail);
+  // Used to detect comm corruption.
+  comm->startMagic = comm->endMagic = NCCL_MAGIC;
+  *comm->abortFlagRefCount = 1;
+  comm->cudaDev = cudaDev;
+  NCCLCHECKGOTO(parseCommConfig(comm, &config), result, fail);
+  /* start with ncclInternalError and will be changed to ncclSuccess if init
+   * succeeds. */
+  comm->initState = ncclInternalError;
+  comm->nRanks = nRanks;
+  *newcomm = comm;
+
+  NCCLCHECKGOTO(ncclCalloc(&job, 1), result, fail);
+  job->comm = comm;
+  NCCLCHECKGOTO(
+      ncclAsyncLaunch(&job->base, ncclCommInitNewRankFunc, NULL, free, comm),
+      result, fail);
+
+exit:
+  return ncclGroupErrCheck(result);
+fail:
+  if (comm) {
+    free(comm->abortFlag);
+    if (comm->abortFlagDev)
+      (void)ncclCudaHostFree((void *)comm->abortFlagDev);
+    free(comm->abortFlagRefCount);
+    free(comm);
+  }
+  if (newcomm)
+    *newcomm = NULL;
+  goto exit;
+}
+
+
+// Fills comm->buffSizes[] for every protocol and derives comm->p2pChunkSize.
+//
+// A buffer size parameter of -2 means "not set" and selects the matching
+// DEFAULT_* constant. The P2P chunk size is taken from the NET, NVL or PCI
+// parameter depending on comm->nNodes and comm->isAllNvlink, capped so that
+// NCCL_STEPS chunks fit into the NCCL_PROTO_SIMPLE buffer, and finally
+// reconciled with comm->sharedRes->tpP2pChunkSize.
+//
+// Always returns ncclSuccess.
+static ncclResult_t computeBuffSizes(struct ncclComm *comm) {
+  int64_t requested[NCCL_NUM_PROTOCOLS] = {
+      ncclParamLlBuffSize1(), ncclParamLl128BuffSize1(), ncclParamBuffSize1()};
+  int fallback[NCCL_NUM_PROTOCOLS] = {DEFAULT_LL_BUFFSIZE,
+                                      DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE};
+
+  for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; ++proto) {
+    if (requested[proto] == -2) {
+      comm->buffSizes[proto] = fallback[proto];
+    } else {
+      comm->buffSizes[proto] = requested[proto];
+    }
+  }
+
+  // Pick the chunk size parameter that matches the topology.
+  comm->p2pChunkSize = (comm->nNodes > 1)    ? ncclParamP2pNetChunkSize1()
+                       : (comm->isAllNvlink) ? ncclParamP2pNvlChunkSize1()
+                                             : ncclParamP2pPciChunkSize1();
+
+  // Make sure P2P chunksize is not larger than coll chunksize.
+  if (comm->p2pChunkSize * NCCL_STEPS > comm->buffSizes[NCCL_PROTO_SIMPLE]) {
+    comm->p2pChunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
+  }
+
+  if (comm->sharedRes->owner == comm) {
+    // The owner publishes its chunk size as the shared one.
+    comm->sharedRes->tpP2pChunkSize = comm->p2pChunkSize;
+  } else {
+    /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */
+    comm->p2pChunkSize =
+        std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize);
+  }
+
+  INFO(NCCL_INIT, "P2P Chunksize set to %d", comm->p2pChunkSize);
+  return ncclSuccess;
+}
+struct NvtxParamsCommInitRank { // Payload layout that CommInitRankSchema describes; not used by any function in this file.
+ int rank; // described by the "Rank" schema entry
+ int nRanks; // its offset is referenced by the "No. of ranks" schema entry
+ int cudaDev; // its offset is referenced by the "CUDA device" schema entry
+};
+constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = { // One entry per NvtxParamsCommInitRank field; not referenced elsewhere in this file.
+ {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, // first field: no explicit offset is given
+ {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0,
+ offsetof(NvtxParamsCommInitRank, nRanks)},
+ {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0,
+ offsetof(NvtxParamsCommInitRank, cudaDev)},
+};
+struct graphInfo { // Plain record of per-graph values; in this file it is only used as the element type of allGatherInfo::graphInfo.
+ int pattern;
+ int nChannels;
+ int sameChannels;
+ float bwIntra; // NOTE(review): presumably intra-node bandwidth - units not visible here, confirm
+ float bwInter; // NOTE(review): presumably inter-node bandwidth - units not visible here, confirm
+ int typeIntra;
+ int typeInter;
+ int crossNic;
+};
+struct allGatherInfo { // Not referenced by any function in this file. NOTE(review): presumably a per-rank record exchanged during transport setup - confirm.
+ struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS]; // one graphInfo per algorithm index
+ struct ncclTopoRanks topoRanks;
+ int cpuArch;
+ int cpuVendor;
+};
+
+
+// Propagates data_size bytes at `data` from rank 0 to all num_ranks ranks over
+// the bootstrap network, using tag BOOTSTRAP_TAG_ADD_RANK.
+//
+// The exchange proceeds in rounds with a stride that doubles each time
+// (1, 2, 4, ...) while the stride does not exceed the highest rank index:
+//   - a rank below the stride sends to rank + stride, if that rank exists;
+//   - a rank in [stride, 2 * stride) receives from rank - stride.
+// Rank 0 must hold the data on entry; every other rank must pass a buffer of
+// at least data_size bytes.
+//
+// NOTE(review): the results of bootstrapSend/bootstrapRecv are discarded and
+// the function returns void, so callers cannot detect a failed transfer.
+void broadcast(ncclComm_t comm, int my_rank, int num_ranks, void *data,
+               int data_size) {
+  const int last_rank = num_ranks - 1;
+
+  for (int stride = 1; stride <= last_rank; stride <<= 1) {
+    if (my_rank < stride) {
+      // This rank already holds the data: forward it if the target exists.
+      const int receiver = my_rank + stride;
+      if (receiver < num_ranks) {
+        bootstrapSend(comm->bootstrap, receiver, BOOTSTRAP_TAG_ADD_RANK, data,
+                      data_size);
+      }
+    } else if (my_rank - stride < stride) {
+      // This rank is served in the current round.
+      const int sender = my_rank - stride;
+      bootstrapRecv(comm->bootstrap, sender, BOOTSTRAP_TAG_ADD_RANK, data,
+                    data_size);
+    }
+  }
+}
+
+// Grows an existing communicator by one rank, seen from a rank that is already
+// part of it. The new rank gets index equal to the old comm->nRanks.
+//
+// Steps, in order:
+//   1. The current last rank waits for the lighthouse state to reach
+//      expectedVersion and reads the address of its new ring successor.
+//   2. comm->nRanks is incremented.
+//   3. Rank 0 accepts the ring connection from the new rank; the old last rank
+//      connects to the new rank.
+//   4. Rank 0 sends comm->commHash to the new rank over its ring recv socket.
+//   5. The communicator is re-allocated, bootstrap is re-initialized and the
+//      transports are updated for the new size.
+//
+// Returns ncclSuccess or the first error; ncclInvalidArgument for a NULL comm.
+// A lighthouse failure leaves comm->nRanks untouched; later failures leave it
+// incremented.
+//
+// NOTE(review): presumably every existing rank has to call this - confirm with
+// the callers. The meaning of the hard-coded expectedVersion (4) is not
+// visible in this file.
+// NOTE(review): lhTxn/lhState are released only on the failure path, exactly
+// as before; whether txnEnd already releases them is not visible here.
+NCCL_API(ncclResult_t, ncclCommAddNewRank, ncclComm_t comm);
+ncclResult_t
+ncclCommAddNewRank(ncclComm_t comm) {
+  ncclResult_t result = ncclSuccess;
+  struct bootstrapState *state = NULL;
+  uint64_t expectedVersion = 4;
+  uint64_t version = 0;
+  struct LhTxn *lhTxn = NULL;
+  struct LhState *lhState = NULL;
+  union ncclSocketAddress nextRankAddr;
+  uint64_t timers[TIMERS_INIT_COUNT] = {0};
+  int rank = 0;
+  int nranks = 0;
+
+  if (comm == NULL) {
+    WARN("ncclCommAddNewRank: comm argument is NULL");
+    return ncclInvalidArgument;
+  }
+  state = (struct bootstrapState *)comm->bootstrap;
+  rank = comm->rank;
+  nranks = comm->nRanks;  // size before the new rank is added
+
+  // Only the current last rank talks to the lighthouse: it needs the listen
+  // address the new rank has published there.
+  if (rank == nranks - 1) {
+    printf("Rank %d is waiting for lighthouse version %lu...\n", rank, expectedVersion);
+    if (txnWaitForVersion(LH_STATE_PATH, expectedVersion, /*timeout_ms*/-1) != 0) {
+      fprintf(stderr, "lighthouse: txnWaitForVersion failed");
+      result = ncclInternalError;
+      goto fail;
+    }
+    if (txnBegin(LH_STATE_PATH, 1, &lhTxn) != 0) {
+      fprintf(stderr, "lighthouse: txnBegin failed");
+      result = ncclInternalError;
+      goto fail;
+    }
+    if (txnLoad(lhTxn, &lhState) != 0) {
+      fprintf(stderr, "lighthouse: txnLoad failed");
+      result = ncclInternalError;
+      goto fail;
+    }
+    getVersion(lhState, &version);
+    if (version != expectedVersion) {
+      fprintf(stderr, "lighthouse: version mismatch, expected %lu but got %lu\n", expectedVersion, version);
+      result = ncclInternalError;
+      goto fail;
+    }
+    if (queryNextRankAddrLast(lhState, &nextRankAddr) != 0) {
+      fprintf(stderr, "lighthouse: queryNextRankAddrLast failed");
+      result = ncclInternalError;
+      goto fail;
+    }
+    printLhState(lhState);
+    if (txnSave(lhTxn, lhState) != 0) {
+      fprintf(stderr, "lighthouse: txnSave failed");
+      result = ncclInternalError;
+      goto fail;
+    }
+    if (txnEnd(lhTxn) != 0) {
+      fprintf(stderr, "lighthouse: txnEnd failed");
+      result = ncclInternalError;
+      goto fail;
+    }
+  }
+
+  // Nothing above reads comm->nRanks, so the size is bumped only once the
+  // lighthouse step can no longer fail.
+  comm->nRanks++;
+
+  if (rank == 0) {
+    INFO(NCCL_INIT, "Rank %d is waiting for connection from prev rank %d...\n", rank, comm->nRanks - 1);
+    NCCLCHECKGOTO(socketRingConnectNext(&STATE_LISTEN(state, socket),
+                                        &STATE_RING(state, socket.recv)),
+                  result, fail);
+    INFO(NCCL_INIT, "Rank %d is connected...\n", comm->nRanks - 1);
+  }
+  if (rank == nranks - 1) {
+    INFO(NCCL_INIT, "Rank %d is connecting to next rank %d...\n", rank, comm->nRanks - 1);
+    NCCLCHECKGOTO(socketRingConnectPrev(&nextRankAddr,
+                                        &STATE_RING(state, socket.send),
+                                        comm->magic, state->abortFlag),
+                  result, fail);
+    INFO(NCCL_INIT, "Rank %d is connected...\n", comm->nRanks - 1);
+  }
+
+  if (comm->rank == 0) {
+    // The new rank blocks on this value, so a failed send must be reported.
+    NCCLCHECKGOTO(ncclSocketSend(&STATE_RING(state, socket.recv),
+                                 &(comm->commHash), sizeof(uint64_t)),
+                  result, fail);
+  }
+  NCCLCHECKGOTO(commAllocNew(comm, nullptr, comm->nRanks, comm->rank), result,
+                fail);
+  NCCLCHECKGOTO(bootstrapInitNew(comm, false), result, fail);
+  NCCLCHECKGOTO(updateTransportsRank(comm, nullptr, timers), result, fail);
+
+ret:
+  return result;
+fail:
+  free(lhTxn);
+  free(lhState);
+  goto ret;
+}
+
+// Async job body launched by ncclCommSetupNewRank.
+//
+// Selects the communicator's CUDA device, then clears comm->connectRecv[] and
+// comm->connectSend[] for every peer other than this rank, logging each pair
+// of (now zero) masks. Several progress messages are emitted along the way.
+//
+// Returns ncclSuccess, or the error set by CUDACHECKGOTO when cudaSetDevice
+// fails (in which case no mask is touched).
+static ncclResult_t ncclCommSetupNewRankFunc(struct ncclAsyncJob *job_) {
+  ncclResult_t res = ncclSuccess;
+  ncclComm_t comm = ((struct ncclCommAddRankAsyncJob *)job_)->comm;
+  int cudaDev = comm->cudaDev;
+  int rank = comm->rank;
+
+  INFO(NCCL_INIT, "Rank %d magic %lu commHash %lu", rank, comm->magic,
+       comm->commHash);
+  INFO(NCCL_INIT, "ncclCommSetupNewRankFunc");
+  CUDACHECKGOTO(cudaSetDevice(cudaDev), res, done);
+  INFO(NCCL_INIT, "then ncclCommSetupNewRankFunc");
+
+  INFO(NCCL_INIT, "6668888888ncclCommSetupNewRankFunc");
+
+  printf("Rank %d is waiting to receive 1MB data...\n", rank);
+
+  // Walk the ring distances 1 .. nRanks-1: at distance `hop` this rank would
+  // receive from (rank - hop) and send to (rank + hop), both modulo nRanks.
+  for (int hop = 1; hop < comm->nRanks; ++hop) {
+    const int total = comm->nRanks;
+    const int from = (comm->rank - hop + total) % total;
+    const int to = (comm->rank + hop) % total;
+
+    comm->connectRecv[from] = 0x0;
+    comm->connectSend[to] = 0x0;
+
+    // Read the masks back so the log shows what is actually stored.
+    const uint64_t recvMask = comm->connectRecv[from];
+    const uint64_t sendMask = comm->connectSend[to];
+    INFO(NCCL_INIT,
+         "waini send i %d:两个十六进制值: 0x%" PRIx64 " 0x%" PRIx64 "\n", hop,
+         recvMask, sendMask);
+  }
+
+done:
+  return res;
+}
+
+// Public entry point that launches ncclCommSetupNewRankFunc for comm through
+// ncclAsyncLaunch. The job object is allocated here and handed to the launcher
+// together with `free` as its destructor.
+//
+// Returns ncclInvalidArgument for a NULL comm (the job would dereference it),
+// otherwise the result of the allocation / launch.
+NCCL_API(ncclResult_t, ncclCommSetupNewRank, ncclComm_t comm);
+ncclResult_t ncclCommSetupNewRank(ncclComm_t comm) {
+  ncclResult_t res = ncclSuccess;
+  struct ncclCommAddRankAsyncJob *job = NULL;
+
+  if (comm == NULL) {
+    WARN("ncclCommSetupNewRank: comm argument is NULL");
+    return ncclInvalidArgument;
+  }
+
+  NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
+  job->comm = comm;
+  NCCLCHECKGOTO(
+      ncclAsyncLaunch(&job->base, ncclCommSetupNewRankFunc, NULL, free, comm),
+      res, fail);
+
+exit:
+  return res;
+fail:
+  goto exit;
+}
--
2.43.0