* Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
* MindIE is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/
#include "request_single_block_manager.h"
#include <stdexcept>
#include <utility>
namespace mindie_llm {
// Builds the manager from `config` and creates the CPU/NPU block allocator it delegates to.
//
// The allocator is given `npuBlockNum - reservedBlockNum` NPU blocks, `cpuBlockNum` CPU blocks,
// and a block size of at least 1 (a configured size of 0 is replaced by 1).
//
// Throws std::invalid_argument when rankSize or hostSize is 0, when npuBlockNum is 0,
// or when reservedBlockNum is larger than npuBlockNum.
RequestSingleBlockManager::RequestSingleBlockManager(const BlockManagerConfig &config, size_t localDPRank)
    : blockSize_(config.cacheBlockSize),
      cpuBlockNum_(config.cpuBlockNum),
      npuBlockNum_(config.npuBlockNum),
      reservedBlockNum_(config.reservedBlockNum),
      enableCaching_(config.enableCaching),
      rankSize_(config.rankSize),
      hostSize_(config.hostSize),
      localDPRank_(localDPRank)
{
    // Validate the topology first, then the NPU block budget.
    const bool topologyValid = (rankSize_ != 0) && (hostSize_ != 0);
    if (!topologyValid) {
        throw std::invalid_argument("The rank and host size must be greater than 0");
    }
    if (npuBlockNum_ == 0) {
        throw std::invalid_argument("npuBlockNum must be > 0");
    }
    if (npuBlockNum_ < reservedBlockNum_) {
        throw std::invalid_argument("reservedBlockNum must be <= npuBlockNum");
    }

    AllocatorConfig allocCfg;
    if (enableCaching_) {
        allocCfg.allocatorType = BlockAllocatorType::PREFIXCACHING;
    } else {
        allocCfg.allocatorType = BlockAllocatorType::HASHLESS;
    }
    allocCfg.numCpuBlocks = cpuBlockNum_;
    // Reserved blocks are withheld from the allocator.
    allocCfg.numNpuBlocks = npuBlockNum_ - reservedBlockNum_;
    // A zero block size is replaced by 1 before it reaches the allocator.
    if (blockSize_ == 0) {
        allocCfg.blockSize = 1;
    } else {
        allocCfg.blockSize = blockSize_;
    }
    allocCfg.rankSize = rankSize_;
    allocCfg.hostSize = hostSize_;

    blockAllocator_ = std::make_shared<CpuNpuBlockAllocator>(allocCfg);
}
// Looks up the request id bound to `seqId`.
// Returns a pointer to the id stored inside seqId2RequestId_, or nullptr when the sequence is unknown.
// The pointer refers to the map element, so it stops being valid once that element is erased.
const RequestId *RequestSingleBlockManager::GetRequestIdBySeqId_(SequenceId seqId) const {
    const auto found = seqId2RequestId_.find(seqId);
    return (found != seqId2RequestId_.end()) ? &found->second : nullptr;
}
// Mutable lookup of the bookkeeping entry for request `rid`.
// Returns a pointer to the entry stored inside requestEntries_, or nullptr when no entry exists.
RequestSingleBlockManager::RequestEntry *RequestSingleBlockManager::GetEntryByRequestId_(const RequestId &rid) {
    const auto pos = requestEntries_.find(rid);
    const bool missing = (pos == requestEntries_.end());
    return missing ? nullptr : &pos->second;
}
// Read-only lookup of the bookkeeping entry for request `rid`.
// Returns a pointer to the entry stored inside requestEntries_, or nullptr when no entry exists.
const RequestSingleBlockManager::RequestEntry *RequestSingleBlockManager::GetEntryByRequestId_(
    const RequestId &rid) const {
    const auto pos = requestEntries_.find(rid);
    if (pos != requestEntries_.end()) {
        return &pos->second;
    }
    return nullptr;
}
// Reports whether Allocate() can currently succeed for `seqGroup`.
//
// Returns:
//   NEVER - `seqGroup` is null, it has no WAITING sequence, or the allocator owns no NPU block
//           at all (npuBlockNum <= reservedBlockNum).
//   OK    - the request already owns a block, or at least one NPU block is free.
//   LATER - a block is needed but none is free right now.
AllocStatus RequestSingleBlockManager::CanAllocate(const SequenceGroupSPtr &seqGroup) const {
    if (!seqGroup || seqGroup->GetFirstSequence(SequenceStatus::WAITING).empty()) {
        return AllocStatus::NEVER;
    }

    // An entry only counts as "already allocated" when it actually holds a block. Allocate(), Fork()
    // and GetBlockIds() all treat a block-less entry as unusable, and Allocate() would still have to
    // obtain a block for it, so such an entry must go through the capacity checks below.
    const auto *entry = GetEntryByRequestId_(seqGroup->requestId);
    if (entry != nullptr && entry->block) {
        return AllocStatus::OK;
    }

    // Compare instead of subtracting so the check cannot wrap if the counters are unsigned.
    if (npuBlockNum_ <= reservedBlockNum_) {
        return AllocStatus::NEVER;
    }

    const bool hasFreeNpuBlock = blockAllocator_->GetNumFreeBlock(DeviceType::NPU, 0) >= 1;
    return hasFreeNpuBlock ? AllocStatus::OK : AllocStatus::LATER;
}
// Ensures the request behind `seqGroup` owns one NPU block and binds the group's WAITING
// sequences to that request.
//
// Every newly bound sequence increments the entry's refCount. Sequences that are null or already
// present in seqId2RequestId_ are skipped.
//
// Returns false when `seqGroup` is null, when it has no WAITING sequence, when the allocator
// yields no block, or when no sequence ends up referencing the request's block. Returns true
// otherwise. Exceptions thrown by the allocator propagate to the caller.
bool RequestSingleBlockManager::Allocate(const SequenceGroupSPtr &seqGroup) {
    if (!seqGroup) {
        return false;
    }
    const auto waitingSeqs = seqGroup->GetFirstSequence(SequenceStatus::WAITING);
    if (waitingSeqs.empty()) {
        return false;
    }
    const RequestId &rid = seqGroup->requestId;

    const auto existing = requestEntries_.find(rid);
    const bool needsBlock = (existing == requestEntries_.end()) || !existing->second.block;
    if (needsBlock) {
        // Obtain the block BEFORE touching requestEntries_: if the allocator throws or returns an
        // empty handle, no block-less entry is left behind for CanAllocate()/CanAppendSlot() to
        // mistake for a live allocation.
        std::vector<TokenId> emptyTokens;
        auto block = blockAllocator_->AllocateMutableBlock(DeviceType::NPU, emptyTokens, nullptr, 0, 0);
        if (!block) {
            return false;
        }
        auto &fresh = requestEntries_[rid];
        fresh.block = std::move(block);
        fresh.refCount = 0;
    }

    auto &entry = requestEntries_[rid];
    for (const auto &seq : waitingSeqs) {
        if (!seq) {
            continue;
        }
        const SequenceId sid = seq->seqId_;
        if (seqId2RequestId_.find(sid) != seqId2RequestId_.end()) {
            continue;  // already bound; do not count it twice
        }
        seqId2RequestId_[sid] = rid;
        entry.refCount++;
    }

    if (entry.refCount == 0) {
        // Nothing references this entry, so Free(seqId) could never reach it and the block would
        // stay allocated forever. Release it now instead of leaking it.
        if (entry.block) {
            blockAllocator_->Free(entry.block);
        }
        requestEntries_.erase(rid);
        return false;
    }
    return true;
}
// Returns true when `seqGroup` is non-null and its request already has an entry in this manager.
bool RequestSingleBlockManager::CanAppendSlot(const SequenceGroupSPtr &seqGroup) const {
    const bool hasEntry = seqGroup && (GetEntryByRequestId_(seqGroup->requestId) != nullptr);
    return hasEntry;
}
// No-op in this manager: `seq` is ignored and an empty list of block pairs is returned.
std::vector<std::pair<BlockId, BlockId>> RequestSingleBlockManager::AppendSlot(const SequenceSPtr &seq) {
    static_cast<void>(seq);
    std::vector<std::pair<BlockId, BlockId>> noPairs;
    return noPairs;
}
// Same answer as CanAppendSlot(): forwards `seqGroup` unchanged.
bool RequestSingleBlockManager::CanAppendSlotNew(const SequenceGroupSPtr &seqGroup) const {
    const bool canAppend = CanAppendSlot(seqGroup);
    return canAppend;
}
// No-op in this manager: `seqGroup` is ignored.
void RequestSingleBlockManager::AppendSlotNew(const SequenceGroupSPtr &seqGroup) {
    static_cast<void>(seqGroup);
}
// No-op in this manager: both `seqId` and `tokens` are ignored.
void RequestSingleBlockManager::AppendTokenToLatestRank(SequenceId seqId, const std::vector<TokenId> &tokens) {
    static_cast<void>(seqId);
    static_cast<void>(tokens);
}
// Binds `childSeq` to the same request (and therefore the same block) as `parentSeq`,
// incrementing the entry's refCount for a newly bound child.
//
// Calling Fork again for a child that is already bound to the parent's request is a no-op.
//
// Throws std::invalid_argument when either pointer is null.
// Throws std::runtime_error when the parent is not bound to a request, when the request's entry
// or block is missing, or when the child is already bound to a different request.
void RequestSingleBlockManager::Fork(SequenceSPtr &parentSeq, SequenceSPtr &childSeq) {
    if (!parentSeq || !childSeq) {
        throw std::invalid_argument("parentSeq/childSeq cannot be null");
    }
    const RequestId *rid = GetRequestIdBySeqId_(parentSeq->seqId_);
    if (!rid) {
        throw std::runtime_error("parentSeq has no bound requestId in RequestSingleBlockManager::Fork");
    }
    auto *entry = GetEntryByRequestId_(*rid);
    if (!entry || !entry->block) {
        throw std::runtime_error("request entry missing in RequestSingleBlockManager::Fork");
    }

    const auto childIt = seqId2RequestId_.find(childSeq->seqId_);
    if (childIt != seqId2RequestId_.end()) {
        // Free(seqId) decrements refCount once per bound sequence id. Counting an already-bound
        // child again would keep refCount above zero forever and leak the block.
        if (GetEntryByRequestId_(childIt->second) == entry) {
            return;
        }
        // Overwriting the binding would strand the other request's refCount.
        throw std::runtime_error("childSeq is already bound to another request in RequestSingleBlockManager::Fork");
    }

    // Copy the id first: `rid` points into seqId2RequestId_, which is about to be modified.
    const RequestId requestId = *rid;
    seqId2RequestId_[childSeq->seqId_] = requestId;
    entry->refCount++;
}
// Always returns false; `seqGroup` is ignored.
bool RequestSingleBlockManager::CanSwapOut(const SequenceGroupSPtr &seqGroup) {
    static_cast<void>(seqGroup);
    constexpr bool kCanSwapOut = false;
    return kCanSwapOut;
}
// No-op in this manager: `seqGroup` is ignored and an empty list of block pairs is returned.
std::vector<std::pair<PhysicalBlockId, PhysicalBlockId>> RequestSingleBlockManager::SwapOut(
    const SequenceGroupSPtr &seqGroup) {
    static_cast<void>(seqGroup);
    std::vector<std::pair<PhysicalBlockId, PhysicalBlockId>> noMoves;
    return noMoves;
}
// Always returns AllocStatus::NEVER; both arguments are ignored.
AllocStatus RequestSingleBlockManager::CanSwapIn(const SequenceGroupSPtr &seqGroup, size_t numLookheadSlots) {
    static_cast<void>(seqGroup);
    static_cast<void>(numLookheadSlots);
    return AllocStatus::NEVER;
}
// No-op in this manager: `seqGroup` is ignored and an empty list of block pairs is returned.
std::vector<std::pair<PhysicalBlockId, PhysicalBlockId>> RequestSingleBlockManager::SwapIn(
    const SequenceGroupSPtr &seqGroup) {
    static_cast<void>(seqGroup);
    std::vector<std::pair<PhysicalBlockId, PhysicalBlockId>> noMoves;
    return noMoves;
}
// Unbinds `seqId` from its request and drops one reference on the request's entry.
// When the reference count reaches zero the block (if any) is returned to the allocator and
// the entry is removed. Unknown sequence ids are ignored.
void RequestSingleBlockManager::Free(SequenceId seqId) {
    const RequestId *boundRequest = GetRequestIdBySeqId_(seqId);
    if (boundRequest == nullptr) {
        return;
    }

    // Resolve the entry while `boundRequest` is still valid; it points into seqId2RequestId_
    // and must not be dereferenced after the erase below.
    const auto entryPos = requestEntries_.find(*boundRequest);
    seqId2RequestId_.erase(seqId);
    if (entryPos == requestEntries_.end()) {
        return;
    }

    RequestEntry &shared = entryPos->second;
    if (shared.refCount > 0) {
        shared.refCount--;
    }
    if (shared.refCount != 0) {
        return;  // other sequences still use the block
    }

    if (shared.block) {
        blockAllocator_->Free(shared.block);
    }
    requestEntries_.erase(entryPos);
}
// Returns the block id of the single block owned by the request that `seqId` is bound to,
// wrapped as one BlockIds element. Returns an empty vector when the sequence is unbound or its
// request has no entry or no block.
std::vector<BlockIds> RequestSingleBlockManager::GetBlockIds(SequenceId seqId) const {
    const RequestId *owner = GetRequestIdBySeqId_(seqId);
    const auto *entry = (owner != nullptr) ? GetEntryByRequestId_(*owner) : nullptr;
    const bool hasBlock = (entry != nullptr) && entry->block;
    if (!hasBlock) {
        return {};
    }
    return {{entry->block->GetBlockId()}};
}
// Fills `rankedBlockIds` with one RankedBlockId per rank (0 .. rankSize_-1), each carrying the
// sequence's single block id. The output is cleared first and stays empty when GetBlockIds()
// yields no block for `seqId`.
void RequestSingleBlockManager::GetRankedBlockIds(SequenceId seqId, std::vector<RankedBlockId> &rankedBlockIds) const {
    rankedBlockIds.clear();

    const auto perSeqIds = GetBlockIds(seqId);
    const bool noBlock = perSeqIds.empty() || perSeqIds[0].empty();
    if (noBlock) {
        return;
    }

    const auto &blockIds = perSeqIds[0];
    rankedBlockIds.reserve(rankSize_);
    size_t rank = 0;
    while (rank < rankSize_) {
        rankedBlockIds.push_back(RankedBlockId{blockIds[0], rank});
        ++rank;
    }
}
// Resets `rankedBlockIds` to rankSize_ empty lists, then appends the sequence's single block id
// to every list. The lists stay empty when GetBlockIds() yields no block for `seqId`.
void RequestSingleBlockManager::GetRankedBlockIds(SequenceId seqId,
                                                  std::vector<std::vector<BlockId>> &rankedBlockIds) const {
    rankedBlockIds.clear();
    rankedBlockIds.resize(rankSize_);

    const auto perSeqIds = GetBlockIds(seqId);
    const bool noBlock = perSeqIds.empty() || perSeqIds[0].empty();
    if (noBlock) {
        return;
    }

    // The output holds exactly rankSize_ lists at this point, one per rank.
    const auto &blockIds = perSeqIds[0];
    for (auto &idsOfRank : rankedBlockIds) {
        idsOfRank.push_back(blockIds[0]);
    }
}
// Always returns an empty result; `seqId` is ignored.
std::vector<std::vector<HashValue>> RequestSingleBlockManager::GetRankedHashValues(SequenceId seqId) const {
    static_cast<void>(seqId);
    std::vector<std::vector<HashValue>> noHashes;
    return noHashes;
}
// Always returns an empty result; `seqId` is ignored.
std::vector<HashValue> RequestSingleBlockManager::GetSeqHashValues(SequenceId seqId) const {
    static_cast<void>(seqId);
    std::vector<HashValue> noHashes;
    return noHashes;
}
// Returns rankSize_ counters, all zero; `seqId` is ignored.
std::vector<size_t> RequestSingleBlockManager::GetTokenCountPerRank(SequenceId seqId) const {
    static_cast<void>(seqId);
    std::vector<size_t> zeroCounts(rankSize_, 0);
    return zeroCounts;
}
// Always returns rank 0; `seqId` is ignored.
size_t RequestSingleBlockManager::GetLatestAppendedRankId(SequenceId seqId) const {
    static_cast<void>(seqId);
    const size_t rankId = 0;
    return rankId;
}
// Always returns rank 0; `seqId` is ignored.
size_t RequestSingleBlockManager::GetAppendedBlockRankId(SequenceId seqId) const {
    static_cast<void>(seqId);
    const size_t rankId = 0;
    return rankId;
}
// Always returns false; `seqId` is ignored.
bool RequestSingleBlockManager::IsAppendBlock(SequenceId seqId) {
    static_cast<void>(seqId);
    constexpr bool kAppended = false;
    return kAppended;
}
// Returns the allocator's free-block count for DeviceType::NPU (queried with second argument 0).
size_t RequestSingleBlockManager::GetNumFreeNpuBlocks() const {
    const size_t freeNpuBlocks = blockAllocator_->GetNumFreeBlock(DeviceType::NPU, 0);
    return freeNpuBlocks;
}
// Returns the allocator's free-block count for DeviceType::CPU.
size_t RequestSingleBlockManager::GetNumFreeCpuBlocks() const {
    const size_t freeCpuBlocks = blockAllocator_->GetNumFreeBlock(DeviceType::CPU);
    return freeCpuBlocks;
}
// No-op in this manager: `seq` and `accessTime` are ignored.
void RequestSingleBlockManager::AccessAllblocksInSeq(const SequenceSPtr &seq, float accessTime) {
    static_cast<void>(seq);
    static_cast<void>(accessTime);
}
// Always returns an empty list; `seqs` is ignored.
std::vector<BlockId> RequestSingleBlockManager::GetCommonComputedBlockIds(const std::vector<SequenceSPtr> &seqs) {
    static_cast<void>(seqs);
    std::vector<BlockId> noBlocks;
    return noBlocks;
}
// Always returns an empty list; `seqs` is ignored.
std::vector<size_t> RequestSingleBlockManager::GetAllrankComputedBlockNum(const std::vector<SequenceSPtr> &seqs) {
    static_cast<void>(seqs);
    std::vector<size_t> noCounts;
    return noCounts;
}
// Always returns an empty list; every argument is ignored.
std::vector<BlockId> RequestSingleBlockManager::GetRemoteComputedBlockIds(const std::vector<SequenceSPtr> &seqs,
                                                                          size_t computedLens, uint32_t tpSize,
                                                                          std::string modelName) {
    static_cast<void>(seqs);
    static_cast<void>(computedLens);
    static_cast<void>(tpSize);
    static_cast<void>(modelName);
    std::vector<BlockId> noBlocks;
    return noBlocks;
}
// Always returns an empty list; every argument is ignored and `computedBlocksNum` is left untouched.
std::vector<size_t> RequestSingleBlockManager::GetAllRankRemoteComputedBlockIds(const std::vector<SequenceSPtr> &seqs,
                                                                                std::vector<size_t> &computedBlocksNum,
                                                                                std::string modelName) {
    static_cast<void>(seqs);
    static_cast<void>(computedBlocksNum);
    static_cast<void>(modelName);
    std::vector<size_t> noCounts;
    return noCounts;
}
// No-op in this manager.
void RequestSingleBlockManager::MarkBlocksAsComputed() {
    // Intentionally empty.
}
// Forwards to the block allocator's GetPrefixCacheHitRate().
float RequestSingleBlockManager::GetPrefixCacheHitRate() const {
    const float hitRate = blockAllocator_->GetPrefixCacheHitRate();
    return hitRate;
}
// Forwards to the block allocator's ResetPrefixCache() and returns its result.
bool RequestSingleBlockManager::ResetPrefixCache() const {
    const bool resetDone = blockAllocator_->ResetPrefixCache();
    return resetDone;
}
// Always returns 0; `seq` is ignored.
size_t RequestSingleBlockManager::GetNumCachedTokens(const SequenceSPtr &seq) {
    static_cast<void>(seq);
    const size_t cachedTokens = 0;
    return cachedTokens;
}
// Always returns 0; `seq` is ignored.
size_t RequestSingleBlockManager::GetSeqNumCachedTokens(const SequenceSPtr &seq) {
    static_cast<void>(seq);
    const size_t cachedTokens = 0;
    return cachedTokens;
}
// No-op in this manager: all three arguments are ignored.
void RequestSingleBlockManager::ReplaceTrailingPlaceHolder(const SequenceSPtr &seq, size_t trailingPlaceHolderNum,
                                                           size_t replacedPlaceHolderNum) {
    static_cast<void>(seq);
    static_cast<void>(trailingPlaceHolderNum);
    static_cast<void>(replacedPlaceHolderNum);
}
// No-op in this manager: `lwdCloudConfig` and `localDPRank` are ignored.
void RequestSingleBlockManager::LwdInitCloudBlockManager(const BlockManagerConfig &lwdCloudConfig, size_t localDPRank) {
    static_cast<void>(lwdCloudConfig);
    static_cast<void>(localDPRank);
}
// Empties `rankedBlockIds`; `seqId` is ignored and nothing is added to the output.
void RequestSingleBlockManager::LwdGetCloudRankedBlockIds(SequenceId seqId,
                                                          std::vector<std::vector<BlockId>> &rankedBlockIds) const {
    static_cast<void>(seqId);
    rankedBlockIds.clear();
}
// Always returns rank 0; `seqId` is ignored.
size_t RequestSingleBlockManager::LwdGetCloudLatestAppendedRankId(SequenceId seqId) const {
    static_cast<void>(seqId);
    const size_t rankId = 0;
    return rankId;
}
// Always returns rank 0; `seqId` is ignored.
size_t RequestSingleBlockManager::LwdGetCloudAppendedBlockRankId(SequenceId seqId) const {
    static_cast<void>(seqId);
    const size_t rankId = 0;
    return rankId;
}
// Always returns an empty list; `seqId` is ignored.
std::vector<size_t> RequestSingleBlockManager::LwdGetCloudTokenCountPerRank(SequenceId seqId) const {
    static_cast<void>(seqId);
    std::vector<size_t> noCounts;
    return noCounts;
}
}