* Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
* MindIE is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/
#ifndef SEQUENCE_GROUP_H
#define SEQUENCE_GROUP_H
#include <atomic>
#include <chrono>
#include <cstdint>
#include <ctime>
#include <memory>
#include <optional>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "basic_types.h"
#include "concurrent_map.h"
#include "request_response/request_id.h"
#include "sampling.h"
#include "sequence.h"
namespace mindie_llm {
/// Sentinel time point equal to the smallest value representable by
/// std::chrono::high_resolution_clock::time_point. It is the default value of
/// RequestMetrics::responseTime_.
///
/// Declared `inline` so that all translation units including this header share
/// one definition instead of each receiving its own internal-linkage copy.
inline constexpr std::chrono::high_resolution_clock::time_point INVALID_TIME =
    std::chrono::high_resolution_clock::time_point::min();
/// Per-request timing and cache counters.
struct RequestMetrics {
    RequestIdNew inferReqId_;
    time_t arrivalTime_ = 0;
    TimeStamp firstTokenTime_{};
    uint64_t queueWaitTime_ = 0;
    // Holds INVALID_TIME until another value is assigned.
    std::chrono::high_resolution_clock::time_point responseTime_ = INVALID_TIME;
    uint64_t prefixCachedTokenNum_ = 0;
};

/// Shared-ownership handle to a RequestMetrics.
using RequestMetricsSPtr = std::shared_ptr<RequestMetrics>;
// Groups the sequences that belong to one request together with the sampling
// parameters, metrics, timing data and bookkeeping flags carried alongside
// them. Member functions are only declared here; their definitions live
// outside this header.
// NOTE(review): field meanings marked "presumably" are inferred from names
// and should be confirmed against the implementation.
struct SequenceGroup {
// Identifier of the request this group was created for.
RequestId requestId;
// Sequences held by this group.
std::vector<SequenceSPtr> seqs_;
// Presumably the first entry of seqs_ — TODO confirm in the constructor.
SequenceSPtr firstSeq;
// Shared sampling parameters; null unless set by a constructor or caller.
SamplingParamsSPtr sampling;
// Metrics stored by value inside the group.
RequestMetrics metrics_;
// No in-class initializer: std::chrono::time_point's default constructor
// yields the clock epoch unless a constructor assigns another value.
std::chrono::time_point<std::chrono::high_resolution_clock> arriveTime;
std::chrono::time_point<std::chrono::high_resolution_clock> lastCompletionTime;
uint64_t iterTimes{0};
uint64_t priority_{0};
// Defaults to 1024.
uint64_t maxOutputLen_{1024};
bool isSynchronous_{false};
// Presumably block table / instance id of a remote prefill instance used for
// KV transfer — TODO confirm.
std::vector<BlockIds> pBlockTable{};
InstanceId pInstanceId = 0;
uint64_t dpInstanceId_ = 0;
// Atomic flag, starts false.
std::atomic<bool> isKVPulled{false};
uint64_t maxIterTimes_ = 0;
// Empty optional means no value has been stored.
std::optional<bool> skipSpecialTokens_;
std::optional<bool> ignoreEos_;
// Maps a sequence id to another SequenceGroup (shared ownership).
ConcurrentMap<SequenceId, std::shared_ptr<SequenceGroup>> seqId2ParallelSeqGroup_;
bool isNewSeqGroup_{false};
bool needUpdate_{false};
bool isLastChunk_{false};
// Defaults to -1; presumably meaning "no parent sequence" — TODO confirm.
SequenceId parentSeqId_{-1};
std::vector<BlockIds> parentBlockIds_{};
// Empty optional means no LoRA id has been stored.
std::optional<std::string> loraId_;
std::vector<TokenId> prefillReplayTokenIds_{};
// Defaults to -1.
WaveId waveId_{-1};
size_t rankId_ = 0;
// Constructors. NOTE(review): tRequestId is taken by non-const lvalue
// reference, so temporaries and const RequestId objects cannot be passed.
SequenceGroup(RequestId &tRequestId, const std::vector<SequenceSPtr> &tSeqs);
SequenceGroup(RequestId &tRequestId, const std::vector<SequenceSPtr> &tSeqs, const SamplingParamsSPtr &tSampling);
SequenceGroup(RequestId &tRequestId, const std::vector<SequenceSPtr> &tSeqs, const SamplingParamsSPtr &tSampling,
const std::optional<std::string> &tLoraId, size_t tRankId);
~SequenceGroup();
// Sequence accessors taking a status argument.
// NOTE(review): GetSequences defaults `status` to a value-initialized
// SequenceStatus ({}), whereas the sibling accessors default to
// SequenceStatus::ALL_STATUS — confirm these are equivalent or intentional.
std::vector<SequenceSPtr> GetFirstSequence(const SequenceStatus status = SequenceStatus::ALL_STATUS);
std::vector<SequenceSPtr> GetSequences(const SequenceStatus status = {});
std::vector<SequenceSPtr> GetParallelSequences(const SequenceStatus status = SequenceStatus::ALL_STATUS) const;
std::vector<std::shared_ptr<SequenceGroup>> GetParallelSeqGrp();
void UpdateNumComputedTokens(size_t numNewComputedTokens);
// Const queries; [[nodiscard]] makes the compiler warn when the result is ignored.
[[nodiscard]] int GetMaxNumRunningSeqs() const;
[[nodiscard]] bool IsPrefill() const;
[[nodiscard]] bool IsLayerwisePrefill() const;
[[nodiscard]] bool IsFinished() const;
[[nodiscard]] bool IsSimulateRequest() const;
bool isDecode_{false};
bool isFlexLocal_{false};
int32_t requestGap_{0};
// No in-class initializer; default-constructed to the clock epoch.
std::chrono::time_point<std::chrono::high_resolution_clock> recomputeArriveTime_;
uint32_t topLogProbs_{0};
// Thinking-related state; all flags start false and counters start at zero.
bool enableThinking_{false};
uint32_t thinkingBudget_{0};
bool exceededThinkingbudget_{false};
bool isThinking_ = false;
size_t thinkingTokens = 0;
};
// Shared-ownership handle to a SequenceGroup.
using SequenceGroupSPtr = std::shared_ptr<SequenceGroup>;
/// Pairs a sequence group with a token chunk size.
struct ScheduledSequenceGroup {
    SequenceGroupSPtr seqGroup_ = nullptr;
    size_t tokenChunkSize_ = 0;

    /// Leaves seqGroup_ null and tokenChunkSize_ at zero.
    ScheduledSequenceGroup() = default;

    /// Defined outside this header; enableChunked defaults to false.
    ScheduledSequenceGroup(const SequenceGroupSPtr &tSeqGroup, const size_t tTokenChunkSize,
                           bool enableChunked = false);
};

/// Shared-ownership handle to a ScheduledSequenceGroup.
using ScheduledSequenceGroupSPtr = std::shared_ptr<ScheduledSequenceGroup>;
/// Forward-pass mode stored in SchedulerOutputs::forwardMode_.
/// Enumerator values are spelled out; they match the implicit numbering.
enum class ForwardMode : int {
    PREFILL = 0,
    DECODE = 1,
    EXTEND = 2,
    MIXED = 3,
    DUMMY = 4,
};
/// Result record of a scheduling step: the scheduled and ignored groups,
/// counters, block-id pair lists and the forward mode.
struct SchedulerOutputs {
    std::vector<ScheduledSequenceGroupSPtr> scheduledSeqGroups_;
    std::vector<SequenceGroupSPtr> ignoredSeqGroups_;
    std::vector<SequenceId> recomputeSeqIds_;

    // Counters, all starting at zero.
    size_t numPrefillGroups_ = 0;
    size_t numBatchedTokens_ = 0;
    size_t numPreempted_ = 0;
    size_t runningQueueSize_ = 0;

    // Lists of block-id pairs for swap-in, swap-out and copy.
    std::vector<std::pair<BlockId, BlockId>> blocksToSwapIn_;
    std::vector<std::pair<BlockId, BlockId>> blocksToSwapOut_;
    std::vector<std::pair<BlockId, BlockId>> blocksToCopy_;

    // Defaults to DECODE.
    ForwardMode forwardMode_ = ForwardMode::DECODE;

    /// Defined outside this header.
    bool IsEmpty();

    uint32_t curWaitQueueLen_ = 0;
};
/// Output record for KV transfer scheduling: a set of pulled sequence ids and
/// a list of sequence groups to pull.
struct SchedulerKVTransferOutput {
    std::unordered_set<SequenceId> pulledSeqIds{};
    std::vector<ScheduledSequenceGroupSPtr> pullSeqGroups{};

    /// Defined outside this header.
    bool IsEmpty();
};
}
#endif