* Copyright (c) Huawei Technologies Co., Ltd. 2025-2026. All rights reserved.
* MindIE is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/
#ifndef MINDIE_LLM_RESPONSE_H
#define MINDIE_LLM_RESPONSE_H
#include <memory>
#include <vector>
#include "basic_types.h"
#include "request_id.h"
namespace mindie_llm {
/// Status codes carried by a response (see `Response::inferStatusFlag` and
/// `ResponseContent::finishReason`, both of which default to
/// `ITERATION_CONTINUE`).
///
/// Every enumerator has an explicit numeric value in the range 0..8. The
/// underlying type is spelled out as `int`, which is what a scoped enum uses
/// when no type is given.
/// NOTE(review): the numeric values are presumably relied upon outside this
/// header — confirm before renumbering or reordering.
enum class InferStatusType : int {
    ITERATION_CONTINUE   = 0,
    END_OF_SENTENCE      = 1,
    ABORT                = 2,
    EXECUTE_ERROR        = 3,
    ILLEGAL_INPUT        = 4,
    REACH_MAX_SEQ_LEN    = 5,
    REACH_MAX_OUTPUT_LEN = 6,
    PULL_KV_ERROR        = 7,
    RELEASE_KV_COMPLETE  = 8,
};
/// Transfer status codes carried by a response (see
/// `Response::transferStatusFlag`, which defaults to `NOT_TRANSFER`).
///
/// Every enumerator has an explicit numeric value in the range 0..4. The
/// underlying type is spelled out as `int`, which is what a scoped enum uses
/// when no type is given.
/// NOTE(review): the numeric values are presumably relied upon outside this
/// header — confirm before renumbering or reordering.
enum class TransferStatusType : int {
    NOT_TRANSFER         = 0,
    PUBLISH_KV_COMPLETE  = 1,
    PULL_KV_COMPLETE     = 2,
    RECOMPUTED_TRIGGERED = 3,
    PREFILL_COMPLETE     = 4,
};
/// One entry of `Response::responseContents`.
///
/// Plain data holder with no member functions. Every member has a default
/// member initializer, so a default-constructed instance has zero-valued
/// scalars, empty vectors, `isThinking == false` and
/// `finishReason == InferStatusType::ITERATION_CONTINUE`.
///
/// Members whose type is a project alias (`SequenceId`, `InstanceId`) keep
/// the `= 0` initializer form because the alias definitions are not visible
/// from this header.
/// NOTE(review): the meaning of the individual fields is only suggested by
/// their names here — confirm against the code that fills and reads them.
struct ResponseContent {
    // Identification.
    SequenceId seqId = 0;
    SequenceId parentSeqId = 0;

    // Status; starts out as ITERATION_CONTINUE.
    InferStatusType finishReason{InferStatusType::ITERATION_CONTINUE};
    size_t speculativeTokenNum{0};

    // Output tokens and log-probability data; all containers start empty.
    std::vector<TokenId> outTokenIds{};
    std::vector<float> outLogProbs{};
    double cumLogProb{0.0};
    int64_t truncationIndex{0};
    std::vector<TokenId> topLogProbTokenIds{};
    std::vector<float> topLogProbs{};

    // Block table, handler id and error code.
    // NOTE(review): presumably related to KV transfer between instances — confirm.
    std::vector<std::vector<int64_t>> srcBlockTable{};
    InstanceId singleLLMPrefillReqHandlerId = 0;
    int64_t pdErrorCode{0};

    bool isThinking{false};
};
/// Metrics bundled into a `Response` (see `Response::metrics`).
///
/// Aggregate of three counters, each defaulting to 0. Member order is part of
/// the aggregate-initialization interface and must not change.
/// NOTE(review): units (e.g. of `queueWaitTime`) are not visible from this
/// header — confirm with the producer.
struct MetricsToResponse {
    size_t batchSize{0};
    uint64_t queueWaitTime{0};
    uint64_t prefixCachedTokenNum{0};
};
/// Response record for a single request.
///
/// Must be constructed from a request id (there is no default constructor);
/// the id is copied into `reqId`. All other members start from their default
/// member initializers: empty `responseContents`, default `metrics`,
/// `isEos == false`, `ITERATION_CONTINUE` / `NOT_TRANSFER` status flags and
/// zero counters.
/// NOTE(review): the meaning of `iterTimes` and `numParallelTokens` is only
/// suggested by their names — confirm against the code that sets them.
struct Response {
    /// Stores a copy of @p reqId. `explicit` prevents implicit conversion
    /// from a bare request id to a `Response`.
    explicit Response(const RequestIdNew &reqId)
        : reqId(reqId)
    {
    }

    RequestIdNew reqId;
    std::vector<ResponseContent> responseContents{};
    MetricsToResponse metrics{};
    bool isEos = false;
    InferStatusType inferStatusFlag{InferStatusType::ITERATION_CONTINUE};
    TransferStatusType transferStatusFlag{TransferStatusType::NOT_TRANSFER};
    uint32_t iterTimes = 0U;
    size_t numParallelTokens{0};
};
/// Shared-ownership handle to a `Response`.
using ResponseSPtr = std::shared_ptr<Response>;
}
#endif