// MindIE-LLM/src/include/request_response/response.h - MindIE-LLM: large language model inference acceleration suite for Ascend hardware

/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2025-2026. All rights reserved.
 * MindIE is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *          http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#ifndef MINDIE_LLM_RESPONSE_H
#define MINDIE_LLM_RESPONSE_H

#include <memory>
#include <vector>

#include "basic_types.h"
#include "request_id.h"

namespace mindie_llm {

// Inference status values stored in Response.inferStatusFlag
// (also used as ResponseContent.finishReason).
enum class InferStatusType {
    // The request keeps iterating.
    ITERATION_CONTINUE = 0,

    // The request finished normally.
    END_OF_SENTENCE = 1,

    // The request was actively CANCELed or STOPped; this is transparent to the user
    // and the response is discarded.
    ABORT = 2,

    // An error occurred while executing the request; the response output is empty
    // and err_msg is non-empty.
    EXECUTE_ERROR = 3,

    // Input validation of the request failed; the response output is empty
    // and err_msg is non-empty.
    ILLEGAL_INPUT = 4,

    // The request ended because the maximum sequence length was reached;
    // the response is the output of the last iteration.
    REACH_MAX_SEQ_LEN = 5,

    // The request ended because the maximum output length (at both request and
    // model granularity) was reached; the response is the output of the last iteration.
    REACH_MAX_OUTPUT_LEN = 6,

    // The request ended because pulling the KV failed;
    // the response is the output of the last iteration.
    PULL_KV_ERROR = 7,

    // The request releases its KV cache.
    RELEASE_KV_COMPLETE = 8,
};

// PD transfer status values stored in Response.transferStatusFlag.
enum class TransferStatusType {
    NOT_TRANSFER = 0,

    // KV Cache is ready for publish on the P instance; notify the D instance to pull it.
    PUBLISH_KV_COMPLETE = 1,

    // The D instance has completed pulling the KV Cache.
    PULL_KV_COMPLETE = 2,

    // Recompute is triggered.
    RECOMPUTED_TRIGGERED = 3,

    // The P instance has completed prefill; respond with the results.
    PREFILL_COMPLETE = 4,
};

// Response payload for one sequence. Every member has a default initializer,
// so a default-constructed ResponseContent is fully initialized.
struct ResponseContent {
    // Id of this sequence.
    SequenceId seqId{0};
    // seqId of the parent of this sequence in Parallel Sampling.
    SequenceId parentSeqId{0};
    // Finish reason of this sequence; defaults to ITERATION_CONTINUE.
    InferStatusType finishReason{InferStatusType::ITERATION_CONTINUE};

    // Used to truncate tokens, in case there are redundant items.
    size_t speculativeTokenNum{0};
    // Tokens generated by Speculative Decoding.
    std::vector<TokenId> outTokenIds{};
    // logProb values corresponding to outTokenIds.
    std::vector<float> outLogProbs{};
    // NOTE(review): presumably the cumulative logProb of the sequence - confirm with the producer.
    double cumLogProb{0.0};
    // NOTE(review): meaning of this index is not visible in this header - confirm with the producer.
    int64_t truncationIndex{0};

    // The sizes of the following two items are the same as top_logprobs
    // (an input parameter of the user request).
    // Tokens with the highest logProbs.
    std::vector<TokenId> topLogProbTokenIds{};
    // logProb values corresponding to topLogProbTokenIds.
    std::vector<float> topLogProbs{};

    // The following items are used for PD disaggregation.
    // Block table of the prefill instance.
    std::vector<std::vector<int64_t>> srcBlockTable{};
    // Instance id of the prefill instance.
    InstanceId singleLLMPrefillReqHandlerId{0};
    // NOTE(review): PD error code; the value set is not defined in this header.
    int64_t pdErrorCode{0};
    // NOTE(review): presumably marks output produced in a "thinking" phase - confirm with the producer.
    bool isThinking{false};
};

// Metrics carried with a response; feeds server.endpoint.endpoint_def.Metrics.
// All counters start at zero.
struct MetricsToResponse {
    // Batch size (the exact scope of the batch is not stated in this header).
    size_t batchSize{0};
    // Queue wait time; the unit is not specified in this header.
    uint64_t queueWaitTime{0};
    // Number of prefix-cached tokens.
    uint64_t prefixCachedTokenNum{0};
};

// Response of one request. Construction requires the request id; every other
// member starts from its default initializer.
struct Response {
    // Copies the given request id into reqId.
    explicit Response(const RequestIdNew &requestId) : reqId(requestId) {}

    RequestIdNew reqId;
    // A vector is used for compatibility with Parallel Sampling.
    std::vector<ResponseContent> responseContents{};
    MetricsToResponse metrics;
    bool isEos{false};
    // Denotes infer status, refer to InferStatusType.
    InferStatusType inferStatusFlag{InferStatusType::ITERATION_CONTINUE};
    // Denotes PD transfer status, refer to TransferStatusType.
    TransferStatusType transferStatusFlag{TransferStatusType::NOT_TRANSFER};
    // Inference iterations that this request has gone through.
    uint32_t iterTimes{0U};
    size_t numParallelTokens{0};
};

// Alias for a std::shared_ptr that holds a Response.
using ResponseSPtr = std::shared_ptr<Response>;
}  // namespace mindie_llm
#endif  // MINDIE_LLM_RESPONSE_H