// Schema for the prefillAndDecodeCommunication package (proto3 syntax).
syntax="proto3";
import "google/protobuf/empty.proto"; // Provides google.protobuf.Empty, used as the reply type of ReleaseKVCacheChannel.
// Java codegen: emit each message/service as its own top-level class instead
// of nesting everything inside the outer class.
option java_multiple_files=true;
// Java package and outer class name used for the generated sources.
option java_package="io.grpc.example.grpcMsg";
option java_outer_classname="grpcMsgProto";
// Prefix prepended to generated Objective-C class names.
option objc_class_prefix="HLW";

// Protobuf package; it qualifies every message and service name declared below.
package prefillAndDecodeCommunication;

// Service exposing the DecodeRequestChannel RPC.
service DecodeService {
    // Unary call: accepts one DecodeParameters message and replies with one
    // DecodeRequestResponse (a validity flag plus an error message).
    rpc DecodeRequestChannel(DecodeParameters) returns (DecodeRequestResponse);
}

// Service exposing the ReleaseKVCacheChannel RPC.
service PrefillService {
    // Unary call keyed by a RequestId. The reply is google.protobuf.Empty, so
    // no payload is returned to the caller.
    // NOTE(review): presumably releases the KV cache held for that request;
    // confirm against the server implementation.
    rpc ReleaseKVCacheChannel(RequestId) returns (google.protobuf.Empty);
}

// Reply message of DecodeService.DecodeRequestChannel.
message DecodeRequestResponse {
    // NOTE(review): presumably true when the received DecodeParameters were
    // accepted; confirm against the server. Proto3 default is false.
    bool isValidDecodeParameters = 1;
    // Error description. NOTE(review): presumably only populated on failure;
    // confirm. Proto3 default is the empty string.
    string errorMessage = 2;
}

// A list of block ids. It is the element type of the repeated
// DecodeParameters.blockTable field, which makes that field a list of lists.
message BlockIds {
    repeated int64 blockId = 1;
}

// Request message of DecodeService.DecodeRequestChannel.
// Field numbers identify fields on the wire: never renumber a field and never
// reuse the number or name of a removed one.
message DecodeParameters {
    string reqId = 1;
    repeated int64 tokens = 2;
    repeated int64 firstToken = 3; // repeated because prefill may run inference more than once
    uint32 msgType = 4; // RESTful interface type

    SamplingParams samplingParams = 5; // inferParam.h
    InferParams inferParams = 6; // optional parameter; optional parameters consistently get their own message
    InputId inputId = 7; // optional parameter; optional parameters consistently get their own message
    TextInput textInput = 8;
    bool isStream = 9;
    int32 maxNewToken = 10;
    bool details = 11;
    // Field 12 (perfStat) was deprecated and removed. Both its number and its
    // name are reserved so neither can be reassigned with a different meaning
    // (the name matters for JSON/text-format data and generated accessors).
    reserved 12;
    reserved "perfStat";
    bool returnFullText = 13; // tgi
    bool decoderInputDetails = 14; // tgi
    int32 batchSize = 15;
    string id = 16; // triton
    uint32 truncate = 17;
    string tools = 18;
    string toolChoice = 19;
    string loraId = 20;
    string modelName = 21;
    Metrics metrics = 22;
    repeated string outputNames = 23;
    uint32 prevdecodeindex = 24;
    uint32 currentdecodeindex = 25;

    // One BlockIds entry per row; each entry holds its own list of block ids.
    repeated BlockIds blockTable = 26;
    string pNodeAddr = 27;
    uint32 pInstanceId = 28;
    // NOTE(review): time unit and epoch are not visible in this file; confirm with the producer.
    uint64 e2eStartTime = 29;
    string postsingletext = 30;
    repeated uint64 dpInstanceIds = 31;
    bool useToolCall = 32;
    uint32 prefillTokenNum = 33;
    uint32 preOutputTokenNum = 34;
}

// Sampling-related parameters carried in DecodeParameters.samplingParams.
// Almost every parameter is held in its own wrapper message, so "not provided"
// is represented by the wrapper field being absent.
message SamplingParams {
    Temperature temperature = 1; // optional parameter; optional parameters consistently get their own message
    TopK topK = 2;
    TopP topP = 3;
    TypicalP typicalP = 4;
    DoSample doSample = 5;
    Seed seed = 6;
    RepetitionPenalty repetitionPenalty = 7;
    Watermark watermark = 8;
    FrequencyPenalty frequencyPenalty = 9;
    PresencyPenalty presencyPenalty = 10;
    LengthPenalty lengthPenalty = 11; // this parameter is deprecated
    StopTokenIds stopTokenIds = 12;
    StopStrings stopStrings = 13;
    SkipSpecialTokens skipSpecialTokens = 14;
    IncludeStopStrInOutput includeStopStrInOutput = 15;
    IgnoreEos ignoreEos = 16;
    Logprobs logprobs = 17;
    TopLogprobs topLogprobs = 18;
    EnableThinking enableThinking = 19;
    ThinkingBudget thinkingBudget = 20;
    // Unlike its siblings this is a plain `optional` scalar rather than a wrapper message.
    optional string responseFormat = 21;
    IsThinking isThinking = 22;
}
// Wrapper for SamplingParams.temperature. `optional` gives the float explicit
// presence, so an unset value can be told apart from 0.
message Temperature {
    optional float value = 1;
}

// Wrapper for SamplingParams.topK. `optional` gives the uint32 explicit
// presence, so an unset value can be told apart from 0.
message TopK {
    optional uint32 value = 1;
}

// Wrapper for SamplingParams.topP. `optional` gives the float explicit
// presence, so an unset value can be told apart from 0.
message TopP {
    optional float value = 1;
}

// Wrapper for SamplingParams.typicalP. `optional` gives the float explicit
// presence, so an unset value can be told apart from 0.
message TypicalP {
    optional float value = 1;
}

// Wrapper for SamplingParams.doSample. `optional` gives the bool explicit
// presence, so an unset value can be told apart from false.
message DoSample {
    optional bool value = 1;
}

// Wrapper for SamplingParams.seed. `optional` gives the uint64 explicit
// presence, so an unset value can be told apart from 0.
message Seed {
    optional uint64 value = 1;
}

// Wrapper for SamplingParams.repetitionPenalty. `optional` gives the float
// explicit presence, so an unset value can be told apart from 0.
message RepetitionPenalty {
    optional float value = 1;
}

// Wrapper for SamplingParams.watermark. `optional` gives the bool explicit
// presence, so an unset value can be told apart from false.
message Watermark {
    optional bool value = 1;
}

// Wrapper for SamplingParams.frequencyPenalty. `optional` gives the float
// explicit presence, so an unset value can be told apart from 0.
message FrequencyPenalty {
    optional float value = 1;
}

// Wrapper for SamplingParams.presencyPenalty. `optional` gives the float
// explicit presence, so an unset value can be told apart from 0.
// NOTE(review): "Presency" looks like a misspelling of "Presence"; renaming
// the message would change generated type names, so it is left as is.
message PresencyPenalty {
    optional float value = 1;
}

// Wrapper for SamplingParams.lengthPenalty, a field whose own comment marks it
// as deprecated. `optional` gives the float explicit presence.
message LengthPenalty {
    optional float value = 1;
}

// Wrapper for SamplingParams.stopTokenIds. A repeated field has no presence of
// its own (empty == unset); wrapping it in a message lets the enclosing field
// express "not provided" separately from "provided but empty".
message StopTokenIds {
    repeated int64 value = 1;
}

// Wrapper for SamplingParams.stopStrings. It can carry a single string with
// explicit presence (`value`) and/or a list of strings (`list`).
// NOTE(review): how consumers combine `value` and `list` is not visible in
// this file; confirm against the reader.
message StopStrings {
    optional string value = 1;
    repeated string list = 2;
}

// Wrapper for SamplingParams.skipSpecialTokens. `optional` gives the bool
// explicit presence, so an unset value can be told apart from false.
message SkipSpecialTokens {
    optional bool value = 1;
}

// Wrapper for SamplingParams.includeStopStrInOutput. `optional` gives the bool
// explicit presence, so an unset value can be told apart from false.
message IncludeStopStrInOutput {
    optional bool value = 1;
}

// Wrapper for SamplingParams.ignoreEos. `optional` gives the bool explicit
// presence, so an unset value can be told apart from false.
message IgnoreEos {
    optional bool value = 1;
}

// Wrapper for SamplingParams.logprobs. `optional` gives the bool explicit
// presence, so an unset value can be told apart from false.
message Logprobs {
    optional bool value = 1;
}

// Wrapper for SamplingParams.topLogprobs. `optional` gives the uint64 explicit
// presence, so an unset value can be told apart from 0.
message TopLogprobs {
    optional uint64 value = 1;
}

// Wrapper for SamplingParams.enableThinking. `optional` gives the bool
// explicit presence, so an unset value can be told apart from false.
message EnableThinking {
    optional bool value = 1;
}

// Wrapper for SamplingParams.thinkingBudget. `optional` gives the uint32
// explicit presence, so an unset value can be told apart from 0.
// NOTE(review): the unit of the budget is not stated in this file; confirm.
message ThinkingBudget {
    optional uint32 value = 1;
}

// Wrapper for SamplingParams.isThinking. `optional` gives the bool explicit
// presence, so an unset value can be told apart from false.
message IsThinking {
    optional bool value = 1;
}

// Inference parameters carried in DecodeParameters.inferParams.
// Both fields use implicit presence, so 0 cannot be told apart from "not set"
// inside this message; absence is expressed by omitting the whole message.
message InferParams {
    uint64 priority = 1;
    // NOTE(review): the time unit is not stated in this file; confirm with the consumer.
    uint64 timeout = 2;
}

// Wrapper for DecodeParameters.inputId. `optional` gives the string explicit
// presence, so an unset value can be told apart from the empty string.
message InputId {
    optional string value = 1;
}

// Wrapper for DecodeParameters.textInput. `optional` gives the string explicit
// presence, so an unset value can be told apart from the empty string.
message TextInput {
    optional string value = 1;
}

// Metrics carried in DecodeParameters.metrics.
// NOTE(review): units of the cost/time fields, and whether the repeated fields
// are meant to be aligned by index, are not stated in this file; confirm with
// the producer before relying on either.
message Metrics {
    uint64 firstTokenCost = 1;
    uint64 lastTokenCost = 2;
    repeated uint64 decodeTime = 3;
    repeated int64 batchSize = 4;
    repeated int64 queueWaitTime = 5;
    repeated int64 prefixCachedTokenNums = 6;
    uint64 callbackIndex = 7;
}

// Request message of PrefillService.ReleaseKVCacheChannel; it carries only the
// identifier of the request concerned.
message RequestId {
    // Same field name as DecodeParameters.reqId.
    // NOTE(review): presumably the same identifier; confirm.
    string reqId = 1;
}