// Protocol Buffers schema written in proto3 syntax.
syntax = "proto3";

// Provides google.protobuf.Empty, the response type of ReleaseKVCacheChannel.
import "google/protobuf/empty.proto";

// Java code generation: one source file per top-level type, placed in the
// given Java package, with the given name for the file-level outer class.
option java_multiple_files = true;
option java_package = "io.grpc.example.grpcMsg";
option java_outer_classname = "grpcMsgProto";

// Objective-C code generation: prefix applied to generated class names.
option objc_class_prefix = "HLW";

// Proto package that namespaces the services and messages of this file.
package prefillAndDecodeCommunication;
// RPC service exposing a single unary method.
service DecodeService {
  // Unary RPC: accepts a DecodeParameters message and replies with a
  // DecodeRequestResponse.
  rpc DecodeRequestChannel(DecodeParameters) returns (DecodeRequestResponse);
}
// RPC service exposing a single unary method.
service PrefillService {
  // Unary RPC: accepts a RequestId and replies with google.protobuf.Empty,
  // i.e. the response carries no payload.
  // NOTE(review): judging by the method name this presumably asks the callee
  // to release the KV cache held for the given request -- confirm against the
  // service implementation.
  rpc ReleaseKVCacheChannel(RequestId) returns (google.protobuf.Empty);
}
// Response message of DecodeService.DecodeRequestChannel.
message DecodeRequestResponse {
  // Validity flag for the submitted DecodeParameters (per the field name).
  // Implicit presence: an unset field reads as false.
  bool isValidDecodeParameters = 1;

  // Free-form error text.
  // NOTE(review): presumably only meaningful when isValidDecodeParameters is
  // false -- confirm with the server code.
  string errorMessage = 2;
}
// A list of 64-bit block ids. Used as the element type of
// DecodeParameters.blockTable, which makes that field a list of id lists.
message BlockIds {
  repeated int64 blockId = 1;
}
// Request message of DecodeService.DecodeRequestChannel.
//
// Field numbers are the wire contract: never renumber or reuse them.
// NOTE(review): the grouping headers below are derived from field names only;
// confirm exact semantics against the sender/receiver code.
message DecodeParameters {
  // --- Request identity and token data ---
  string reqId = 1;
  repeated int64 tokens = 2;
  repeated int64 firstToken = 3;
  uint32 msgType = 4;

  // --- Nested parameter groups (see the corresponding message types) ---
  SamplingParams samplingParams = 5;
  InferParams inferParams = 6;
  InputId inputId = 7;
  TextInput textInput = 8;

  // --- Generation / response options ---
  bool isStream = 9;
  int32 maxNewToken = 10;
  bool details = 11;
  // Field number 12 is retired; it stays reserved so it can never be reused.
  reserved 12;
  bool returnFullText = 13;
  bool decoderInputDetails = 14;
  int32 batchSize = 15;
  // NOTE(review): distinct from reqId (field 1); how the two ids relate is not
  // visible in this file.
  string id = 16;
  uint32 truncate = 17;
  string tools = 18;
  string toolChoice = 19;
  string loraId = 20;
  string modelName = 21;
  Metrics metrics = 22;
  repeated string outputNames = 23;

  // --- Decode progress ---
  // NOTE(review): these two names are all-lowercase, unlike the lowerCamelCase
  // used by the other fields. Renaming would change generated accessors and
  // JSON names, so they are kept as-is.
  uint32 prevdecodeindex = 24;
  uint32 currentdecodeindex = 25;

  // --- Block table and prefill-node information ---
  // One BlockIds entry per row; each entry is itself a list of block ids.
  repeated BlockIds blockTable = 26;
  string pNodeAddr = 27;
  uint32 pInstanceId = 28;
  // NOTE(review): time unit and epoch are not specified here.
  uint64 e2eStartTime = 29;
  // NOTE(review): all-lowercase name kept for generated-code compatibility.
  string postsingletext = 30;
  repeated uint64 dpInstanceIds = 31;
  bool useToolCall = 32;
  uint32 prefillTokenNum = 33;
  uint32 preOutputTokenNum = 34;
}
// Sampling-related settings; embedded in DecodeParameters.samplingParams.
//
// Most settings are wrapped in their own small message type, so each one has
// explicit presence (a message-typed field can be left unset).
message SamplingParams {
  Temperature temperature = 1;
  TopK topK = 2;
  TopP topP = 3;
  TypicalP typicalP = 4;
  DoSample doSample = 5;
  Seed seed = 6;
  RepetitionPenalty repetitionPenalty = 7;
  Watermark watermark = 8;
  FrequencyPenalty frequencyPenalty = 9;
  // NOTE(review): "Presency" is likely a misspelling of "Presence". Renaming
  // the type or field would break generated code, so it is left unchanged.
  PresencyPenalty presencyPenalty = 10;
  LengthPenalty lengthPenalty = 11;
  StopTokenIds stopTokenIds = 12;
  StopStrings stopStrings = 13;
  SkipSpecialTokens skipSpecialTokens = 14;
  IncludeStopStrInOutput includeStopStrInOutput = 15;
  IgnoreEos ignoreEos = 16;
  Logprobs logprobs = 17;
  TopLogprobs topLogprobs = 18;
  EnableThinking enableThinking = 19;
  ThinkingBudget thinkingBudget = 20;
  // The only scalar declared directly here; `optional` gives it explicit
  // presence without a wrapper message.
  optional string responseFormat = 21;
  IsThinking isThinking = 22;
}
// Wrapper for SamplingParams.temperature: one float with explicit presence,
// so an unset value is distinguishable from 0.
message Temperature {
  optional float value = 1;
}
// Wrapper for SamplingParams.topK: one uint32 with explicit presence.
message TopK {
  optional uint32 value = 1;
}
// Wrapper for SamplingParams.topP: one float with explicit presence.
message TopP {
  optional float value = 1;
}
// Wrapper for SamplingParams.typicalP: one float with explicit presence.
message TypicalP {
  optional float value = 1;
}
// Wrapper for SamplingParams.doSample: one bool with explicit presence, so
// "unset" is distinguishable from false.
message DoSample {
  optional bool value = 1;
}
// Wrapper for SamplingParams.seed: one uint64 with explicit presence, so an
// unset seed is distinguishable from seed 0.
message Seed {
  optional uint64 value = 1;
}
// Wrapper for SamplingParams.repetitionPenalty: one float with explicit
// presence.
message RepetitionPenalty {
  optional float value = 1;
}
// Wrapper for SamplingParams.watermark: one bool with explicit presence.
message Watermark {
  optional bool value = 1;
}
// Wrapper for SamplingParams.frequencyPenalty: one float with explicit
// presence.
message FrequencyPenalty {
  optional float value = 1;
}
// Wrapper for SamplingParams.presencyPenalty: one float with explicit
// presence.
// NOTE(review): "Presency" is likely a misspelling of "Presence"; the type
// name is kept because renaming it would break generated code.
message PresencyPenalty {
  optional float value = 1;
}
// Wrapper for SamplingParams.lengthPenalty: one float with explicit presence.
message LengthPenalty {
  optional float value = 1;
}
// Wrapper for SamplingParams.stopTokenIds: a list of int64 token ids.
// A repeated field has no presence of its own (empty == unset); the enclosing
// message field is what tells "not provided" apart from "provided but empty".
message StopTokenIds {
  repeated int64 value = 1;
}
// Wrapper for SamplingParams.stopStrings. Carries both a single optional
// string and a list of strings.
// NOTE(review): how consumers combine `value` and `list` (e.g. which wins when
// both are set) is not visible in this file -- confirm with the reader code.
message StopStrings {
  optional string value = 1;
  repeated string list = 2;
}
// Wrapper for SamplingParams.skipSpecialTokens: one bool with explicit
// presence.
message SkipSpecialTokens {
  optional bool value = 1;
}
// Wrapper for SamplingParams.includeStopStrInOutput: one bool with explicit
// presence.
message IncludeStopStrInOutput {
  optional bool value = 1;
}
// Wrapper for SamplingParams.ignoreEos: one bool with explicit presence.
message IgnoreEos {
  optional bool value = 1;
}
// Wrapper for SamplingParams.logprobs: one bool with explicit presence.
message Logprobs {
  optional bool value = 1;
}
// Wrapper for SamplingParams.topLogprobs: one uint64 with explicit presence.
message TopLogprobs {
  optional uint64 value = 1;
}
// Wrapper for SamplingParams.enableThinking: one bool with explicit presence.
message EnableThinking {
  optional bool value = 1;
}
// Wrapper for SamplingParams.thinkingBudget: one uint32 with explicit
// presence.
// NOTE(review): the unit of the budget (tokens, steps, ...) is not specified
// in this file.
message ThinkingBudget {
  optional uint32 value = 1;
}
// Wrapper for SamplingParams.isThinking: one bool with explicit presence.
message IsThinking {
  optional bool value = 1;
}
// Inference scheduling settings; embedded in DecodeParameters.inferParams.
// Both fields have implicit presence, so 0 and "unset" are indistinguishable.
message InferParams {
  // NOTE(review): whether a larger or smaller number means higher priority is
  // not specified here.
  uint64 priority = 1;

  // NOTE(review): the time unit is not specified here.
  uint64 timeout = 2;
}
// Wrapper for DecodeParameters.inputId: one string with explicit presence, so
// an unset value is distinguishable from the empty string.
message InputId {
  optional string value = 1;
}
// Wrapper for DecodeParameters.textInput: one string with explicit presence,
// so an unset value is distinguishable from the empty string.
message TextInput {
  optional string value = 1;
}
// Per-request measurements; embedded in DecodeParameters.metrics.
// NOTE(review): time units and the meaning of each list index are not
// specified in this file -- confirm with the code that fills these in.
message Metrics {
  // Scalar costs.
  uint64 firstTokenCost = 1;
  uint64 lastTokenCost = 2;

  // Series values (repeated scalars, packed on the wire in proto3).
  repeated uint64 decodeTime = 3;
  // Same name as DecodeParameters.batchSize, but a list of int64 here.
  repeated int64 batchSize = 4;
  repeated int64 queueWaitTime = 5;
  repeated int64 prefixCachedTokenNums = 6;

  uint64 callbackIndex = 7;
}
// Request message of PrefillService.ReleaseKVCacheChannel: carries a single
// request id string.
message RequestId {
  string reqId = 1;
}