# Latency predictor settings: one training server, a pool of prediction
# servers, and extra environment variables for the EPP.
# Nesting is significant: every key below must stay inside its parent
# mapping, otherwise repeated names (image, resources, config, ...) collide
# as duplicate keys.
latencyPredictor:
  # Feature toggle; off by default.
  enabled: false

  # ---- Training server ----------------------------------------------------
  trainingServer:
    image:
      registry: registry.k8s.io
      repository: gateway-api-inference-extension/latency-training-server
      tag: v1.5.0
      # NOTE(review): the tag is pinned, yet the image is re-pulled on every
      # container start; confirm this is intentional.
      pullPolicy: Always
    # Same port number the probes below target.
    port: 8000
    resources:
      requests:
        cpu: "2000m"
        memory: "4Gi"
      limits:
        cpu: "4000m"
        memory: "8Gi"
    livenessProbe:
      httpGet:
        path: /healthz
        port: 8000
      initialDelaySeconds: 30
      periodSeconds: 20
    readinessProbe:
      httpGet:
        path: /readyz
        port: 8000
      initialDelaySeconds: 45
      periodSeconds: 10
    # NOTE(review): presumably backs the /models paths configured below;
    # confirm against the chart templates.
    volumeSize: "20Gi"
    # Values are quoted on purpose so numeric-looking settings stay strings.
    # NOTE(review): presumably injected as container environment variables;
    # confirm against the chart templates.
    config:
      LATENCY_RETRAINING_INTERVAL_SEC: "10"
      LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100"
      LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
      LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
      LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
      LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
      # Same value as predictionServers.config.LATENCY_MODEL_TYPE.
      LATENCY_MODEL_TYPE: "xgboost"
      LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "500"
      # Same value as predictionServers.config.LATENCY_OBJECTIVE_TYPE.
      LATENCY_OBJECTIVE_TYPE: "mean"

  # ---- Prediction servers -------------------------------------------------
  predictionServers:
    count: 1
    # NOTE(review): presumably the first of `count` consecutive ports, one
    # per prediction server; confirm against the chart templates.
    startPort: 8001
    image:
      registry: registry.k8s.io
      repository: gateway-api-inference-extension/latency-prediction-server
      tag: v1.5.0
      # NOTE(review): pinned tag with an always-pull policy, as for the
      # training server image; confirm this is intentional.
      pullPolicy: Always
    resources:
      requests:
        cpu: "8000m"
        memory: "4Gi"
      limits:
        cpu: "28000m"
        memory: "8Gi"
    # Unlike the training server, neither probe sets httpGet.port.
    # NOTE(review): presumably the template fills in each server's port
    # (derived from startPort); confirm, since Kubernetes requires a port
    # on httpGet probes.
    livenessProbe:
      httpGet:
        path: /healthz
      initialDelaySeconds: 15
      periodSeconds: 15
      timeoutSeconds: 5
      failureThreshold: 5
    readinessProbe:
      httpGet:
        path: /readyz
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 5
      # 30 failures x 10s period: about 5 minutes of failed checks are
      # tolerated before the container is marked unready.
      failureThreshold: 30
    # NOTE(review): presumably backs the /server_models paths configured
    # below; confirm against the chart templates.
    volumeSize: "10Gi"
    # Values are quoted on purpose so numeric-looking settings stay strings.
    config:
      # Same value as trainingServer.config.LATENCY_MODEL_TYPE.
      LATENCY_MODEL_TYPE: "xgboost"
      PREDICT_HOST: "0.0.0.0"
      LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib"
      LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
      LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
      LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"
      # Equals the CPU limit above (28000m = 28 cores).
      # NOTE(review): presumably sized one worker per core; revisit both
      # together if either changes.
      UVICORN_WORKERS: "28"
      OMP_NUM_THREADS: "1"
      # NOTE(review): presumably how often trained models are refreshed from
      # the training server; confirm.
      MODEL_SYNC_INTERVAL_SEC: "30"
      # Same value as trainingServer.config.LATENCY_OBJECTIVE_TYPE.
      LATENCY_OBJECTIVE_TYPE: "mean"

  # ---- EPP environment ----------------------------------------------------
  # NOTE(review): placed under latencyPredictor because every key is a
  # LATENCY_* setting; confirm the chart reads latencyPredictor.eppEnv.
  eppEnv:
    LATENCY_MAX_SAMPLE_SIZE: "10000"
    LATENCY_MAX_CONCURRENT_DISPATCHES: "36"
    LATENCY_COALESCE_WINDOW_MS: "1"