#!/bin/bash
# ---------------------------------------------------------------------------
# Run configuration. Edit the values in this section before launching; they
# are consumed by the rest of this script (banner, flag resolution and the
# run_evaluation.py command line).
# ---------------------------------------------------------------------------

# --- Feature toggles: the literal string "true" enables the feature ---------
USE_BROWSER_PROCESSOR='true'
RETURN_THOUGHT_TO_LLM='true'
USE_CONTEXT_MANAGER='false'

# --- Main model endpoint ----------------------------------------------------
PROVIDER='openai'
MODEL_NAME=''        # forwarded as --model
RESULT_DIR_NAME=''   # forwarded as --model-name; also part of output/collection names
BASE_URL=''
API_KEY=''

# --- Processor model endpoint -----------------------------------------------
PROCESSOR_PROVIDER='openai'
PROCESSOR_MODEL_NAME=''
PROCESSOR_BASE_URL=''
PROCESSOR_API_KEY='dada'   # NOTE(review): looks like a placeholder value - confirm

# --- Sampling parameters ----------------------------------------------------
PASS_K=8             # forwarded as --k
TEMPERATURE=1
TOP_P=1
PRESENCE_PENALTY=0
MAX_TOKENS=16384

# --- Run limits -------------------------------------------------------------
MAX_SAMPLES=-1       # NOTE(review): -1 presumably means "no limit" - confirm in run_evaluation.py
NUM_PROCESSES=120
MAX_INTERACTIONS=200

# --- Tool-call delimiters ---------------------------------------------------
TOOL_START_TAG='<tool_call>'
TOOL_END_TAG='</tool_call>'

# --- Services and directories -----------------------------------------------
MANAGER_URL=''
EVALUATION_ROOT_DIR=''
FILES_DIR=''

# --- Derived paths ----------------------------------------------------------
# The benchmark is named after the directory that holds this script, and its
# input file is "<benchmark>.jsonl" inside that same directory.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
BENCHMARK_NAME="$(basename "$SCRIPT_DIR")"
INPUT_FILE="${SCRIPT_DIR}/${BENCHMARK_NAME}.jsonl"
# Raw outputs go under the evaluation root, keyed by benchmark and result id.
RAW_OUTPUT_DIR="${EVALUATION_ROOT_DIR}/_temp_raw_outputs/${BENCHMARK_NAME}_${RESULT_DIR_NAME}"
# Print the start-up banner: the benchmark name followed by the main-model,
# processor-model, tool-server and results-root settings currently in effect.
# A single printf emits one argument per line.
printf '%s\n' \
  "=========================================" \
  "Starting Benchmark Evaluation and Analysis (Dual Model + Context Manager)" \
  "=========================================" \
  " - Benchmark: ${BENCHMARK_NAME}" \
  " --- Main Model ---" \
  " - Model Name (API): ${MODEL_NAME}" \
  " - Result Identifier: ${RESULT_DIR_NAME}" \
  " - API Address: ${BASE_URL}" \
  " --- Processor Model ---" \
  " - Model Name (API): ${PROCESSOR_MODEL_NAME}" \
  " - API Address: ${PROCESSOR_BASE_URL}" \
  " -----------------------------------------" \
  " - Tool Server: ${MANAGER_URL}" \
  " - Evaluation Results Root Directory: ${EVALUATION_ROOT_DIR}" \
  "-----------------------------------------"
# Translate the "true"/other toggles into command-line switches for the
# evaluator and report each decision. A toggle counts as enabled only when it
# is exactly the string "true"; otherwise its flag variable stays empty.

# Browser processor switch.
USE_BROWSER_PROCESSOR_FLAG=""
case "$USE_BROWSER_PROCESSOR" in
  true)
    USE_BROWSER_PROCESSOR_FLAG="--use-browser-processor"
    echo " - Browser Processor: Enabled"
    ;;
  *)
    echo " - Browser Processor: Disabled"
    ;;
esac

# Return-thought switch.
RETURN_THOUGHT_FLAG=""
case "$RETURN_THOUGHT_TO_LLM" in
  true)
    RETURN_THOUGHT_FLAG="--return-thought"
    echo " - Return Thought Process: Enabled"
    ;;
  *)
    echo " - Return Thought Process: Disabled"
    ;;
esac

# Context manager switch.
USE_CONTEXT_MANAGER_FLAG=""
case "$USE_CONTEXT_MANAGER" in
  true)
    USE_CONTEXT_MANAGER_FLAG="--use-context-manager"
    echo " - Context Manager: Enabled"
    ;;
  *)
    echo " - Context Manager: Disabled"
    ;;
esac

# LLM-as-judge options are kept in an array because they carry values.
# NOTE(review): USE_LLM_JUDGE, LLM_JUDGE_API_KEY and LLM_JUDGE_API_BASE are not
# assigned in the visible part of this script; presumably they come from the
# caller's environment - confirm.
LLM_JUDGE_ARGS=()
case "$USE_LLM_JUDGE" in
  true)
    LLM_JUDGE_ARGS=(
      "--llm-judge"
      "--llm-judge-api-key" "$LLM_JUDGE_API_KEY"
      "--llm-judge-api-base" "$LLM_JUDGE_API_BASE"
    )
    echo " - LLM as Judge: [!!! ENABLED !!!]"
    echo " - LLM Judge API: $LLM_JUDGE_API_BASE"
    ;;
  *)
    echo " - LLM as Judge: [Disabled]"
    ;;
esac

echo "-----------------------------------------"
# MongoDB settings exported for child processes, followed by a preflight dump
# of what those processes will see. URI and database are exported empty here.
export MONGO_URI=""
export MONGO_DB=""

# Replace every '/', ':' and ' ' in $1 with '_' and print the result.
# Uses parameter expansion rather than `echo | tr` so that values resembling
# echo options (for example "-n") pass through unchanged.
sanitize_identifier() {
  local raw=$1
  printf '%s' "${raw//[\/: ]/_}"
}

# Filter for KEY=VALUE lines on stdin: on the MONGO_URI line, replace any
# "user:password@" part with "***@" and cut the value to 40 characters
# (appending "..."). All other lines pass through untouched.
mask_mongo_uri() {
  sed -E \
    -e 's#^(MONGO_URI=[^:/]+://)[^/]*@#\1***@#' \
    -e 's#^(MONGO_URI=.{40}).+$#\1...#'
}

# Collection name: "<benchmark>_<sanitised result identifier>".
SAFE_MODEL_NAME=$(sanitize_identifier "$RESULT_DIR_NAME")
export MONGO_COL="${BENCHMARK_NAME}_${SAFE_MODEL_NAME}"
export MONGO_CREATE_INDEX="0"

# Shell-side preflight. The URI is masked so credentials embedded in the
# connection string do not end up in logs.
echo "[preflight] env mongo vars:"
mongo_env=$(env | grep -E '^MONGO_')
if [[ -n "$mongo_env" ]]; then
  printf '%s\n' "$mongo_env" | mask_mongo_uri
else
  echo "[preflight] (none)"
fi

# Python-side preflight: confirms the interpreter sees the same variables.
# MONGO_URI gets its credentials masked and is shortened to a 40-character
# prefix before printing.
python - <<'PY'
import os
import re

keys = ["MONGO_URI", "MONGO_DB", "MONGO_COL", "MONGO_CREATE_INDEX"]


def shown(key):
    value = os.getenv(key)
    if key == "MONGO_URI" and value:
        # Hide any user:password@ part, then keep only a short prefix.
        value = re.sub(r"^([^:/]+://)[^/]*@", r"\1***@", value)
        return value[:40] + "..."
    return value


print("[preflight python]", {k: shown(k) for k in keys})
PY
# Launch the evaluation driver (two directories above this script) with the
# configured settings, then report the outcome.

# Each optional switch is either empty or a single flag. Collect the non-empty
# ones in an array so every flag is passed as exactly one argument without
# relying on unquoted word splitting.
OPTIONAL_FLAGS=()
for flag in \
  "$USE_BROWSER_PROCESSOR_FLAG" \
  "$RETURN_THOUGHT_FLAG" \
  "$USE_CONTEXT_MANAGER_FLAG"; do
  if [[ -n "$flag" ]]; then
    OPTIONAL_FLAGS+=("$flag")
  fi
done

python "$SCRIPT_DIR/../../run_evaluation.py" \
  --provider "$PROVIDER" \
  --model "$MODEL_NAME" \
  --model-name "$RESULT_DIR_NAME" \
  --base-url "$BASE_URL" \
  --api-key "$API_KEY" \
  --processor-provider "$PROCESSOR_PROVIDER" \
  --processor-model "$PROCESSOR_MODEL_NAME" \
  --processor-base-url "$PROCESSOR_BASE_URL" \
  --processor-api-key "$PROCESSOR_API_KEY" \
  --benchmark-name "$BENCHMARK_NAME" \
  --manager-url "$MANAGER_URL" \
  --input-file "$INPUT_FILE" \
  --output-dir "$RAW_OUTPUT_DIR" \
  --output-base-dir "$EVALUATION_ROOT_DIR" \
  --max-samples "$MAX_SAMPLES" \
  --num-processes "$NUM_PROCESSES" \
  --max-interactions "$MAX_INTERACTIONS" \
  --tool-start-tag "$TOOL_START_TAG" \
  --tool-end-tag "$TOOL_END_TAG" \
  --files-dir "$FILES_DIR" \
  "${OPTIONAL_FLAGS[@]}" \
  "${LLM_JUDGE_ARGS[@]}" \
  --k "$PASS_K" \
  --temperature "$TEMPERATURE" \
  --top-p "$TOP_P" \
  --presence-penalty "$PRESENCE_PENALTY" \
  --max-tokens "$MAX_TOKENS"
# Capture the status immediately; any later command would overwrite $?.
EVAL_EXIT_CODE=$?

echo "========================================="
if (( EVAL_EXIT_CODE == 0 )); then
  echo "All tasks completed."
else
  # Do not claim success when the evaluator failed.
  echo "Evaluation failed with exit code ${EVAL_EXIT_CODE}." >&2
fi
echo "========================================="

# Pause only when a terminal is attached, so unattended runs neither block nor
# consume stdin. -n 1 returns on the first key press, as the prompt promises;
# -s hides the key and -r keeps backslashes literal.
if [[ -t 0 ]]; then
  read -n 1 -s -r -p "Press any key to continue..."
  echo
fi

# Propagate a failure to the caller. On success, fall through so the script
# ends (or continues) exactly as before.
if (( EVAL_EXIT_CODE != 0 )); then
  exit "$EVAL_EXIT_CODE"
fi