#!/bin/bash
set -euo pipefail
: '
LoCoMo 完整评测脚本(locomo10.json 全部 10 个 samples)
用法:
./run_eval_full.sh --import-mode ov # OV 直接导入 → QA → Judge → Stat
./run_eval_full.sh --import-mode claw # OpenClaw 会话导入 → QA → Judge → Stat
./run_eval_full.sh --import-mode both # OV + OpenClaw 并行导入 → QA → Judge → Stat
./run_eval_full.sh --import-mode skip # 跳过导入 → QA → Judge → Stat
可选参数:
--run-id NAME 运行标识(默认: 时间戳),用于结果目录命名
--sample N 只跑指定 sample(默认: 全部 10 个)
--force-ingest 强制重新导入(忽略已有记录)
--qa-parallel N QA 并发数(默认: 15)
--judge-parallel N Judge 并发数(默认: 20)
--gateway-url URL OpenClaw Gateway 地址(默认: http://127.0.0.1:19789)
--gateway-token TOKEN Gateway 认证 Token
--ov-url URL OpenViking 服务地址(默认: http://127.0.0.1:2934)
--skip-judge 跳过 Judge 评分
--skip-stat 跳过统计
'
# --- Locations ---------------------------------------------------------
# Absolute directory containing this script; the Python tools and the result
# tree are addressed relative to it, the dataset lives in ../data.
SCRIPT_DIR="$(
  cd "$(dirname "$0")" && pwd
)"
DATA_FILE="${SCRIPT_DIR}/../data/locomo10.json"

# --- Run identity ------------------------------------------------------
# RUN_ID defaults to a second-resolution timestamp; DATASET_TAG and RUN_ID
# together name the result directory.
RUN_ID="$(date +%Y%m%d_%H%M%S)"
DATASET_TAG="full"

# --- Import options ----------------------------------------------------
# SAMPLE_ARG and FORCE_INGEST hold ready-made command-line fragments (or
# nothing) that are spliced into the Python invocations further down.
IMPORT_MODE=""
SAMPLE_ARG=""
FORCE_INGEST=""

# --- Service endpoints -------------------------------------------------
# The gateway token falls back to the OPENCLAW_GATEWAY_TOKEN environment
# variable, or to an empty string when that is unset.
GATEWAY_URL="http://127.0.0.1:19789"
GATEWAY_TOKEN="${OPENCLAW_GATEWAY_TOKEN:-}"
OV_URL="http://127.0.0.1:2934"

# --- Concurrency and stage toggles -------------------------------------
QA_PARALLEL="15"
JUDGE_PARALLEL="20"
SKIP_JUDGE="false"
SKIP_STAT="false"
#######################################
# Abort with a readable message when an option that takes a value is the
# last word on the command line.
# Arguments: $1 - option name as typed by the user
#            $2 - number of words still unparsed, including the option itself
# Outputs:   error message on stderr when the value is missing
# Returns:   0 when a value follows; otherwise exits the script with status 1
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo "错误: 参数 $1 需要一个值" >&2
    exit 1
  fi
}

#######################################
# Parse the command-line options into the script's global settings.
# Globals:   IMPORT_MODE, RUN_ID, SAMPLE_ARG, DATASET_TAG, FORCE_INGEST,
#            QA_PARALLEL, JUDGE_PARALLEL, GATEWAY_URL, GATEWAY_TOKEN, OV_URL,
#            SKIP_JUDGE, SKIP_STAT (all written)
# Arguments: the script's positional parameters ("$@")
# Outputs:   error message on stderr for unknown options or missing values
# Returns:   0 on success; exits the script with status 1 on a usage error
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --import-mode)
        require_value "$1" "$#"
        IMPORT_MODE="$2"
        shift 2
        ;;
      --run-id)
        require_value "$1" "$#"
        RUN_ID="$2"
        shift 2
        ;;
      --sample)
        require_value "$1" "$#"
        # Stored as a ready-made "--sample N" fragment that later stages
        # splice into the Python command lines; the tag keeps per-sample
        # results in their own directory.
        SAMPLE_ARG="--sample $2"
        DATASET_TAG="full_sample$2"
        shift 2
        ;;
      --force-ingest)
        FORCE_INGEST="--force-ingest"
        shift
        ;;
      --qa-parallel)
        require_value "$1" "$#"
        QA_PARALLEL="$2"
        shift 2
        ;;
      --judge-parallel)
        require_value "$1" "$#"
        JUDGE_PARALLEL="$2"
        shift 2
        ;;
      --gateway-url)
        require_value "$1" "$#"
        GATEWAY_URL="$2"
        shift 2
        ;;
      --gateway-token)
        require_value "$1" "$#"
        GATEWAY_TOKEN="$2"
        shift 2
        ;;
      --ov-url)
        require_value "$1" "$#"
        OV_URL="$2"
        shift 2
        ;;
      --skip-judge)
        SKIP_JUDGE=true
        shift
        ;;
      --skip-stat)
        SKIP_STAT=true
        shift
        ;;
      *)
        echo "未知参数: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Validate --import-mode up front, before the result directory is created,
# so a missing or misspelled mode does not leave an empty result folder behind.
# Diagnostics go to stderr.
case "$IMPORT_MODE" in
  ov | claw | both | skip)
    ;;
  "")
    echo "错误: 必须指定 --import-mode (ov|claw|both|skip)" >&2
    echo "用法: $0 --import-mode ov|claw|both|skip [选项]" >&2
    exit 1
    ;;
  *)
    echo "错误: 无效的 import-mode '$IMPORT_MODE',可选: ov|claw|both|skip" >&2
    exit 1
    ;;
esac
# All artifacts of this run (logs, CSVs) are written under
# result/<dataset-tag>_<run-id>/ next to the script.
RESULT_DIR="${SCRIPT_DIR}/result/${DATASET_TAG}_${RUN_ID}"
mkdir -p "$RESULT_DIR"
OUTPUT_CSV="${RESULT_DIR}/qa_results.csv"

# Start-of-run banner summarising the effective settings; the sample line is
# printed only when --sample was given.
banner_rule="============================================"
printf '%s\n' \
  "$banner_rule" \
  " LoCoMo 评测: $DATASET_TAG" \
  " 数据集: locomo10.json (2.8MB, 10 samples)" \
  " 导入模式: $IMPORT_MODE" \
  " 运行标识: $RUN_ID" \
  " 结果目录: $RESULT_DIR"
if [[ -n "$SAMPLE_ARG" ]]; then
  printf '%s\n' " 指定 sample: $SAMPLE_ARG"
fi
printf '%s\n' "$banner_rule"
#######################################
# Import the dataset directly into OpenViking via import_to_ov.py.
# Globals:   SCRIPT_DIR, DATA_FILE, OV_URL, RESULT_DIR, SAMPLE_ARG,
#            FORCE_INGEST (all read)
# Outputs:   the importer's own stdout/stderr; callers redirect them to a log
# Returns:   exit status of import_to_ov.py
#######################################
run_ov_import() {
  # SAMPLE_ARG and FORCE_INGEST are deliberately unquoted: each is either
  # empty (and must vanish) or a flag plus its value (and must word-split).
  # shellcheck disable=SC2086
  python3 "$SCRIPT_DIR/import_to_ov.py" \
    --input "$DATA_FILE" \
    --openviking-url "$OV_URL" \
    --no-user-agent-id \
    --success-csv "$RESULT_DIR/import_success.csv" \
    --error-log "$RESULT_DIR/import_errors.log" \
    $SAMPLE_ARG $FORCE_INGEST
}

#######################################
# Import the dataset through the OpenClaw gateway via `eval.py ingest`.
# Globals:   SCRIPT_DIR, DATA_FILE, GATEWAY_URL, GATEWAY_TOKEN, SAMPLE_ARG,
#            FORCE_INGEST (all read)
# Outputs:   the importer's own stdout/stderr; callers redirect them to a log
# Returns:   exit status of eval.py
#######################################
run_claw_import() {
  # NOTE(review): the token is passed on the command line and is therefore
  # visible in the process list while the import runs.
  # Unquoted on purpose, see run_ov_import.
  # shellcheck disable=SC2086
  python3 "$SCRIPT_DIR/eval.py" ingest "$DATA_FILE" \
    --base-url "$GATEWAY_URL" \
    --token "$GATEWAY_TOKEN" \
    --memory-mode openviking \
    $SAMPLE_ARG $FORCE_INGEST
}

# Stage 1/4: import the dataset according to --import-mode, then pause for a
# fixed settling period before the QA stage queries the imported data.
case "$IMPORT_MODE" in
  ov)
    echo "[1/4] OV 直接导入..."
    run_ov_import 2>&1 | tee "$RESULT_DIR/import_ov.log"
    echo "导入完成,等待 120 秒让 OV 处理(全量数据较大)..."
    sleep 120
    ;;
  claw)
    echo "[1/4] OpenClaw 会话导入..."
    run_claw_import 2>&1 | tee "$RESULT_DIR/import_claw.log"
    echo "导入完成,等待 180 秒让记忆稳定(全量数据较大)..."
    sleep 180
    ;;
  both)
    echo "[1/4] OV + OpenClaw 并行导入..."
    # Both importers run in the background, each writing to its own log.
    run_ov_import > "$RESULT_DIR/import_ov.log" 2>&1 &
    PID_OV=$!
    run_claw_import > "$RESULT_DIR/import_claw.log" 2>&1 &
    PID_CLAW=$!
    echo "等待 OV 导入 (PID=$PID_OV) 和 OpenClaw 导入 (PID=$PID_CLAW) 完成..."
    # Wait for each job separately: `wait A B` reports only the status of the
    # last ID, which would hide a failed OV import. The `|| var=$?` form also
    # keeps `set -e` from aborting before both jobs have been reaped.
    ov_status=0
    claw_status=0
    wait "$PID_OV" || ov_status=$?
    wait "$PID_CLAW" || claw_status=$?
    if (( ov_status != 0 || claw_status != 0 )); then
      echo "错误: 并行导入失败 (OV 退出码=$ov_status, OpenClaw 退出码=$claw_status),日志: $RESULT_DIR/import_ov.log, $RESULT_DIR/import_claw.log" >&2
      exit 1
    fi
    echo "导入完成,等待 180 秒让记忆稳定(全量数据较大)..."
    sleep 180
    ;;
  skip)
    echo "[1/4] 跳过导入"
    ;;
  *)
    echo "错误: 无效的 import-mode '$IMPORT_MODE',可选: ov|claw|both|skip" >&2
    exit 1
    ;;
esac
# Stage 2/4: run the QA evaluation through eval.py, mirroring its combined
# stdout/stderr into qa.log while still showing it on the terminal.
echo "[2/4] 运行 QA 评估 (并发=$QA_PARALLEL)..."
qa_cmd=(
  python3 "$SCRIPT_DIR/eval.py" qa "$DATA_FILE"
  --base-url "$GATEWAY_URL"
  --token "$GATEWAY_TOKEN"
  --ov-api-url "$OV_URL"
)
# SAMPLE_ARG is either empty or "--sample N"; the unquoted expansion drops it
# when empty and splits it into two words otherwise.
# shellcheck disable=SC2206
qa_cmd+=($SAMPLE_ARG)
# The output path is handed over without its ".csv" suffix.
# NOTE(review): presumably eval.py appends the extension itself — confirm.
qa_cmd+=(
  --parallel "$QA_PARALLEL"
  --output "${OUTPUT_CSV%.csv}"
)
"${qa_cmd[@]}" 2>&1 | tee "$RESULT_DIR/qa.log"
# Stage 3/4: score the QA results with judge.py unless --skip-judge was given.
# Anything other than the literal "false" in SKIP_JUDGE means "skip".
if [[ "$SKIP_JUDGE" != false ]]; then
  echo "[3/4] 跳过 Judge 评分"
else
  echo "[3/4] Judge 评分 (并发=$JUDGE_PARALLEL)..."
  judge_log="$RESULT_DIR/judge.log"
  python3 "$SCRIPT_DIR/judge.py" --input "$OUTPUT_CSV" --parallel "$JUDGE_PARALLEL" 2>&1 \
    | tee "$judge_log"
fi
# Stage 4/4: aggregate the judged results with stat_judge_result.py unless
# --skip-stat was given.
# NOTE(review): import_success.csv is only written by import_to_ov.py
# (import modes "ov" and "both"); confirm the stat script copes with the file
# being absent in "claw" and "skip" runs.
case "$SKIP_STAT" in
  false)
    echo "[4/4] 统计结果..."
    stat_log="$RESULT_DIR/stat.log"
    python3 "$SCRIPT_DIR/stat_judge_result.py" \
      --input "$OUTPUT_CSV" \
      --import-csv "$RESULT_DIR/import_success.csv" \
      --test-scenario "$DATASET_TAG" 2>&1 \
      | tee "$stat_log"
    ;;
  *)
    echo "[4/4] 跳过统计"
    ;;
esac
# Closing banner: a blank separator line, then the dataset tag and the
# directory where this run's results were written.
closing_rule="============================================"
printf '\n'
printf '%s\n' \
  "$closing_rule" \
  " 评测完成: $DATASET_TAG" \
  " 结果目录: $RESULT_DIR" \
  "$closing_rule"