genui-sdk/packages/benchmarks/src/run-report.ts-代码预览-genui-sdk:基于 Generative UI 的全栈 AI 应用开发工具包 - AtomGit

Yyy feat: add error handling for streaming results in judgeOneSample function
import fs from 'node:fs';
import { genRootSchema } from '@opentiny/genui-sdk-core';
import { streamText } from 'ai';
import type { ZodIssue } from 'zod';
import type { LlmBenchmarkResultItem, LlmBenchmarkRunOptions, LlmBenchmarkSample } from './framework/index';
import { printLlmBenchmarkResults } from './framework/index';
import {
  computeTpotMs,
  extractSchemaJsonBlock,
  parseJudgeJson,
  resolveAiSdkModelForBench,
  resolvePrimaryBenchmarkModelId,
  resolveSamplesDir,
  resolveStreamTextUsage,
  benchStreamTextAbortSignal,
} from './utils';

/**
 * Recursively expands Zod issues so that errors nested inside union branches
 * are surfaced instead of the generic `invalid_union` wrapper.
 *
 * Two layouts of the `invalid_union` issue are understood:
 * - `unionErrors: Array<{ issues }>` (Zod 3): branch issues are taken as-is.
 * - `errors: ZodIssue[][]` (Zod 4): branch issue paths are relative to the
 *   union, so the union's own path is prepended to each nested issue.
 *
 * @param issues Issues reported by a failed `safeParse`.
 * @returns A flat list of issues. A union issue that has no branch entries is
 *   kept unchanged; a union issue with branch entries is replaced by the
 *   issues found in those branches.
 */
function flattenZodIssues(issues: readonly ZodIssue[]): ZodIssue[] {
  const flattened: ZodIssue[] = [];
  for (const issue of issues) {
    if (issue.code === 'invalid_union') {
      const union = issue as ZodIssue & {
        unionErrors?: Array<{ issues: ZodIssue[] }>;
        errors?: ZodIssue[][];
      };

      // Zod 3 layout: one error object per union branch.
      const legacyBranches = union.unionErrors ?? [];
      if (legacyBranches.length > 0) {
        for (const branch of legacyBranches) {
          flattened.push(...flattenZodIssues(branch.issues));
        }
        continue;
      }

      // Zod 4 layout: one issue array per union branch, with paths relative to the union.
      const branches = Array.isArray(union.errors) ? union.errors : [];
      if (branches.length > 0) {
        for (const branchIssues of branches) {
          for (const nested of flattenZodIssues(branchIssues)) {
            // Copy instead of mutating the issue owned by the ZodError.
            flattened.push({ ...nested, path: [...issue.path, ...nested.path] } as ZodIssue);
          }
        }
        continue;
      }
    }
    flattened.push(issue);
  }
  return flattened;
}

/**
 * Picks the issue that is most useful for locating a schema problem.
 *
 * Issues are first expanded with `flattenZodIssues`, then ranked by a score:
 * 100 per path segment, +10 when the message is not the generic
 * `Invalid input`, +5 when the code is `invalid_type`. On equal scores the
 * issue that appears first wins.
 *
 * @param issues Issues reported by a failed `safeParse`.
 * @returns The highest-scoring issue, or `undefined` when there are none.
 */
function pickMostSpecificIssue(issues: readonly ZodIssue[]): ZodIssue | undefined {
  const scoreOf = (candidate: ZodIssue): number => {
    const depthScore = candidate.path.length * 100;
    const messageScore = candidate.message === 'Invalid input' ? 0 : 10;
    const codeScore = candidate.code === 'invalid_type' ? 5 : 0;
    return depthScore + messageScore + codeScore;
  };

  let winner: ZodIssue | undefined;
  let winnerScore = Number.NEGATIVE_INFINITY;
  for (const candidate of flattenZodIssues(issues)) {
    const candidateScore = scoreOf(candidate);
    // Strictly greater: the earliest issue keeps the lead on ties.
    if (candidateScore > winnerScore) {
      winner = candidate;
      winnerScore = candidateScore;
    }
  }
  return winner;
}

/** Outcome of checking a sample's schemaJson text, as returned by `validateSchemaJson`. */
type SchemaJsonValidation = {
  /** Whether any schemaJson text was available to validate. */
  isSchemaJsonBlockFound: boolean;
  /** Whether the schemaJson text could be parsed as JSON. */
  isSchemaJsonValidJson: boolean;
  /** Whether the parsed value passed the schema returned by `genRootSchema()`. */
  isSchemaJsonValidAgainstProtocol: boolean;
  /** Human-readable description of the first failing step; absent when every check passed. */
  schemaValidationError?: string;
};

/**
 * Validates schemaJson text in three steps: is there any text, is it valid
 * JSON, and does the parsed value satisfy the schema from `genRootSchema()`.
 *
 * JSON parsing and schema validation are guarded separately so that an
 * exception thrown while building or running the schema is not misreported
 * as invalid JSON.
 *
 * @param schemaJsonText Text of the schemaJson block, or `null` when no block was extracted.
 *   An empty string is treated the same as a missing block.
 * @returns Flags for each validation step plus an error description when a step failed.
 */
function validateSchemaJson(schemaJsonText: string | null): SchemaJsonValidation {
  if (!schemaJsonText) {
    return {
      isSchemaJsonBlockFound: false,
      isSchemaJsonValidJson: false,
      isSchemaJsonValidAgainstProtocol: false,
      schemaValidationError: 'schemaJson code block not found',
    };
  }

  // Step 1: JSON syntax only.
  let parsed: unknown;
  try {
    parsed = JSON.parse(schemaJsonText);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    return {
      isSchemaJsonBlockFound: true,
      isSchemaJsonValidJson: false,
      isSchemaJsonValidAgainstProtocol: false,
      schemaValidationError: `schema parse failed: ${detail}`,
    };
  }

  // Step 2: protocol validation. From here on the text is known to be valid JSON.
  try {
    const result = genRootSchema().safeParse(parsed);
    if (result.success) {
      return {
        isSchemaJsonBlockFound: true,
        isSchemaJsonValidJson: true,
        isSchemaJsonValidAgainstProtocol: true,
      };
    }
    const issue = pickMostSpecificIssue(result.error.issues);
    const path = issue?.path?.length ? issue.path.join('.') : '(root)';
    const message = issue
      ? `[${issue.code}] ${issue.message}`
      : `schema safeParse failed (issues=${result.error.issues.length})`;
    return {
      isSchemaJsonBlockFound: true,
      isSchemaJsonValidJson: true,
      isSchemaJsonValidAgainstProtocol: false,
      schemaValidationError: `${path}: ${message}`,
    };
  } catch (error) {
    // The validator itself threw; keep the JSON flag truthful.
    const detail = error instanceof Error ? error.message : String(error);
    return {
      isSchemaJsonBlockFound: true,
      isSchemaJsonValidJson: true,
      isSchemaJsonValidAgainstProtocol: false,
      schemaValidationError: `schema validation failed: ${detail}`,
    };
  }
}

/** Result of one LLM-as-a-Judge call, as returned by `judgeOneSample`. */
type LlmJudgeResult = {
  /** Judge score; `judgeOneSample` clamps it to the range 1-10. Absent when judging failed. */
  score?: number;
  /** Reason text taken from the judge's parsed reply. */
  reason?: string;
  /** Error description when the judge call or the parsing of its reply failed. */
  error?: string;
  /** Input tokens reported for the judge call. */
  promptTokens?: number;
  /** Output tokens reported for the judge call. */
  completionTokens?: number;
  /** Total tokens reported for the judge call. */
  totalTokens?: number;
};

/**
 * Scores a single sample with an LLM acting as judge.
 *
 * The judge model is `options.llmJudge.model` when set, otherwise the value of
 * `resolvePrimaryBenchmarkModelId(options)`. The reply is streamed, parsed with
 * `parseJudgeJson`, and the score is clamped to 1-10. This function does not
 * throw: every failure is reported through the `error` field of the result.
 *
 * @param sample Sample to evaluate (scenario, conversation or prompt, and model output).
 * @param options Run configuration; supplies the judge settings and the stream timeout.
 * @returns Score and reason on success, or an error description. Token usage is
 *   included whenever the stream was consumed to its end.
 */
async function judgeOneSample(sample: LlmBenchmarkSample, options: LlmBenchmarkRunOptions): Promise<LlmJudgeResult> {
  const judgeCfg = options.llmJudge;
  const modelId = judgeCfg?.model || resolvePrimaryBenchmarkModelId(options);
  const system =
    judgeCfg?.systemPrompt ??
    `你是严格的前端代码评测员。请依据 schemaJson 格式规范，基于用户需求与模型输出从三个角度评估生成的UI代码是否具备完成目标任务的实际能力，并给出评分：
    1. 完整性:界面元素完整，无缺失或错误组件；
    2. 功能性:交互逻辑正常，按钮表单响应正确；
    3. 信息充分性:提供完成任务所需的全部关键信息。
    只返回 JSON：{"score":1-10之间数字,"reason":"一句话原因"}。不要输出其它内容。`;
  try {
    // Prefer the recorded conversation; fall back to a bare `prompt` field when there are no messages.
    const requirementText = sample.messages?.length
      ? sample.messages.map((msg) => `[${msg.role}] ${msg.content}`).join('\n')
      : ((sample as LlmBenchmarkSample & { prompt?: string }).prompt ?? '');
    const modelInstance = await resolveAiSdkModelForBench(modelId);
    const abortSignal = benchStreamTextAbortSignal(options.streamTimeoutMs);
    const streamResult = streamText({
      model: modelInstance,
      temperature: 0,
      system,
      messages: [
        {
          role: 'user',
          content:
            `请评估以下样本。\n` +
            `【场景】${sample.scenario}\n` +
            `【用户需求】\n${requirementText}\n\n` +
            `【模型输出】\n${sample.output}\n`,
        },
      ],
      ...(abortSignal ? { abortSignal } : {}),
    });
    let output = '';
    let promptTokens = 0;
    let completionTokens = 0;
    let totalTokens = 0;
    let streamError: string | undefined;
    for await (const chunk of streamResult.fullStream) {
      if (chunk.type === 'text-delta' && chunk.text) {
        output += chunk.text;
      }
      if (chunk.type === 'finish') {
        const u = chunk.totalUsage;
        promptTokens = u?.inputTokens ?? promptTokens;
        completionTokens = u?.outputTokens ?? completionTokens;
        totalTokens = u?.totalTokens ?? totalTokens;
      }
      if (chunk.type === 'error') {
        // Keep consuming the stream; the error is reported after usage has been collected.
        streamError = chunk.error instanceof Error ? chunk.error.message : String(chunk.error);
      }
    }
    // Numeric values from the settled usage take precedence over the values seen in the `finish` chunk.
    const settled = await resolveStreamTextUsage(streamResult);
    if (typeof settled.inputTokens === 'number') {
      promptTokens = settled.inputTokens;
    }
    if (typeof settled.outputTokens === 'number') {
      completionTokens = settled.outputTokens;
    }
    if (typeof settled.totalTokens === 'number') {
      totalTokens = settled.totalTokens;
    }
    const usage = {
      promptTokens,
      completionTokens,
      totalTokens,
    };
    if (streamError) {
      return { error: streamError, ...usage };
    }
    const parsed = parseJudgeJson(output);
    // `typeof NaN === 'number'`, so require a finite value; otherwise NaN would
    // pass through the clamp below and be reported as the score.
    if (!parsed || typeof parsed.score !== 'number' || !Number.isFinite(parsed.score)) {
      return { error: 'Judge output JSON parse failed', ...usage };
    }
    const score = Math.min(10, Math.max(1, parsed.score));
    return {
      score,
      reason: parsed.reason,
      ...usage,
    };
  } catch (error) {
    return { error: error instanceof Error ? error.message : String(error) };
  }
}

/**
 * Builds one report row from a stored sample and, optionally, its judge result.
 *
 * Optional numeric metrics (`ttftMs`, `firstObservableComponentMs`, `tpotMs`,
 * judge token counts) are only written when a value is available, and
 * `benchTotalTokens` is the sample's total tokens plus the judge's total tokens.
 *
 * @param sample Sample object written by the generation phase.
 * @param judge Judge result for this sample, if judging ran.
 * @returns The metrics row used for aggregation and display.
 */
function toReportItem(sample: LlmBenchmarkSample, judge?: LlmJudgeResult): LlmBenchmarkResultItem {
  const validation = validateSchemaJson(extractSchemaJsonBlock(sample.output));
  const { metrics } = sample;

  const ttftMs = typeof metrics.ttftMs === 'number' ? metrics.ttftMs : undefined;
  // Use the recorded TPOT when present; otherwise derive it, which requires a TTFT.
  const tpotMs =
    typeof metrics.tpotMs === 'number'
      ? metrics.tpotMs
      : ttftMs != null
        ? computeTpotMs(ttftMs, metrics.totalMs, metrics.completionTokens)
        : undefined;

  const timingFields = {
    ...(ttftMs != null ? { ttftMs } : {}),
    totalMs: metrics.totalMs,
    ...(typeof metrics.firstObservableComponentMs === 'number'
      ? { firstObservableComponentMs: metrics.firstObservableComponentMs }
      : {}),
    ...(tpotMs !== undefined ? { tpotMs } : {}),
  };

  const schemaFields = {
    isSchemaJsonBlockFound: validation.isSchemaJsonBlockFound,
    isSchemaJsonValidJson: validation.isSchemaJsonValidJson,
    isSchemaJsonValidAgainstProtocol: validation.isSchemaJsonValidAgainstProtocol,
    ...(validation.schemaValidationError != null ? { schemaValidationError: validation.schemaValidationError } : {}),
  };

  const tokenFields = {
    promptTokens: metrics.promptTokens,
    completionTokens: metrics.completionTokens,
    totalTokens: metrics.totalTokens,
    benchTotalTokens: metrics.totalTokens + (judge?.totalTokens ?? 0),
    rawOutputChars: metrics.rawOutputChars,
  };

  const judgeFields = {
    llmJudgeScore: judge?.score,
    llmJudgeReason: judge?.reason,
    llmJudgeError: judge?.error,
    ...(typeof judge?.promptTokens === 'number' ? { llmJudgePromptTokens: judge.promptTokens } : {}),
    ...(typeof judge?.completionTokens === 'number' ? { llmJudgeCompletionTokens: judge.completionTokens } : {}),
    ...(typeof judge?.totalTokens === 'number' ? { llmJudgeTotalTokens: judge.totalTokens } : {}),
  };

  // Fragments are spread in the same order the fields are expected to appear in the row.
  return {
    scenario: sample.scenario,
    promptVariant: sample.promptVariant ?? 'full',
    runIndex: sample.runIndex,
    model: sample.model,
    ...timingFields,
    ...schemaFields,
    ...tokenFields,
    ...judgeFields,
    errorMessage: metrics.errorMessage,
  };
}

/**
 * Loads the stored samples, optionally judges them, and prints the report.
 *
 * Every `*.json` file in the samples directory except `report.json` is read
 * and parsed. Samples are then filtered by scenario and model and sorted by
 * scenario, model, prompt variant (non-plain first) and run index. When
 * `options.llmJudge.enabled` is `true`, non-plain samples are judged by a
 * small pool of concurrent workers.
 *
 * @param options Run configuration (sample directory, scenario/model filters, judge settings, concurrency).
 * @returns Whatever `printLlmBenchmarkResults` resolves to for the assembled rows.
 * @throws Error when the samples directory does not exist or no sample matches the filters.
 */
export async function runReport(options: LlmBenchmarkRunOptions) {
  const samplesRoot = resolveSamplesDir(options.samplesDir);
  if (!fs.existsSync(samplesRoot)) {
    throw new Error(`Samples directory not found: ${samplesRoot}`);
  }

  const samplePaths: string[] = [];
  for (const entry of fs.readdirSync(samplesRoot)) {
    if (entry.endsWith('.json') && entry !== 'report.json') {
      samplePaths.push(`${samplesRoot}/${entry}`);
    }
  }

  // `scenarios` takes precedence over the single `scenario` option.
  const scenarioFilter = options.scenarios?.length
    ? new Set(options.scenarios)
    : options.scenario
      ? new Set([options.scenario])
      : undefined;
  const modelFilter = options.models?.length ? new Set(options.models) : null;

  const isPlain = (sample: LlmBenchmarkSample): boolean => (sample.promptVariant ?? 'full') === 'plain';

  // Parse every file first, then filter, so a malformed file fails the run regardless of the filters.
  const loadedSamples = samplePaths.map(
    (samplePath) => JSON.parse(fs.readFileSync(samplePath, 'utf-8')) as LlmBenchmarkSample,
  );
  const parsedSamples = loadedSamples
    .filter(
      (sample) =>
        (!scenarioFilter || scenarioFilter.has(sample.scenario)) &&
        (!modelFilter || (sample.model != null && modelFilter.has(sample.model))),
    )
    .sort(
      (left, right) =>
        left.scenario.localeCompare(right.scenario) ||
        (left.model ?? '').localeCompare(right.model ?? '') ||
        Number(isPlain(left)) - Number(isPlain(right)) ||
        (left.runIndex ?? 1) - (right.runIndex ?? 1),
    );
  const sampleCount = parsedSamples.length;

  const judgeEnabled = options.llmJudge?.enabled === true;
  const judgeResults: Array<LlmJudgeResult | undefined> = [];
  if (judgeEnabled) {
    const toJudgeCount = parsedSamples.reduce((count, sample) => (isPlain(sample) ? count : count + 1), 0);
    console.log(
      `[bench][judge] enabled, samples=${sampleCount}, judgeCalls=${toJudgeCount}（纯文本样本跳过 Judge）`,
    );
    const concurrency = Math.max(1, options.concurrency ?? 2);
    // Shared cursor: each worker claims the next unprocessed index.
    let nextIndex = 0;
    const runWorker = async (): Promise<void> => {
      for (let index = nextIndex++; index < sampleCount; index = nextIndex++) {
        const sample = parsedSamples[index];
        if (isPlain(sample)) {
          judgeResults[index] = undefined;
          console.log(`[bench][judge] ${index + 1}/${sampleCount} ${sample.scenario} plain — skip Judge`);
          continue;
        }
        const verdict = await judgeOneSample(sample, options);
        judgeResults[index] = verdict;
        const scoreLabel = verdict.score == null ? '-' : verdict.score.toFixed(2);
        console.log(
          `[bench][judge] ${index + 1}/${sampleCount} ${sample.scenario} score=${scoreLabel}${verdict.error ? ' error' : ''}`,
        );
      }
    };
    const workerCount = Math.min(concurrency, sampleCount);
    await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  }

  const results: LlmBenchmarkResultItem[] = parsedSamples.map((sample, position) => {
    const judge = judgeEnabled ? judgeResults[position] : undefined;
    return toReportItem(sample, judge);
  });

  if (results.length === 0) {
    throw new Error('No samples matched the current filter');
  }

  const invalidSchemaRows = results.filter((row) => !row.isSchemaJsonValidAgainstProtocol);
  if (invalidSchemaRows.length > 0) {
    console.log(`\nSchema Validation Errors (all ${invalidSchemaRows.length})`);
    const tableRows = invalidSchemaRows.map((row) => ({
      scenario: row.scenario,
      variant: row.promptVariant ?? 'full',
      model: row.model ?? '',
      runIndex: row.runIndex ?? 1,
      schemaError: row.schemaValidationError ?? '',
    }));
    console.table(tableRows);
  }
  return await printLlmBenchmarkResults(results, options, parsedSamples);
}