qmd/test/ast-chunking.test.ts-代码预览-qmd:基于本地技术的文档搜索引擎项目 - AtomGit

Tobias Lütke — chore: migrate AST chunking tests to vitest
/**
 * Integration tests for AST-aware chunking.
 *
 * Migrated from the standalone test-ast-chunking.mjs script into the
 * vitest suite. Covers the integration between AST break point extraction
 * and the chunking pipeline — areas not tested by the unit-level ast.test.ts.
 */

import { describe, test, expect } from "vitest";
import { getASTBreakPoints } from "../src/ast.js";
import {
  chunkDocument,
  chunkDocumentAsync,
  chunkDocumentWithBreakPoints,
  mergeBreakPoints,
  scanBreakPoints,
  findCodeFences,
} from "../src/store.js";

// ==========================================================================
// mergeBreakPoints
// ==========================================================================

describe("mergeBreakPoints", () => {
  // Positions 10 and 100 appear in both inputs so this test exercises the
  // same-position tie-break in addition to the plain union of positions.
  test("merges regex and AST break points, higher score wins at same position", () => {
    const regexPoints = [
      { pos: 10, score: 20, type: "blank" },
      { pos: 50, score: 1, type: "newline" },
      { pos: 100, score: 20, type: "blank" },
    ];
    const astPoints = [
      { pos: 10, score: 90, type: "ast:func" },
      { pos: 75, score: 100, type: "ast:class" },
      { pos: 100, score: 60, type: "ast:import" },
    ];

    const merged = mergeBreakPoints(regexPoints, astPoints);

    // Four distinct positions remain: 10, 50, 75 and 100.
    expect(merged).toHaveLength(4);
    expect(merged.find(p => p.pos === 10)?.score).toBe(90);   // AST wins (90 > 20)
    expect(merged.find(p => p.pos === 50)?.score).toBe(1);    // regex only
    expect(merged.find(p => p.pos === 75)?.score).toBe(100);  // AST only
    expect(merged.find(p => p.pos === 100)?.score).toBe(60);  // AST wins (60 > 20)
  });

  test("result is sorted by position", () => {
    // The inputs are deliberately supplied in descending position order.
    const merged = mergeBreakPoints(
      [{ pos: 100, score: 10, type: "a" }],
      [{ pos: 5, score: 50, type: "b" }],
    );

    // Comparing the whole position list fails with a readable diff when a
    // point is missing or misplaced, rather than throwing a TypeError on an
    // undefined element as an unchecked merged[0]!/merged[1]! access would.
    expect(merged.map(p => p.pos)).toEqual([5, 100]);
  });
});

// ==========================================================================
// AST vs Regex chunking comparison
// ==========================================================================

describe("AST vs Regex chunking", () => {
  // Number of synthetic handlers in the fixture. Shared by the generator and
  // by countSplitFunctions so the two cannot drift apart.
  const HANDLER_COUNT = 30;

  // Generate a large TS file made of HANDLER_COUNT near-identical functions.
  const parts: string[] = [];
  for (let i = 0; i < HANDLER_COUNT; i++) {
    parts.push(`
export function handler${i}(req: Request, res: Response): void {
  const startTime = Date.now();
  const userId = req.params.userId;
  const sessionToken = req.headers.authorization;

  if (!userId || !sessionToken) {
    res.status(400).json({ error: "Missing required parameters" });
    return;
  }

  console.log(\`Processing request ${i} for user \${userId}\`);
  const result = processBusinessLogic${i}(userId, sessionToken);

  const elapsed = Date.now() - startTime;
  res.json({ data: result, processingTimeMs: elapsed });
}
`);
  }
  const largeTS = parts.join("\n");

  /**
   * Counts how many fixture handlers are covered by more than one chunk.
   *
   * A handler's span runs from its `function handlerN(` text up to the same
   * text of the next handler, or to the end of the document for the last one.
   * A chunk covers a handler when the half-open ranges
   * [pos, pos + text.length) and [funcStart, funcEnd) intersect.
   *
   * @param chunks - Chunks of `largeTS`; `pos` is treated as the chunk's start offset.
   * @returns The number of handlers that intersect two or more chunks.
   * @throws Error if a handler cannot be located in the fixture.
   */
  function countSplitFunctions(chunks: { text: string; pos: number }[]): number {
    let splits = 0;
    for (let i = 0; i < HANDLER_COUNT; i++) {
      const funcStart = largeTS.indexOf(`function handler${i}(`);
      if (funcStart === -1) {
        // Without this guard a missing handler would be measured as the
        // range [-1, funcEnd) and silently skew the count.
        throw new Error(`handler${i} not found in fixture`);
      }
      const nextFunc = largeTS.indexOf(`function handler${i + 1}(`, funcStart + 1);
      // indexOf reports "not found" as -1; the last handler runs to the end.
      const funcEnd = nextFunc === -1 ? largeTS.length : nextFunc;
      const overlapping = chunks.filter(
        chunk => chunk.pos < funcEnd && chunk.pos + chunk.text.length > funcStart,
      );
      if (overlapping.length > 1) splits++;
    }
    return splits;
  }

  test("AST splits fewer functions across chunk boundaries than regex", async () => {
    const regexChunks = chunkDocument(largeTS);
    const astChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "auto");

    // Precondition: the fixture must actually be split, otherwise both counts
    // are zero and the comparison below passes without testing anything.
    expect(regexChunks.length).toBeGreaterThan(1);

    const regexSplits = countSplitFunctions(regexChunks);
    const astSplits = countSplitFunctions(astChunks);

    // AST-aware chunking must never be worse than regex-only chunking.
    expect(astSplits).toBeLessThanOrEqual(regexSplits);
  });

  test("markdown files produce identical chunks in auto vs regex mode", async () => {
    const sections: string[] = [];
    for (let i = 0; i < 15; i++) {
      sections.push(`# Section ${i}\n\n${"Lorem ipsum dolor sit amet. ".repeat(40)}\n`);
    }
    const largeMD = sections.join("\n");

    const mdRegex = chunkDocument(largeMD);
    const mdAst = await chunkDocumentAsync(largeMD, undefined, undefined, undefined, "readme.md", "auto");

    expect(mdAst).toHaveLength(mdRegex.length);
    for (let i = 0; i < mdRegex.length; i++) {
      expect(mdAst[i]?.text).toBe(mdRegex[i]?.text);
      expect(mdAst[i]?.pos).toBe(mdRegex[i]?.pos);
    }
  });

  test("regex strategy bypasses AST entirely", async () => {
    const regexOnly = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, "handlers.ts", "regex");
    const syncRegex = chunkDocument(largeTS);

    expect(regexOnly).toHaveLength(syncRegex.length);
    for (let i = 0; i < syncRegex.length; i++) {
      expect(regexOnly[i]?.text).toBe(syncRegex[i]?.text);
      // Offsets are compared as well, matching the markdown equivalence test.
      expect(regexOnly[i]?.pos).toBe(syncRegex[i]?.pos);
    }
  });

  test("no filepath falls back to regex", async () => {
    const noPathChunks = await chunkDocumentAsync(largeTS, undefined, undefined, undefined, undefined, "auto");
    const syncRegex = chunkDocument(largeTS);
    expect(noPathChunks).toHaveLength(syncRegex.length);
  });

  test("small file produces single chunk", async () => {
    const smallChunks = await chunkDocumentAsync("export const x = 1;", undefined, undefined, undefined, "s.ts", "auto");
    expect(smallChunks).toHaveLength(1);
  });
});

// ==========================================================================
// chunkDocumentWithBreakPoints equivalence
// ==========================================================================

describe("chunkDocumentWithBreakPoints equivalence", () => {
  test("produces identical output to chunkDocument for the same content", () => {
    // Two 5000-character runs separated by a single blank line.
    const content = "a".repeat(5000) + "\n\n" + "b".repeat(5000);

    const baseline = chunkDocument(content);
    // Feed the explicit-break-point variant the break points and code fences
    // scanned from the same content.
    const breakPoints = scanBreakPoints(content);
    const fences = findCodeFences(content);
    const explicit = chunkDocumentWithBreakPoints(content, breakPoints, fences);

    expect(explicit).toHaveLength(baseline.length);
    baseline.forEach((expected, idx) => {
      expect(explicit[idx]?.text).toBe(expected?.text);
      expect(explicit[idx]?.pos).toBe(expected?.pos);
    });
  });
});

// ==========================================================================
// Score assertions not covered by ast.test.ts unit tests
// ==========================================================================

describe("AST break point scores", () => {
  // One row per language: the source snippet, the file name passed alongside
  // it, the break point type to look up, and the score expected for it.
  const scoreCases = [
    {
      title: "TypeScript export (class) scores 90",
      code: `export class Foo {}\nexport function bar() {}`,
      file: "a.ts",
      type: "ast:export",
      score: 90,
    },
    {
      title: "Python class scores 100",
      code: `class Foo:\n    pass\n\ndef bar():\n    pass`,
      file: "a.py",
      type: "ast:class",
      score: 100,
    },
    {
      title: "Go type scores 80",
      code: `package main\n\ntype Server struct {\n    port int\n}\n\nfunc main() {}`,
      file: "a.go",
      type: "ast:type",
      score: 80,
    },
    {
      title: "Rust enum scores 80",
      code: `enum State {\n    On,\n    Off,\n}\n\nfn main() {}`,
      file: "a.rs",
      type: "ast:enum",
      score: 80,
    },
  ];

  for (const { title, code, file, type, score } of scoreCases) {
    test(title, async () => {
      const points = await getASTBreakPoints(code, file);
      // The first break point of the requested type carries the score under test.
      const match = points.find(p => p.type === type);
      expect(match?.score).toBe(score);
    });
  }
});