/**
* Deep Research Evaluation for QMD
*
* Tests end-to-end retrieval quality: query → expansion → reranking → results
*
* These are HARD queries with NO exact keyword matches - they require
* semantic understanding via query expansion and reranking to succeed.
*
* Run: bun test/eval-deep-research.ts
*/
import { execSync } from "child_process";
import { readFileSync, existsSync } from "fs";
import { join, dirname } from "path";
import { fileURLToPath } from "url";
const __dirname = dirname(fileURLToPath(import.meta.url));
interface EvalQuery {
query: string;
expected_doc: string;
difficulty: string;
intent: string; // Domain context hint for future intent-aware retrieval
notes: string;
}
interface SearchResult {
file: string;
score: number;
title?: string;
}
function loadQueries(): EvalQuery[] {
const path = join(__dirname, "eval-deep-research.jsonl");
const content = readFileSync(path, "utf-8");
return content
.split("\n")
.filter((line) => line.trim())
.map((line) => JSON.parse(line));
}
function runBM25Search(query: string): SearchResult[] {
try {
const output = execSync(
`bun src/qmd.ts search "${query.replace(/"/g, '\\"')}" -c eval-docs --json -n 5 2>/dev/null`,
{ encoding: "utf-8", timeout: 30000 }
);
return JSON.parse(output);
} catch {
return [];
}
}
function runDeepResearch(query: string): SearchResult[] {
try {
const output = execSync(
`bun src/qmd.ts query "${query.replace(/"/g, '\\"')}" -c eval-docs --json -n 5 2>/dev/null`,
{ encoding: "utf-8", timeout: 120000 }
);
return JSON.parse(output);
} catch {
return [];
}
}
function matchesExpected(filepath: string, expectedDoc: string): boolean {
return filepath.toLowerCase().includes(expectedDoc.toLowerCase());
}
function findRank(results: SearchResult[], expectedDoc: string): number {
for (let i = 0; i < results.length; i++) {
if (matchesExpected(results[i]!.file, expectedDoc)) {
return i + 1;
}
}
return -1; // Not found
}
interface MethodResults {
hit1: number;
hit3: number;
hit5: number;
total: number;
details: { query: string; rank: number; expected: string; intent?: string }[];
}
function evaluate(
queries: EvalQuery[],
searchFn: (q: string) => SearchResult[],
label: string
): MethodResults {
const results: MethodResults = {
hit1: 0,
hit3: 0,
hit5: 0,
total: queries.length,
details: [],
};
console.log(`\n${"=".repeat(60)}`);
console.log(` ${label}`);
console.log(`${"=".repeat(60)}\n`);
for (const { query, expected_doc, intent, notes } of queries) {
const searchResults = searchFn(query);
const rank = findRank(searchResults, expected_doc);
results.details.push({ query, rank, expected: expected_doc, intent });
if (rank === 1) results.hit1++;
if (rank >= 1 && rank <= 3) results.hit3++;
if (rank >= 1 && rank <= 5) results.hit5++;
const status =
rank === 1 ? "✓" : rank > 0 && rank <= 3 ? `@${rank}` : rank > 0 ? `@${rank}` : "✗";
const statusPad = status.padEnd(4);
console.log(` ${statusPad} "${query.slice(0, 45).padEnd(45)}" → ${expected_doc}`);
if (rank === -1) {
console.log(` intent: ${intent} | ${notes}`);
}
}
const hit1Pct = ((results.hit1 / results.total) * 100).toFixed(0);
const hit3Pct = ((results.hit3 / results.total) * 100).toFixed(0);
const hit5Pct = ((results.hit5 / results.total) * 100).toFixed(0);
console.log(`\n ${"─".repeat(50)}`);
console.log(` Hit@1: ${hit1Pct}% (${results.hit1}/${results.total})`);
console.log(` Hit@3: ${hit3Pct}% (${results.hit3}/${results.total})`);
console.log(` Hit@5: ${hit5Pct}% (${results.hit5}/${results.total})`);
return results;
}
async function main() {
console.log("QMD Deep Research Evaluation");
console.log("=".repeat(60));
console.log("Testing hard queries that require semantic understanding.");
console.log("These have NO exact keyword matches in documents.");
// Check if eval-docs collection exists
try {
const status = execSync("bun src/qmd.ts status --json 2>/dev/null", {
encoding: "utf-8",
});
if (!status.includes("eval-docs")) {
console.log("\n⚠️ eval-docs collection not found. Run:");
console.log(" qmd collection add test/eval-docs --name eval-docs");
console.log(" qmd embed");
process.exit(1);
}
} catch {
console.log("\n⚠️ Could not check status. Make sure qmd is working.");
}
const queries = loadQueries();
console.log(`\nLoaded ${queries.length} hard queries.`);
// Run BM25 baseline (expected to fail on most)
const bm25Results = evaluate(queries, runBM25Search, "BM25 BASELINE (keyword search)");
// Run deep research (expected to succeed via expansion + reranking)
const deepResults = evaluate(queries, runDeepResearch, "DEEP RESEARCH (expansion + reranking)");
// Comparison
console.log(`\n${"=".repeat(60)}`);
console.log(" COMPARISON");
console.log(`${"=".repeat(60)}`);
console.log(`\n Method Hit@1 Hit@3 Hit@5`);
console.log(` ${"─".repeat(45)}`);
console.log(
` BM25 (baseline) ${((bm25Results.hit1 / bm25Results.total) * 100).toFixed(0).padStart(3)}% ${((bm25Results.hit3 / bm25Results.total) * 100).toFixed(0).padStart(3)}% ${((bm25Results.hit5 / bm25Results.total) * 100).toFixed(0).padStart(3)}%`
);
console.log(
` Deep Research ${((deepResults.hit1 / deepResults.total) * 100).toFixed(0).padStart(3)}% ${((deepResults.hit3 / deepResults.total) * 100).toFixed(0).padStart(3)}% ${((deepResults.hit5 / deepResults.total) * 100).toFixed(0).padStart(3)}%`
);
const improvement = deepResults.hit3 - bm25Results.hit3;
console.log(`\n Improvement (Hit@3): +${improvement} queries (${((improvement / bm25Results.total) * 100).toFixed(0)}%)`);
// Show queries where deep research recovered failures
const recovered = deepResults.details.filter(
(d) =>
d.rank >= 1 &&
d.rank <= 3 &&
bm25Results.details.find((b) => b.query === d.query)?.rank === -1
);
if (recovered.length > 0) {
console.log(`\n Recovered by expansion + reranking (${recovered.length}):`);
for (const { query, rank, expected } of recovered.slice(0, 5)) {
console.log(` @${rank} "${query.slice(0, 40)}..." → ${expected}`);
}
if (recovered.length > 5) {
console.log(` ... and ${recovered.length - 5} more`);
}
}
// Exit with error if deep research performs poorly
const deepHit3Pct = (deepResults.hit3 / deepResults.total) * 100;
if (deepHit3Pct < 60) {
console.log(`\n❌ Deep research Hit@3 < 60% (${deepHit3Pct.toFixed(0)}%)`);
process.exit(1);
} else {
console.log(`\n✓ Deep research Hit@3 >= 60% (${deepHit3Pct.toFixed(0)}%)`);
}
}
main();