nop-entropy/ai-dev/tools/check-docs-garbled.mjs-代码预览-nop-entropy:基于可逆计算的低代码开发平台后端，支持领域语言与云原生部署 - AtomGit

canonical-entropychore(tools): 新增check-plan-checklist工具 + 增强check-doc-links源码路径检查
#!/usr/bin/env node

/**
 * check-docs-garbled.mjs
 *
 * Detect garbled/corrupted characters in documentation files.
 * Scans docs-for-ai/ and ai-dev/ for suspicious Unicode patterns.
 *
 * Usage:
 *   node check-docs-garbled.mjs
 *
 * Adapted from nop-chaos-flux/scripts/check-docs-garbled.mjs
 */

import { execFile } from 'node:child_process';
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { promisify } from 'node:util';

// Promise-returning wrapper around child_process.execFile.
const execFileAsync = promisify(execFile);

// Locations are resolved relative to this script: PROJECT_ROOT is two
// directory levels up, OUTPUT_DIR is a sibling `_tmp` directory one level up.
// fileURLToPath is used instead of URL#pathname because `pathname` stays
// percent-encoded (spaces, non-ASCII directory names) and keeps a leading
// slash before Windows drive letters, neither of which is a usable fs path.
const PROJECT_ROOT = fileURLToPath(new URL('../..', import.meta.url));
const OUTPUT_DIR = fileURLToPath(new URL('../_tmp/docs-garbled-check', import.meta.url));
// Directories (relative to PROJECT_ROOT) whose tracked files are scanned.
const SCAN_DIRS = ['docs-for-ai', 'ai-dev'];

// File extensions (lowercase, leading dot included) that are scanned as text.
const textExtensions = new Set([
  '.md',
  '.mdx',
  '.txt',
  '.json',
  '.yaml',
  '.yml',
  '.html',
  '.xml',
]);

// Individual characters that get a dedicated occurrence type, keyed by the
// character itself. Insertion order matches the listing order below.
const suspiciousSingles = new Map(Object.entries({
  '\uFFFD': 'replacement-char',
  '\uFEFF': 'bom',
  '\u200B': 'zero-width-space',
  '\u200C': 'zero-width-non-joiner',
  '\u200D': 'zero-width-joiner',
  '\u2060': 'word-joiner',
}));

// Matches short character sequences anywhere in a context snippet; presumably
// these are the pairs produced when UTF-8 bytes are decoded with a legacy
// single-byte encoding (mojibake). Used to raise the score/confidence of a hit.
// NOTE(review): the two `â€"` alternatives are identical as written here and
// are both already covered by the shorter `â€` alternative — possibly two
// distinct dash sequences were flattened at some point; confirm against the
// upstream script before editing.
const mojibakePattern = /(?:Ã.|Â.|â€¦|â€"|â€"|â€|ðŸ|Ð.|Ñ.|æ.|ç.|ä.|å.)/u;
// The patterns below are anchored at both ends, so each one classifies a
// string consisting of exactly one code point by Unicode property.
const controlPattern = /^\p{Control}$/u;
const formatPattern = /^\p{Format}$/u;
const privateUsePattern = /^\p{Private_Use}$/u;
const noncharacterPattern = /^\p{Noncharacter_Code_Point}$/u;
const letterPattern = /^\p{Letter}$/u;
const numberPattern = /^\p{Number}$/u;
const punctuationPattern = /^\p{Punctuation}$/u;
const symbolPattern = /^\p{Symbol}$/u;
const separatorPattern = /^\p{Separator}$/u;
const markPattern = /^\p{Mark}$/u;
// Han ideographs and ASCII letters are the only letters treated as expected.
const hanPattern = /^\p{Script=Han}$/u;
const asciiLetterPattern = /^[A-Za-z]$/u;

/**
 * Convert a UTF-16 offset into a 1-based line/column pair.
 *
 * Only '\n' starts a new line; every other code unit before `index`
 * (including '\r') advances the column by one.
 *
 * @param {string} content - Full text being scanned.
 * @param {number} index - UTF-16 code-unit offset into `content`.
 * @returns {{line: number, column: number}} 1-based position of `index`.
 */
function getLineColumn(content, index) {
  let line = 1;
  let lineStart = 0;
  let newlineAt = content.indexOf('\n');
  // Hop from newline to newline instead of visiting every code unit.
  while (newlineAt !== -1 && newlineAt < index) {
    line += 1;
    lineStart = newlineAt + 1;
    newlineAt = content.indexOf('\n', lineStart);
  }
  const column = Math.max(0, index - lineStart) + 1;
  return { line, column };
}

/**
 * Return the text of a 1-based line, without its line terminator.
 *
 * @param {string} content - Full text; lines are separated by LF or CRLF.
 * @param {number} lineNumber - 1-based line number.
 * @returns {string} The line's text, or '' when the line does not exist.
 */
function getLineText(content, lineNumber) {
  const lines = content.split(/\r?\n/u);
  const wanted = lines[lineNumber - 1];
  return wanted === undefined ? '' : wanted;
}

function getContextSnippet(lineText, column) {
  return lineText.slice(Math.max(0, column - 21), Math.min(lineText.length, column + 20));
}

/**
 * Format the first code point of a string as `U+XXXX` (uppercase hex,
 * zero-padded to at least four digits).
 *
 * @param {string} character - String whose first code point is labelled.
 * @returns {string} Label such as `U+0041` or `U+1F600`.
 */
function getCodePointLabel(character) {
  const hexDigits = character.codePointAt(0).toString(16).toUpperCase();
  const padded = hexDigits.padStart(4, '0');
  return `U+${padded}`;
}

/**
 * Decide whether a character is considered normal documentation content.
 *
 * Allowed: LF, CR, TAB, printable ASCII (space through '~'), Han ideographs,
 * a BOM at offset 0, and anything in the Number, Punctuation, Symbol,
 * Separator or Mark categories.
 *
 * @param {string} character - A single code point as a string.
 * @param {number} index - Offset of the character within the file content.
 * @returns {boolean} True when the character needs no further classification.
 */
function isAllowedCharacter(character, index) {
  if (['\n', '\r', '\t'].includes(character)) {
    return true;
  }

  const isPrintableAscii = character >= ' ' && character <= '~';
  if (isPrintableAscii) {
    return true;
  }

  // A byte-order mark is tolerated only as the very first character.
  const isLeadingBom = index === 0 && character === '\uFEFF';
  if (isLeadingBom) {
    return true;
  }

  const allowedClasses = [
    hanPattern,
    numberPattern,
    punctuationPattern,
    symbolPattern,
    separatorPattern,
    markPattern,
  ];
  return allowedClasses.some((pattern) => pattern.test(character));
}

/**
 * Classify a character as a suspicious occurrence type, or null when it is
 * acceptable.
 *
 * Checks run in priority order: allowed characters, the dedicated
 * single-character table, then the Control / Format / Private_Use /
 * Noncharacter categories, then letters (ASCII and Han letters are fine,
 * any other letter is 'unexpected-letter'), and finally 'unexpected-char'.
 *
 * @param {string} character - A single code point as a string.
 * @param {number} index - Offset of the character within the file content.
 * @returns {string|null} Occurrence type, or null when nothing is wrong.
 */
function classifyCharacter(character, index) {
  if (isAllowedCharacter(character, index)) {
    return null;
  }

  const dedicatedType = suspiciousSingles.get(character);
  if (dedicatedType !== undefined) {
    return dedicatedType;
  }

  const categoryTypes = [
    [controlPattern, 'control-char'],
    [formatPattern, 'format-char'],
    [privateUsePattern, 'private-use-char'],
    [noncharacterPattern, 'noncharacter'],
  ];
  for (const [pattern, type] of categoryTypes) {
    if (pattern.test(character)) {
      return type;
    }
  }

  if (!letterPattern.test(character)) {
    return 'unexpected-char';
  }
  const isExpectedLetter = asciiLetterPattern.test(character) || hanPattern.test(character);
  return isExpectedLetter ? null : 'unexpected-letter';
}

/**
 * Weight a single occurrence by how strongly it suggests corruption.
 *
 * 5 for replacement/control/private-use/noncharacter hits, 4 for zero-width
 * space/joiners, 3 for format chars, word joiners and BOMs. An unexpected
 * letter scores 4 when its context matches the mojibake pattern, else 2.
 * Everything else scores 2.
 *
 * @param {string} type - Occurrence type from classifyCharacter.
 * @param {string} context - Text surrounding the occurrence.
 * @returns {number} Score contribution of this occurrence.
 */
function scoreOccurrence(type, context) {
  switch (type) {
    case 'replacement-char':
    case 'control-char':
    case 'private-use-char':
    case 'noncharacter':
      return 5;
    case 'zero-width-space':
    case 'zero-width-non-joiner':
    case 'zero-width-joiner':
      return 4;
    case 'format-char':
    case 'word-joiner':
    case 'bom':
      return 3;
    case 'unexpected-letter':
      return mojibakePattern.test(context) ? 4 : 2;
    default:
      return 2;
  }
}

/**
 * Summarise a file's occurrences into a verdict.
 *
 * The score is the sum of scoreOccurrence over all occurrences. The file is
 * 'likely-garbled' when any occurrence is of a high-confidence type
 * (replacement/control/private-use/noncharacter), when any context matches
 * the mojibake pattern, or when the score reaches 6; otherwise 'needs-review'.
 *
 * @param {Array<{type: string, context: string}>} occurrences - Hits for one file.
 * @returns {{status: string, score: number}} Verdict and accumulated score.
 */
function buildVerdict(occurrences) {
  const highConfidenceTypes = new Set([
    'replacement-char',
    'control-char',
    'private-use-char',
    'noncharacter',
  ]);

  const score = occurrences.reduce(
    (total, { type, context }) => total + scoreOccurrence(type, context),
    0,
  );
  const hasHighConfidence = occurrences.some(
    ({ type, context }) => highConfidenceTypes.has(type) || mojibakePattern.test(context),
  );

  const status = hasHighConfidence || score >= 6 ? 'likely-garbled' : 'needs-review';
  return { status, score };
}

/**
 * List the git-tracked text files under SCAN_DIRS.
 *
 * Runs `git ls-files -z` in PROJECT_ROOT and keeps the paths whose extension
 * (compared case-insensitively) is in `textExtensions`.
 *
 * @returns {Promise<string[]>} Paths relative to PROJECT_ROOT, using '/' as
 *   separator; an empty list when git cannot be run.
 */
async function getDocFiles() {
  let stdout;
  try {
    // -z makes git emit NUL-terminated, unquoted paths. Without it, paths
    // containing non-ASCII characters are C-quoted by default and would never
    // match an extension. `--` keeps directory names from being read as options.
    ({ stdout } = await execFileAsync('git', ['ls-files', '-z', '--', ...SCAN_DIRS], {
      cwd: PROJECT_ROOT, maxBuffer: 20 * 1024 * 1024,
    }));
  } catch (error) {
    // Stay best-effort (no files to scan), but make the failure visible so a
    // broken git invocation is not mistaken for a clean scan.
    console.warn(`[check-docs-garbled] git ls-files failed: ${error?.message ?? error}`);
    return [];
  }

  return stdout.split('\0').filter(Boolean).filter((file) => {
    const dotAt = file.lastIndexOf('.');
    // A name without any dot has no extension at all.
    const extension = dotAt === -1 ? '' : file.slice(dotAt).toLowerCase();
    return textExtensions.has(extension);
  });
}

/**
 * Read one file as UTF-8 and collect every suspicious character in it.
 *
 * @param {string} relativePath - '/'-separated path relative to the directory
 *   two levels above this script.
 * @returns {Promise<{relativePath: string, occurrences: Array<object>}>}
 *   Each occurrence carries type, character, codePoint label, 1-based line
 *   and column (in UTF-16 code units), and a context snippet.
 */
async function scanFile(relativePath) {
  // Percent-encode each segment so characters that are special inside a URL
  // ('#', '?', '%', '\\', spaces) are treated as literal file-name characters
  // instead of silently pointing the read at a different path.
  const encodedPath = relativePath.split('/').map(encodeURIComponent).join('/');
  const content = await readFile(new URL(`../../${encodedPath}`, import.meta.url), 'utf8');

  const occurrences = [];
  let lines = null; // split lazily, only once a first occurrence is found
  let line = 1;
  let column = 1;
  for (let index = 0; index < content.length;) {
    const codePoint = content.codePointAt(index);
    const character = String.fromCodePoint(codePoint);
    // Code points above the BMP occupy two UTF-16 code units.
    const width = codePoint > 0xffff ? 2 : 1;
    const type = classifyCharacter(character, index);
    if (type) {
      if (lines === null) lines = content.split(/\r?\n/u);
      const lineText = lines[line - 1] ?? '';
      occurrences.push({ type, character, codePoint: getCodePointLabel(character), line, column, context: getContextSnippet(lineText, column) });
    }
    // Track the position incrementally rather than rescanning from the start
    // of the file for every occurrence. Only '\n' starts a new line.
    if (character === '\n') {
      line += 1;
      column = 1;
    } else {
      column += width;
    }
    index += width;
  }
  return { relativePath, occurrences };
}

/**
 * Entry point: scan all doc files, write `verdicts.json` into OUTPUT_DIR,
 * print a summary, and set the process exit code to 1 when at least one file
 * is classified as 'likely-garbled' (0 otherwise).
 *
 * @returns {Promise<void>}
 */
async function main() {
  await mkdir(OUTPUT_DIR, { recursive: true });
  const files = await getDocFiles();
  const scans = await Promise.all(files.map(f => scanFile(f)));
  const candidates = scans.filter(s => s.occurrences.length > 0).map(item => ({
    file: item.relativePath,
    occurrenceCount: item.occurrences.length,
    occurrenceTypes: [...new Set(item.occurrences.map(o => o.type))],
    occurrences: item.occurrences,
  })).sort((a, b) => a.file.localeCompare(b.file));

  const verdicts = candidates.map(c => {
    const v = buildVerdict(c.occurrences);
    // Only the first 20 occurrences are kept as examples to bound report size.
    return { file: c.file, status: v.status, score: v.score, occurrenceCount: c.occurrenceCount, occurrenceTypes: c.occurrenceTypes, examples: c.occurrences.slice(0, 20) };
  });

  // This file is an ES module, where `require` is not defined; use the
  // imported `path` module instead.
  const reportPath = path.join(OUTPUT_DIR, 'verdicts.json');
  await writeFile(reportPath, JSON.stringify(verdicts, null, 2) + '\n');

  const likelyCount = verdicts.filter(v => v.status === 'likely-garbled').length;
  const reviewCount = verdicts.filter(v => v.status === 'needs-review').length;
  console.log(`[check-docs-garbled] Scanned ${files.length} docs files`);
  console.log(`[check-docs-garbled] Candidates: ${candidates.length}, Likely garbled: ${likelyCount}, Needs review: ${reviewCount}`);

  if (verdicts.length > 0) {
    console.log(`[check-docs-garbled] Report: ${reportPath}`);
  }
  // Set the exit code instead of calling process.exit() so pending stdout
  // writes are flushed before the process ends.
  process.exitCode = likelyCount > 0 ? 1 : 0;
}

main().catch((error) => {
  // Report unexpected failures explicitly; the exit code stays non-zero.
  console.error('[check-docs-garbled] Failed:', error);
  process.exitCode = 1;
});