atomcode/crates/atomcode-core/src/tool/grep.rs-代码预览-atomcode:基于终端的 AI 编码代理项目 - AtomGit

CContributorfix(grep): 修复 Windows 路径分隔符导致的噪声目录跳过失效
use anyhow::Result;
use async_trait::async_trait;
use ignore::WalkBuilder;
use regex::RegexBuilder;
use serde::Deserialize;
use serde_json::json;

use super::{ApprovalRequirement, Tool, ToolContext, ToolDef, ToolResult};

pub struct GrepTool;

#[derive(Deserialize)]
struct GrepArgs {
    pattern: String,
    path: Option<String>,
    #[serde(default = "default_max_results")]
    max_results: usize,
    #[serde(default = "default_context")]
    context: usize,
}

fn default_context() -> usize {
    3
}
fn default_max_results() -> usize {
    50
}

#[async_trait]
impl Tool for GrepTool {
    fn definition(&self) -> ToolDef {
        ToolDef {
            name: "grep",
            description: "Search file contents for a pattern. Returns matching lines with surrounding context.\n\
                Usage:\n\
                - Use this to find where a function, variable, string, or UI element is defined or used.\n\
                - Use this BEFORE editing when the user's request is ambiguous — find ALL candidates first.\n\
                - Pattern is regex by default (case-insensitive unless uppercase is used).\n\
                - Escape special regex chars: . → \\\\. , ( → \\\\( , [ → \\\\[\n\
                - If regex fails, the tool automatically retries with literal string matching.\n\
                - NEVER use bash grep/rg — always use this tool.\n\
                Examples:\n\
                - Find a function: {\"pattern\": \"def process_data\"}\n\
                - Find a string with dots: {\"pattern\": \"console\\\\.log\"}\n\
                - Find across alternatives: {\"pattern\": \"upload|上传\"}\n\
                - Search specific directory: {\"pattern\": \"import\", \"path\": \"src/views\"}".to_string(),
            parameters: json!({
                "type": "object",
                "properties": {
                    "pattern": { "type": "string", "description": "Search pattern (regex by default). Escape dots/parens: console\\.log\\(" },
                    "path": { "type": "string", "description": "Directory or file to search (default: working directory)" },
                    "max_results": { "type": "integer", "description": "Max results to return (default 50)" },
                    "context": { "type": "integer", "description": "Lines of context around each match (default 3)" }
                },
                "required": ["pattern"]
            }),
        }
    }

    fn approval(&self, _args: &str) -> ApprovalRequirement {
        ApprovalRequirement::AutoApprove
    }

    fn approval_with_context(&self, args: &str, ctx: &ToolContext) -> ApprovalRequirement {
        let parsed = match serde_json::from_str::<GrepArgs>(args) {
            Ok(parsed) => parsed,
            Err(_) => return self.approval(args),
        };
        let working_dir = match ctx.working_dir.try_read() {
            Ok(wd) => wd.clone(),
            Err(_) => return self.approval(args),
        };
        let raw_path = parsed.path.as_deref().unwrap_or(".");
        match super::approval_for_path(raw_path, &working_dir, super::ExternalPathAction::Read)
        {
            Ok(approval) => approval,
            Err(_) => self.approval(args),
        }
    }

    async fn execute(&self, args: &str, ctx: &ToolContext) -> Result<ToolResult> {
        let parsed: GrepArgs = serde_json::from_str(args)?;
        let path = parsed.path.as_deref().unwrap_or(".");

        let wd = ctx.working_dir.read().await.clone();

        // Graph enhancement: build a short header when pattern looks like an
        // identifier and no specific path is given (project-wide search).
        // The header is prepended to ripgrep results — it never replaces them.
        let graph_header = if parsed.path.is_none() {
            self.build_graph_header(&parsed.pattern, ctx, &wd).await
        } else {
            None
        };

        let max = parsed.max_results;
        let context_lines = parsed.context.min(10);
        let resolved = match super::inspect_path_access(path, &wd) {
            Ok(access) => access.path,
            Err(err) => {
                return Ok(ToolResult {
                    call_id: String::new(),
                    output: err.to_string(),
                    success: false,
                });
            }
        };

        if !resolved.exists() {
            return Ok(ToolResult {
                call_id: String::new(),
                output: format!("Path not found: {}", resolved.display()),
                success: false,
            });
        }

        // Build regex (smart-case: case-insensitive if pattern has no uppercase)
        let has_uppercase = parsed.pattern.chars().any(|c| c.is_uppercase());
        let re = match RegexBuilder::new(&parsed.pattern)
            .case_insensitive(!has_uppercase)
            .build()
        {
            Ok(r) => r,
            Err(_) => {
                // Regex failed — try as literal
                match RegexBuilder::new(&regex::escape(&parsed.pattern))
                    .case_insensitive(!has_uppercase)
                    .build()
                {
                    Ok(r) => r,
                    Err(e) => {
                        return Ok(ToolResult {
                            call_id: String::new(),
                            output: format!("Invalid pattern '{}': {}", parsed.pattern, e),
                            success: false,
                        });
                    }
                }
            }
        };

        // The `ignore` walk and per-file `std::fs::read_to_string` are BLOCKING
        // syscalls. On a hung filesystem (e.g. a dead network mount) they would
        // stall the async worker thread, so the turn-level `cancel.cancelled()`
        // race in execute_single_tool never gets polled and Esc can't cancel.
        // Run the whole traversal on the blocking pool so the executor stays
        // responsive: cancel aborts the turn promptly (the orphaned blocking
        // thread is abandoned and dies when the OS I/O finally errors).
        let (matches, files_searched) = tokio::task::spawn_blocking({
            let resolved = resolved.clone();
            let wd = wd.clone();
            move || grep_walk(&resolved, &re, &wd, max, context_lines)
        })
        .await
        .unwrap_or_else(|_| (Vec::new(), 0usize));

        // Annotate matching lines with enclosing function name (tree-sitter)
        let mut searcher = ctx.semantic.lock().await;
        let mut annotated: Vec<String> = Vec::new();
        let mut sym_cache: std::collections::HashMap<String, Vec<crate::semantic::Symbol>> =
            std::collections::HashMap::new();

        for line in &matches {
            let parts: Vec<&str> = line.splitn(3, ':').collect();
            if parts.len() >= 3 {
                if let Ok(line_no) = parts[1].parse::<usize>() {
                    let file = parts[0];
                    let abs_file = if std::path::Path::new(file).is_absolute() {
                        std::path::PathBuf::from(file)
                    } else {
                        wd.join(file)
                    };
                    let symbols = sym_cache
                        .entry(file.to_string())
                        .or_insert_with(|| searcher.list_symbols(&abs_file).unwrap_or_default());
                    if let Some(sym) = symbols
                        .iter()
                        .find(|s| line_no >= s.start_line && line_no <= s.end_line)
                    {
                        annotated.push(format!("{}  ← in {}()", line, sym.name));
                        continue;
                    }
                }
            }
            annotated.push(line.clone());
        }
        drop(searcher);

        let mut output = String::new();

        if let Some(header) = graph_header {
            output.push_str(&header);
            output.push('\n');
        }

        if annotated.is_empty() {
            output.push_str(&format!(
                "No matches found for '{}' in {}",
                parsed.pattern, path
            ));
            output.push_str(&format!(" ({} files searched)", files_searched));
        } else {
            let total = annotated.len();
            output.push_str(&annotated.join("\n"));
            if total >= max {
                output.push_str(&format!("\n\n[Results capped at {} matches]", max));
            }
        };

        // success=true even on zero matches: empty result IS the answer
        // (matches glob's "No files matching" semantics — same screenshot
        // showed glob's empty-result line rendering normally while grep's
        // identical-meaning line painted red ✗ as if the search failed).
        // Real failures (bad path / bad regex) take the early-return
        // branches above with their own success=false.
        Ok(ToolResult {
            call_id: String::new(),
            output,
            success: true,
        })
    }
}

impl GrepTool {
    /// Build a short graph-based header for the grep output.
    /// Returns None if graph is not ready, no identifier is extractable,
    /// or no symbols match.
    async fn build_graph_header(
        &self,
        pattern: &str,
        ctx: &ToolContext,
        wd: &std::path::Path,
    ) -> Option<String> {
        let query_word = extract_graph_candidates(pattern)?;
        let graph = ctx.graph.read().await;
        if !graph.is_ready() {
            return None;
        }

        let symbols = graph.find_by_name(&query_word);
        if symbols.is_empty() {
            return None;
        }

        let mut out = format!(
            "[Graph: {} definitions for '{}']\n",
            symbols.len(),
            query_word
        );
        for sym in symbols.iter().take(5) {
            let rel = sym
                .file
                .strip_prefix(wd)
                .unwrap_or(&sym.file)
                .to_string_lossy();
            out.push_str(&format!(
                "  {} {:?} in {}:{}\n",
                sym.name, sym.kind, rel, sym.start_line
            ));
        }

        Some(out)
    }
}

/// Synchronous file-tree walk + per-file regex scan, pulled out of
/// `GrepTool::execute` so it can run inside `tokio::task::spawn_blocking`
/// (the `ignore` crate has no async API and the per-file reads block).
/// Keeping it off the async worker keeps cancellation responsive while a
/// hung filesystem is traversed. Returns (formatted match lines, files searched).
fn grep_walk(
    resolved: &std::path::Path,
    re: &regex::Regex,
    wd: &std::path::Path,
    max: usize,
    context_lines: usize,
) -> (Vec<String>, usize) {
    let walker = WalkBuilder::new(resolved)
        .hidden(true)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .build();

    let mut matches: Vec<String> = Vec::new();
    let mut files_searched = 0usize;
    let mut match_count = 0usize;

    for entry in walker {
        let entry = match entry {
            Ok(e) => e,
            Err(_) => continue,
        };

        if !entry.file_type().map_or(false, |ft| ft.is_file()) {
            continue;
        }

        let file_path = entry.path();

        // Skip known noise directories/files not covered by .gitignore.
        // Use component-based checks instead of string contains so the logic
        // works correctly on Windows (backslash separators) as well as Unix.
        let is_noise_dir = file_path.components().any(|c| {
            matches!(c, std::path::Component::Normal(name) if {
                let name = name.to_string_lossy();
                name == "datalog" || name == "target" || name == "dist" || name == "node_modules"
            })
        });
        let is_log_file = file_path
            .extension()
            .and_then(|e| e.to_str())
            .map_or(false, |e| e.eq_ignore_ascii_case("log"));
        if is_noise_dir || is_log_file {
            continue;
        }

        // Read file (skip binary)
        let content = match std::fs::read_to_string(file_path) {
            Ok(c) => c,
            Err(_) => continue, // binary or unreadable
        };

        files_searched += 1;
        let lines: Vec<&str> = content.lines().collect();

        // Find matching lines
        let mut file_matches: Vec<usize> = Vec::new();
        for (i, line) in lines.iter().enumerate() {
            if re.is_match(line) {
                file_matches.push(i);
                if match_count + file_matches.len() >= max {
                    break;
                }
            }
        }

        if file_matches.is_empty() {
            continue;
        }

        // Format matches with context
        let rel_path = file_path
            .strip_prefix(wd)
            .unwrap_or(file_path)
            .to_string_lossy();

        let mut shown: std::collections::HashSet<usize> = std::collections::HashSet::new();
        for &match_line in &file_matches {
            let start = match_line.saturating_sub(context_lines);
            let end = (match_line + context_lines + 1).min(lines.len());

            // Separator between non-contiguous chunks
            if !shown.is_empty() && start > 0 && !shown.contains(&(start - 1)) {
                matches.push("--".to_string());
            }

            for i in start..end {
                if shown.contains(&i) {
                    continue;
                }
                shown.insert(i);

                let prefix = if i == match_line {
                    format!("{}:{}:", rel_path, i + 1)
                } else {
                    format!("{}-{}-", rel_path, i + 1)
                };
                matches.push(format!("{}{}", prefix, lines[i]));
            }
        }

        match_count += file_matches.len();
        if match_count >= max {
            break;
        }
    }
    (matches, files_searched)
}

/// Extract a likely code identifier from a grep pattern for graph lookup.
///
/// Only returns Some when the word strongly resembles a code symbol:
/// - Contains underscore (snake_case): "fetch_weather" → Some("fetch_weather")
/// - Contains mixed case (CamelCase): "SearchFilter" → Some("SearchFilter")
/// - Pure lowercase words like "error", "table", "search" are rejected to
///   avoid false positives with common English words.
///
/// Examples:
/// - "fetch_weather" → Some("fetch_weather")
/// - "SearchFilter"  → Some("SearchFilter")
/// - "NdarrayMixin"  → Some("NdarrayMixin")
/// - "weather|天气"  → None  (no identifier-like word)
/// - "Structured ndarray gets viewed" → None  (plain English)
/// - "error.*line" → None  (too generic)
/// - "console\\.log" → None  (too generic)
fn extract_graph_candidates(pattern: &str) -> Option<String> {
    let skip_keywords = [
        "pub", "fn", "struct", "enum", "impl", "use", "let", "const", "async", "trait", "type",
        "mod", "crate", "self", "super", "for", "def", "class", "function", "var", "import",
        "from", "export", "return", "match", "where", "static", "mut", "ref", "true", "false",
        "none", "some", "null", "this", "not", "and", "the", "with",
    ];

    let mut best: Option<String> = None;
    let mut best_len = 0;

    for word in pattern.split(|c: char| !c.is_ascii_alphanumeric() && c != '_') {
        let w = word.trim();
        if w.len() < 4 {
            continue;
        }
        if !w
            .chars()
            .next()
            .map(|c| c.is_ascii_alphabetic() || c == '_')
            .unwrap_or(false)
        {
            continue;
        }
        if !w.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
            continue;
        }
        if skip_keywords.contains(&w.to_lowercase().as_str()) {
            continue;
        }

        let has_underscore = w.contains('_');
        // Require real CamelCase (lower→upper transition like "SearchFilter")
        // not just a capitalized word ("Structured").
        let has_camel_transition = w
            .as_bytes()
            .windows(2)
            .any(|pair| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase());

        if !has_underscore && !has_camel_transition {
            continue;
        }

        if w.len() > best_len {
            best = Some(w.to_string());
            best_len = w.len();
        }
    }

    best
}

#[cfg(test)]
mod tests {
    use super::extract_graph_candidates;
    use super::GrepTool;
    use crate::tool::{ApprovalRequirement, Tool, ToolContext};
    use tempfile::TempDir;

    #[test]
    fn grep_outside_workspace_non_sensitive_requires_approval() {
        let workspace = TempDir::new().unwrap();
        let outside = TempDir::new().unwrap();
        let ctx = ToolContext::new(workspace.path().to_path_buf());
        let args = format!(
            r#"{{"pattern":"foo","path":"{}"}}"#,
            outside.path().display()
        );
        assert!(matches!(
            GrepTool.approval_with_context(&args, &ctx),
            ApprovalRequirement::RequireApproval(_)
        ));
    }

    #[test]
    fn grep_sensitive_path_still_requires_always() {
        let workspace = TempDir::new().unwrap();
        let ctx = ToolContext::new(workspace.path().to_path_buf());
        // /etc is in the system-protected prefixes list.
        let args = r#"{"pattern":"PermitRoot","path":"/etc"}"#;
        assert!(matches!(
            GrepTool.approval_with_context(args, &ctx),
            ApprovalRequirement::RequireApprovalAlways(_)
        ));
    }

    // Regression: zero-match grep used to return success=false, which
    // made the TUI render the result row red ✗ as if the search itself
    // failed (screenshot 43.png had two such red rows next to genuine
    // path-not-found errors, making the agent's normal exploration look
    // like a wall of failures). "No matches" is a valid answer — it's
    // the *path* / *regex* errors that are real failures (those still
    // return success=false via the early-return branches). Pin the
    // semantics so future churn doesn't regress.
    #[tokio::test]
    async fn grep_zero_matches_reports_success_true() {
        let workspace = TempDir::new().unwrap();
        // Seed a file so the search has something to walk; the pattern
        // intentionally won't match.
        std::fs::write(
            workspace.path().join("a.rs"),
            "fn alpha() {}\nfn beta() {}\n",
        )
        .unwrap();
        let ctx = ToolContext::new(workspace.path().to_path_buf());
        let args = r#"{"pattern":"definitely_not_in_file_xyz"}"#;
        let result = GrepTool.execute(args, &ctx).await.unwrap();
        assert!(
            result.success,
            "zero-match grep must return success=true so TUI doesn't \
             paint it red. Output was: {}",
            result.output
        );
        assert!(
            result.output.contains("No matches found"),
            "zero-match output should explain what happened, got: {}",
            result.output
        );
    }

    // Real failure modes (bad regex / missing path) MUST still be
    // success=false — those genuinely indicate the model needs to
    // change inputs, not just learn that "no rows match".
    #[tokio::test]
    async fn grep_path_not_found_reports_success_false() {
        let workspace = TempDir::new().unwrap();
        let ctx = ToolContext::new(workspace.path().to_path_buf());
        let args = r#"{"pattern":"foo","path":"/nonexistent/path/xyz123"}"#;
        let result = GrepTool.execute(args, &ctx).await.unwrap();
        assert!(
            !result.success,
            "path-not-found must remain success=false — output: {}",
            result.output
        );
    }

    // Default path "." resolves to the workspace itself — must auto-approve.
    #[test]
    fn grep_default_path_auto_approves() {
        let workspace = TempDir::new().unwrap();
        let ctx = ToolContext::new(workspace.path().to_path_buf());
        let args = r#"{"pattern":"foo"}"#;
        assert!(matches!(
            GrepTool.approval_with_context(args, &ctx),
            ApprovalRequirement::AutoApprove
        ));
    }

    #[test]
    fn snake_case_identifier() {
        assert_eq!(
            extract_graph_candidates("fetch_weather"),
            Some("fetch_weather".into())
        );
    }

    #[test]
    fn camel_case_identifier() {
        assert_eq!(
            extract_graph_candidates("SearchFilter"),
            Some("SearchFilter".into())
        );
        assert_eq!(
            extract_graph_candidates("NdarrayMixin"),
            Some("NdarrayMixin".into())
        );
    }

    #[test]
    fn mixed_cjk_ascii_no_identifier() {
        assert_eq!(extract_graph_candidates("weather|天气"), None);
    }

    #[test]
    fn plain_english_rejected() {
        assert_eq!(
            extract_graph_candidates("Structured ndarray gets viewed"),
            None
        );
        assert_eq!(extract_graph_candidates("error"), None);
        assert_eq!(extract_graph_candidates("table"), None);
        assert_eq!(extract_graph_candidates("search"), None);
        assert_eq!(extract_graph_candidates("console"), None);
    }

    #[test]
    fn regex_pattern_rejected() {
        assert_eq!(extract_graph_candidates("error.*line"), None);
        assert_eq!(extract_graph_candidates("console\\.log"), None);
    }

    #[test]
    fn keywords_rejected() {
        assert_eq!(extract_graph_candidates("pub struct"), None);
        assert_eq!(extract_graph_candidates("import from"), None);
    }

    #[test]
    fn keyword_then_identifier() {
        assert_eq!(
            extract_graph_candidates("pub struct QueryIntent"),
            Some("QueryIntent".into())
        );
        assert_eq!(
            extract_graph_candidates("def process_data"),
            Some("process_data".into())
        );
    }

    #[test]
    fn pure_chinese_rejected() {
        assert_eq!(extract_graph_candidates("(科技|财经|体育)"), None);
    }

    #[test]
    fn short_words_rejected() {
        assert_eq!(extract_graph_candidates("fn"), None);
        assert_eq!(extract_graph_candidates("FnX"), None); // 3 chars, below threshold of 4
    }

    #[test]
    fn or_pattern_picks_best_identifier() {
        assert_eq!(
            extract_graph_candidates("SearchFilter|from_intent"),
            Some("SearchFilter".into()),
        );
    }

    #[test]
    fn not_prefix_rejected() {
        // "not data_is_mixin" — "not" is a keyword, "data_is_mixin" has underscore → match
        assert_eq!(
            extract_graph_candidates("not data_is_mixin"),
            Some("data_is_mixin".into())
        );
    }
}