use crate::config::Config;
use crate::conversation::message::{ImagePart, Message, MessageContent, Role};
use crate::provider::{create_provider, model_name_suggests_vision, LlmProvider};
use futures::StreamExt;
/// Result of attempting to turn attached images into text via a vision provider.
///
/// `PartialEq`/`Eq` are derived so outcomes can be compared directly (for
/// example with `assert_eq!`); every payload field is a plain `String`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PreprocessOutcome {
    /// No preprocessing was attempted.
    Skipped,
    /// The vision provider produced a non-blank description.
    Replaced {
        /// Description returned by the vision provider, with surrounding
        /// whitespace trimmed.
        text: String,
        /// Key in `config.providers` of the vision provider that was used.
        vl_key: String,
    },
    /// Preprocessing was attempted but did not yield a usable description.
    Failed {
        /// Human-readable explanation of what went wrong.
        reason: String,
    },
}
/// Asks a dedicated vision provider to describe `images` when the active model
/// is not expected to accept them itself.
///
/// # Parameters
/// - `config`: supplies `vision_preprocessor_provider` (the key of the vision
///   provider) and the `providers` map that key is looked up in.
/// - `active_provider`: the provider handling the conversation; only its
///   `model_name()` is consulted.
/// - `caption`: text accompanying the images. When it is not blank it is quoted
///   at the top of the prompt sent to the vision provider.
/// - `images`: the images to describe; all of them are sent in one request.
///
/// # Returns
/// - [`PreprocessOutcome::Skipped`] when `images` is empty, when
///   `model_name_suggests_vision` returns `true` for the active model name, or
///   when `vision_preprocessor_provider` is unset or an empty string.
/// - [`PreprocessOutcome::Failed`] when the configured key is absent from
///   `config.providers`, the provider cannot be built, the stream cannot be
///   opened, the stream yields an error, no stream event arrives within the
///   idle timeout, or the collected text is blank.
/// - [`PreprocessOutcome::Replaced`] with the trimmed text and the provider key
///   otherwise.
pub async fn maybe_preprocess(
    config: &Config,
    active_provider: &dyn LlmProvider,
    caption: &str,
    images: &[ImagePart],
) -> PreprocessOutcome {
    /// Longest wait tolerated for the next stream event before giving up.
    const IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
    /// Instruction sent with every request, with or without a caption.
    const DESCRIBE_INSTRUCTION: &str =
        "请详细描述这张图片的内容。如果是代码、报错截图或终端输出,请逐字转录文本。";

    if images.is_empty() || model_name_suggests_vision(active_provider.model_name()) {
        return PreprocessOutcome::Skipped;
    }

    // An unset key and an empty key both mean "feature disabled".
    let Some(vl_key) = config
        .vision_preprocessor_provider
        .as_deref()
        .filter(|key| !key.is_empty())
    else {
        return PreprocessOutcome::Skipped;
    };

    // Borrow the provider config straight from the map; `create_provider` only
    // needs a reference, so no clone is required.
    let Some(vl_cfg) = config.providers.get(vl_key) else {
        return PreprocessOutcome::Failed {
            reason: format!("VL provider '{vl_key}' not found in config.providers"),
        };
    };
    let vl_provider = match create_provider(vl_cfg) {
        Ok(provider) => provider,
        Err(e) => {
            return PreprocessOutcome::Failed {
                reason: format!("VL provider build failed: {e:#}"),
            };
        }
    };

    let prompt = if caption.trim().is_empty() {
        DESCRIBE_INSTRUCTION.to_string()
    } else {
        format!("用户的当前请求:{caption}\n\n{DESCRIBE_INSTRUCTION}")
    };
    let messages = vec![Message {
        role: Role::User,
        content: MessageContent::MultiPart {
            text: Some(prompt),
            images: images.to_vec(),
        },
        synthetic: false,
    }];

    let mut stream = match vl_provider.chat_stream(&messages, None) {
        Ok(stream) => stream,
        Err(e) => {
            return PreprocessOutcome::Failed {
                reason: format!("provider '{vl_key}' stream init failed: {e:#}"),
            };
        }
    };

    let mut description = String::new();
    loop {
        // The timeout is applied per event, so it bounds the gap between
        // events rather than the total duration of the call.
        let Ok(item) = tokio::time::timeout(IDLE_TIMEOUT, stream.next()).await else {
            return PreprocessOutcome::Failed {
                reason: format!(
                    "provider '{vl_key}' no progress for {}s",
                    IDLE_TIMEOUT.as_secs(),
                ),
            };
        };
        let event = match item {
            // The stream ending without an explicit `Done` also counts as completion.
            None => break,
            Some(Ok(event)) => event,
            Some(Err(e)) => {
                return PreprocessOutcome::Failed {
                    reason: format!("provider '{vl_key}' call error: {e:#}"),
                };
            }
        };
        // Every variant is listed explicitly so that adding a new one forces a
        // decision here at compile time.
        match event {
            crate::stream::StreamEvent::Delta(chunk) => description.push_str(&chunk),
            crate::stream::StreamEvent::Done { .. } => break,
            crate::stream::StreamEvent::Error(e) => {
                return PreprocessOutcome::Failed {
                    reason: format!("provider '{vl_key}' call error: {e}"),
                };
            }
            // Only answer text is collected; everything else is ignored.
            crate::stream::StreamEvent::Reasoning(_)
            | crate::stream::StreamEvent::Warning(_)
            | crate::stream::StreamEvent::Usage(_)
            | crate::stream::StreamEvent::ThinkingBlock { .. }
            | crate::stream::StreamEvent::ToolCallStart { .. }
            | crate::stream::StreamEvent::ToolCallDelta(_)
            | crate::stream::StreamEvent::ToolCallDone(_) => {}
        }
    }

    match description.trim() {
        "" => PreprocessOutcome::Failed {
            reason: format!("provider '{vl_key}' returned empty response"),
        },
        text => PreprocessOutcome::Replaced {
            text: text.to_string(),
            vl_key: vl_key.to_string(),
        },
    }
}
#[cfg(test)]
mod tests {
    //! Tests for [`maybe_preprocess`]: the skip conditions, the configuration
    //! failure path, and the HTTP round trip against a mock OpenAI-style
    //! `/chat/completions` endpoint.

    use super::*;
    use crate::config::provider::ProviderConfig;
    use crate::stream::StreamEvent;
    use crate::tool::ToolDef;
    use anyhow::Result;
    use async_trait::async_trait;
    use futures::Stream;
    use std::collections::HashMap;
    use std::pin::Pin;
    use wiremock::matchers::{method, path};
    use wiremock::{Match, Mock, MockServer, ResponseTemplate};

    /// Key under which the mock vision provider is registered in `Config::providers`.
    const VL_KEY: &str = "vl";

    /// A configuration with no providers and no vision preprocessor configured.
    fn blank_config() -> Config {
        Config {
            default_provider: String::new(),
            default_workdir: None,
            providers: HashMap::new(),
            datalog: Default::default(),
            auto_update: true,
            notifications: Default::default(),
            telemetry: Default::default(),
            lsp: Default::default(),
            auto_commit: false,
            subagent: Default::default(),
            vision_preprocessor_provider: None,
            language: None,
            ui: Default::default(),
            plugin: Default::default(),
        }
    }

    /// `blank_config()` plus a vision provider at `base_url`, selected as the
    /// vision preprocessor.
    fn config_with_vl(base_url: &str) -> Config {
        let mut cfg = blank_config();
        cfg.providers
            .insert(VL_KEY.into(), vl_provider_cfg(base_url));
        cfg.vision_preprocessor_provider = Some(VL_KEY.into());
        cfg
    }

    /// A small PNG-typed image part used as the attachment in every test.
    fn sample_image() -> ImagePart {
        ImagePart {
            media_type: "image/png".into(),
            data: "iVBORw0KGgoAAAANSUhEUg==".into(),
        }
    }

    /// Stand-in for the active provider: reports a model name and refuses to stream.
    struct StubProvider {
        model: &'static str,
    }

    #[async_trait]
    impl LlmProvider for StubProvider {
        fn chat_stream(
            &self,
            _messages: &[Message],
            _tools: Option<&[ToolDef]>,
        ) -> Result<Pin<Box<dyn Stream<Item = Result<StreamEvent>> + Send>>> {
            anyhow::bail!("stub never streams");
        }

        fn model_name(&self) -> &str {
            self.model
        }
    }

    /// Active provider used by every test that expects preprocessing to be considered.
    fn text_only_provider() -> StubProvider {
        StubProvider {
            model: "deepseek-v4-flash",
        }
    }

    /// Builds an SSE body carrying one content chunk, a stop chunk and `[DONE]`.
    fn sse_one_token(text: &str) -> String {
        let content_chunk = serde_json::json!({
            "choices": [{
                "delta": { "content": text },
                "finish_reason": null,
            }],
        });
        let stop_chunk = serde_json::json!({
            "choices": [{
                "delta": {},
                "finish_reason": "stop",
            }],
        });
        format!("data: {content_chunk}\n\ndata: {stop_chunk}\n\ndata: [DONE]\n\n")
    }

    /// A 200 `text/event-stream` response whose body is `sse_one_token(text)`.
    fn sse_response(text: &str) -> ResponseTemplate {
        ResponseTemplate::new(200)
            .insert_header("content-type", "text/event-stream")
            .set_body_string(sse_one_token(text))
    }

    /// Provider configuration of type `openai` pointing at `base_url`.
    fn vl_provider_cfg(base_url: &str) -> ProviderConfig {
        ProviderConfig {
            provider_type: "openai".into(),
            api_key: Some("sk-test".into()),
            model: "Qwen/Qwen3-VL-32B-Instruct".into(),
            base_url: Some(base_url.to_string()),
            system_prompt: None,
            user_agent: None,
            context_window: 8000,
            max_tokens: None,
            thinking_type: None,
            thinking_keep: None,
            reasoning_history: None,
            thinking_enabled: None,
            thinking_budget: None,
            skip_tls_verify: false,
            ephemeral: false,
        }
    }

    /// The mock server's URI with a trailing slash appended.
    fn uri_with_trailing_slash(server: &MockServer) -> String {
        format!("{}/", server.uri())
    }

    /// Matches requests whose body contains the given substring.
    struct BodyContains(String);

    impl Match for BodyContains {
        fn matches(&self, request: &wiremock::Request) -> bool {
            String::from_utf8_lossy(&request.body).contains(&self.0)
        }
    }

    /// Matches requests whose body does NOT contain the given substring.
    struct BodyNotContains(String);

    impl Match for BodyNotContains {
        fn matches(&self, request: &wiremock::Request) -> bool {
            !String::from_utf8_lossy(&request.body).contains(&self.0)
        }
    }

    #[tokio::test]
    async fn skipped_when_no_images() {
        let cfg = blank_config();
        let provider = text_only_provider();
        let outcome = maybe_preprocess(&cfg, &provider, "any caption", &[]).await;
        assert!(matches!(outcome, PreprocessOutcome::Skipped));
    }

    #[tokio::test]
    async fn skipped_when_main_provider_accepts_images() {
        let cfg = blank_config();
        let provider = StubProvider {
            model: "claude-sonnet-4-5",
        };
        let outcome = maybe_preprocess(&cfg, &provider, "describe", &[sample_image()]).await;
        assert!(matches!(outcome, PreprocessOutcome::Skipped));
    }

    #[tokio::test]
    async fn skipped_when_config_field_unset() {
        let cfg = blank_config();
        let provider = text_only_provider();
        let outcome = maybe_preprocess(&cfg, &provider, "describe", &[sample_image()]).await;
        assert!(matches!(outcome, PreprocessOutcome::Skipped));
    }

    #[tokio::test]
    async fn skipped_when_config_field_empty_string() {
        let mut cfg = blank_config();
        cfg.vision_preprocessor_provider = Some(String::new());
        let provider = text_only_provider();
        let outcome = maybe_preprocess(&cfg, &provider, "describe", &[sample_image()]).await;
        assert!(matches!(outcome, PreprocessOutcome::Skipped));
    }

    #[tokio::test]
    async fn failed_when_configured_key_missing_from_providers() {
        let mut cfg = blank_config();
        cfg.vision_preprocessor_provider = Some("AtomGit-NoSuchModel".into());
        let provider = text_only_provider();
        let outcome = maybe_preprocess(&cfg, &provider, "describe", &[sample_image()]).await;
        match outcome {
            PreprocessOutcome::Failed { reason } => {
                assert!(
                    reason.contains("AtomGit-NoSuchModel") && reason.contains("not found"),
                    "expected 'not found' for missing key, got: {reason}",
                );
            }
            other => panic!("expected Failed, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn replaced_when_vl_returns_text() {
        let server = MockServer::start().await;
        Mock::given(method("POST"))
            .and(path("/chat/completions"))
            .respond_with(sse_response(
                "Python stack trace showing ZeroDivisionError on line 42",
            ))
            .expect(1)
            .mount(&server)
            .await;

        // This test deliberately uses the base URL without a trailing slash.
        let cfg = config_with_vl(&server.uri());
        let provider = text_only_provider();
        let outcome =
            maybe_preprocess(&cfg, &provider, "explain this", &[sample_image()]).await;
        match outcome {
            PreprocessOutcome::Replaced { text, vl_key } => {
                assert_eq!(
                    text,
                    "Python stack trace showing ZeroDivisionError on line 42"
                );
                assert_eq!(vl_key, "vl", "Replaced must carry the configured key");
            }
            other => panic!("expected Replaced, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn failed_when_vl_returns_500() {
        let server = MockServer::start().await;
        Mock::given(method("POST"))
            .and(path("/chat/completions"))
            .respond_with(ResponseTemplate::new(500).set_body_string("upstream error"))
            .mount(&server)
            .await;

        let cfg = config_with_vl(&uri_with_trailing_slash(&server));
        let provider = text_only_provider();
        let outcome = maybe_preprocess(&cfg, &provider, "x", &[sample_image()]).await;
        match outcome {
            PreprocessOutcome::Failed { reason } => {
                // Match the wording `maybe_preprocess` actually emits for a
                // failed call ("call error" / "stream init failed"), or the
                // status code if the provider surfaces it.
                assert!(
                    reason.contains("call error")
                        || reason.contains("stream init failed")
                        || reason.contains("500"),
                    "expected error reason mentioning failure, got: {reason}",
                );
            }
            other => panic!("expected Failed, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn failed_when_vl_returns_empty_string() {
        let server = MockServer::start().await;
        Mock::given(method("POST"))
            .and(path("/chat/completions"))
            .respond_with(sse_response(""))
            .mount(&server)
            .await;

        let cfg = config_with_vl(&uri_with_trailing_slash(&server));
        let provider = text_only_provider();
        let outcome = maybe_preprocess(&cfg, &provider, "x", &[sample_image()]).await;
        match outcome {
            PreprocessOutcome::Failed { reason } => {
                assert!(
                    reason.contains("empty"),
                    "expected 'empty' in reason, got: {reason}",
                );
            }
            other => panic!("expected Failed for empty response, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn caption_is_included_in_vl_prompt() {
        let server = MockServer::start().await;
        Mock::given(method("POST"))
            .and(path("/chat/completions"))
            .and(BodyContains("用户的当前请求:解释这段代码".into()))
            .respond_with(sse_response("ok"))
            .expect(1)
            .mount(&server)
            .await;

        let cfg = config_with_vl(&uri_with_trailing_slash(&server));
        let provider = text_only_provider();
        let outcome =
            maybe_preprocess(&cfg, &provider, "解释这段代码", &[sample_image()]).await;
        assert!(matches!(outcome, PreprocessOutcome::Replaced { .. }));
    }

    #[tokio::test]
    async fn empty_caption_uses_pure_describe_prompt() {
        let server = MockServer::start().await;
        Mock::given(method("POST"))
            .and(path("/chat/completions"))
            .and(BodyContains("请详细描述这张图片的内容".into()))
            .and(BodyNotContains("用户的当前请求:".into()))
            .respond_with(sse_response("ok"))
            .expect(1)
            .mount(&server)
            .await;

        let cfg = config_with_vl(&uri_with_trailing_slash(&server));
        let provider = text_only_provider();
        let outcome = maybe_preprocess(&cfg, &provider, " ", &[sample_image()]).await;
        assert!(matches!(outcome, PreprocessOutcome::Replaced { .. }));
    }
}