From bdb2bab0d6b26e2047660f1830509753491c9b77 Mon Sep 17 00:00:00 2001 From: Kyle Hanks Date: Tue, 27 Jan 2026 13:47:15 -0800 Subject: [PATCH] feat(rig-openai-responses): add image support for user messages Convert rig Image types to OpenAI InputImageContent format, handling base64, URL, and raw byte sources with proper media type and detail level mapping. --- backend/Cargo.lock | 69 +++--- .../crates/rig-openai-responses/Cargo.toml | 1 + .../rig-openai-responses/src/completion.rs | 202 ++++++++++++++++-- 3 files changed, 216 insertions(+), 56 deletions(-) diff --git a/backend/Cargo.lock b/backend/Cargo.lock index 986ec0b1..558b4dfd 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -5673,7 +5673,7 @@ dependencies = [ [[package]] name = "qbit" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -5757,7 +5757,7 @@ dependencies = [ [[package]] name = "qbit-ai" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -5802,7 +5802,7 @@ dependencies = [ [[package]] name = "qbit-artifacts" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "chrono", @@ -5817,7 +5817,7 @@ dependencies = [ [[package]] name = "qbit-ast-grep" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "ast-grep-core", @@ -5834,7 +5834,7 @@ dependencies = [ [[package]] name = "qbit-benchmarks" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -5848,7 +5848,7 @@ dependencies = [ [[package]] name = "qbit-cli-output" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "qbit-core", @@ -5859,7 +5859,7 @@ dependencies = [ [[package]] name = "qbit-context" -version = "0.2.12" +version = "0.2.13" dependencies = [ "chrono", "rig-core 0.29.0", @@ -5871,7 +5871,7 @@ dependencies = [ [[package]] name = "qbit-core" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -5889,7 +5889,7 @@ dependencies = [ [[package]] name = "qbit-directory-ops" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -5907,7 +5907,7 @@ dependencies = [ [[package]] name = "qbit-evals" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -5933,7 +5933,7 @@ dependencies = [ [[package]] name = "qbit-file-ops" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -5949,7 +5949,7 @@ dependencies = [ [[package]] name = "qbit-hitl" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "chrono", @@ -5963,7 +5963,7 @@ dependencies = [ [[package]] name = "qbit-indexer" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "dirs 5.0.1", @@ -5976,7 +5976,7 @@ dependencies = [ [[package]] name = "qbit-llm-providers" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -5995,7 +5995,7 @@ dependencies = [ [[package]] name = "qbit-loop-detection" -version = "0.2.12" +version = "0.2.13" dependencies = [ "chrono", "serde", @@ -6006,7 +6006,7 @@ dependencies = [ [[package]] name = "qbit-models" -version = "0.2.12" +version = "0.2.13" dependencies = [ "once_cell", "qbit-settings", @@ -6017,7 +6017,7 @@ dependencies = [ [[package]] name = "qbit-planner" -version = "0.2.12" +version = "0.2.13" dependencies = [ "chrono", "proptest", @@ -6031,7 +6031,7 @@ dependencies = [ [[package]] name = "qbit-pty" -version = "0.2.12" +version = "0.2.13" dependencies = [ "dirs 5.0.1", "itoa", @@ -6051,7 +6051,7 @@ dependencies = [ [[package]] name = "qbit-runtime" -version = "0.2.12" +version = "0.2.13" dependencies = [ "async-trait", "atty", @@ -6066,7 +6066,7 @@ dependencies = [ [[package]] name = "qbit-session" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "chrono", @@ -6083,7 +6083,7 @@ dependencies = [ [[package]] name = "qbit-settings" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "dirs 5.0.1", @@ -6098,7 +6098,7 @@ dependencies = [ [[package]] name = "qbit-shell-exec" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -6113,7 +6113,7 @@ dependencies = [ [[package]] name = "qbit-sidecar" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -6141,7 +6141,7 @@ dependencies = [ [[package]] name = "qbit-skills" -version = "0.2.12" +version = "0.2.13" dependencies = [ "dirs 5.0.1", "serde", @@ -6154,7 +6154,7 @@ dependencies = [ [[package]] name = "qbit-sub-agents" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -6177,7 +6177,7 @@ dependencies = [ [[package]] name = "qbit-swebench" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -6204,7 +6204,7 @@ dependencies = [ [[package]] name = "qbit-synthesis" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -6220,7 +6220,7 @@ dependencies = [ [[package]] name = "qbit-tool-policy" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "dirs 5.0.1", @@ -6234,7 +6234,7 @@ dependencies = [ [[package]] name = "qbit-tools" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -6259,11 +6259,11 @@ dependencies = [ [[package]] name = "qbit-udiff" -version = "0.2.12" +version = "0.2.13" [[package]] name = "qbit-web" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -6280,7 +6280,7 @@ dependencies = [ [[package]] name = "qbit-workflow" -version = "0.2.12" +version = "0.2.13" dependencies = [ "anyhow", "async-trait", @@ -6907,7 +6907,7 @@ checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" [[package]] name = "rig-anthropic-vertex" -version = "0.2.12" +version = "0.2.13" dependencies = [ "base64 0.22.1", "bytes", @@ -6989,6 +6989,7 @@ name = "rig-openai-responses" version = "0.1.0" dependencies = [ "async-openai", + "base64 0.22.1", "futures", "rig-core 0.29.0", "serde", @@ -7015,7 +7016,7 @@ dependencies = [ [[package]] name = "rig-zai-sdk" -version = "0.2.12" +version = "0.2.13" dependencies = [ "async-stream", "bytes", diff --git a/backend/crates/rig-openai-responses/Cargo.toml b/backend/crates/rig-openai-responses/Cargo.toml index 16f74411..648dcb93 100644 --- a/backend/crates/rig-openai-responses/Cargo.toml +++ b/backend/crates/rig-openai-responses/Cargo.toml @@ -21,3 +21,4 @@ serde_json = "1.0" # Utilities tracing = "0.1" thiserror = "2.0" +base64 = "0.22" diff --git a/backend/crates/rig-openai-responses/src/completion.rs b/backend/crates/rig-openai-responses/src/completion.rs index 91794990..a1b72de6 100644 --- a/backend/crates/rig-openai-responses/src/completion.rs +++ b/backend/crates/rig-openai-responses/src/completion.rs @@ -2,9 +2,10 @@ use async_openai::config::OpenAIConfig; use async_openai::types::responses::{ - CreateResponse, EasyInputContent, EasyInputMessage, FunctionTool, InputItem, InputParam, - MessageType, OutputItem, OutputMessageContent, Reasoning, ReasoningEffort as OAReasoningEffort, - ReasoningSummary, Response, ResponseStreamEvent, Role, SummaryPart, Tool, + CreateResponse, EasyInputContent, EasyInputMessage, FunctionTool, ImageDetail, InputContent, + InputImageContent, InputItem, InputParam, InputTextContent, MessageType, OutputItem, + OutputMessageContent, Reasoning, ReasoningEffort as OAReasoningEffort, ReasoningSummary, + Response, ResponseStreamEvent, Role, SummaryPart, Tool, }; use async_openai::Client as OpenAIClient; use futures::StreamExt; @@ -132,12 +133,11 @@ impl CompletionModel { for msg in request.chat_history.iter() { match msg { Message::User { content } => { - let text = extract_user_text(content); - if !text.is_empty() { + if let Some(easy_content) = convert_user_content(content) { input_items.push(InputItem::EasyMessage(EasyInputMessage { r#type: MessageType::Message, role: Role::User, - content: EasyInputContent::Text(text), + content: easy_content, })); } } @@ -545,12 +545,101 @@ fn map_stream_event( // Conversion Helpers // ============================================================================ -/// Extract text content from user message content. -fn extract_user_text(content: &OneOrMany) -> String { - content - .iter() - .filter_map(|c| match c { - UserContent::Text(text) => Some(text.text.clone()), +/// Convert user content to OpenAI EasyInputContent, handling text and images. +/// +/// Returns `EasyInputContent::Text` for text-only messages, or +/// `EasyInputContent::ContentList` for messages containing images. +fn convert_user_content(content: &OneOrMany) -> Option { + use base64::Engine; + + let mut has_images = false; + let mut input_parts: Vec = Vec::new(); + + for c in content.iter() { + match c { + UserContent::Text(text) => { + if !text.text.is_empty() { + input_parts.push(InputContent::InputText(InputTextContent { + text: text.text.clone(), + })); + } + } + UserContent::Image(img) => { + // Convert rig Image to OpenAI InputImageContent + let image_url = match &img.data { + rig::message::DocumentSourceKind::Base64(b64) => { + // Already base64, construct data URL + let media_type = img + .media_type + .as_ref() + .map(|mt| { + use rig::message::ImageMediaType; + match mt { + ImageMediaType::PNG => "image/png", + ImageMediaType::JPEG => "image/jpeg", + ImageMediaType::GIF => "image/gif", + ImageMediaType::WEBP => "image/webp", + ImageMediaType::HEIC => "image/heic", + ImageMediaType::HEIF => "image/heif", + ImageMediaType::SVG => "image/svg+xml", + } + }) + .unwrap_or("image/png"); + format!("data:{};base64,{}", media_type, b64) + } + rig::message::DocumentSourceKind::Url(url) => { + // Direct URL + url.clone() + } + rig::message::DocumentSourceKind::Raw(bytes) => { + // Raw bytes, encode to base64 + let b64 = base64::engine::general_purpose::STANDARD.encode(bytes); + let media_type = img + .media_type + .as_ref() + .map(|mt| { + use rig::message::ImageMediaType; + match mt { + ImageMediaType::PNG => "image/png", + ImageMediaType::JPEG => "image/jpeg", + ImageMediaType::GIF => "image/gif", + ImageMediaType::WEBP => "image/webp", + ImageMediaType::HEIC => "image/heic", + ImageMediaType::HEIF => "image/heif", + ImageMediaType::SVG => "image/svg+xml", + } + }) + .unwrap_or("image/png"); + format!("data:{};base64,{}", media_type, b64) + } + // Handle any future variants added to this non-exhaustive enum + _ => { + tracing::warn!("Unsupported image source kind, skipping"); + continue; + } + }; + + // Convert rig ImageDetail to async-openai ImageDetail + let detail = img + .detail + .as_ref() + .map(|d| { + use rig::message::ImageDetail as RigImageDetail; + match d { + RigImageDetail::Auto => ImageDetail::Auto, + RigImageDetail::High => ImageDetail::High, + RigImageDetail::Low => ImageDetail::Low, + } + }) + .unwrap_or(ImageDetail::Auto); + + input_parts.push(InputContent::InputImage(InputImageContent { + detail, + file_id: None, + image_url: Some(image_url), + })); + has_images = true; + } UserContent::ToolResult(result) => { // Extract text from tool result content let result_text = result @@ -565,16 +654,42 @@ fn extract_user_text(content: &OneOrMany) -> String { }) .collect::>() .join("\n"); - if result_text.is_empty() { - None - } else { - Some(format!("[Tool result for {}]: {}", result.id, result_text)) + if !result_text.is_empty() { + input_parts.push(InputContent::InputText(InputTextContent { + text: format!("[Tool result for {}]: {}", result.id, result_text), + })); } } - _ => None, - }) - .collect::>() - .join("\n") + // Skip other content types (Audio, Video, Document) not supported yet + _ => { + tracing::debug!("Skipping unsupported user content type"); + } + } + } + + if input_parts.is_empty() { + return None; + } + + // If we have images, we must use ContentList format + // If text-only, we can use the simpler Text format + if has_images { + Some(EasyInputContent::ContentList(input_parts)) + } else { + // For text-only, join all text parts + let text = input_parts + .into_iter() + .filter_map(|p| { + if let InputContent::InputText(t) = p { + Some(t.text) + } else { + None + } + }) + .collect::>() + .join("\n"); + Some(EasyInputContent::Text(text)) + } } /// Extract text content from assistant message content. @@ -632,11 +747,54 @@ mod tests { } #[test] - fn test_extract_user_text() { + fn test_convert_user_content_text_only() { let content = OneOrMany::one(UserContent::Text(Text { text: "Hello, world!".to_string(), })); - assert_eq!(extract_user_text(&content), "Hello, world!"); + let result = convert_user_content(&content); + assert!(result.is_some()); + match result.unwrap() { + EasyInputContent::Text(text) => assert_eq!(text, "Hello, world!"), + _ => panic!("Expected Text variant"), + } + } + + #[test] + fn test_convert_user_content_with_image() { + use rig::message::{DocumentSourceKind, Image, ImageMediaType}; + + let content = OneOrMany::many(vec![ + UserContent::Text(Text { + text: "What's in this image?".to_string(), + }), + UserContent::Image(Image { + data: DocumentSourceKind::Base64("dGVzdA==".to_string()), + media_type: Some(ImageMediaType::PNG), + detail: None, + additional_params: None, + }), + ]) + .unwrap(); + let result = convert_user_content(&content); + assert!(result.is_some()); + match result.unwrap() { + EasyInputContent::ContentList(parts) => { + assert_eq!(parts.len(), 2); + match &parts[0] { + InputContent::InputText(t) => { + assert_eq!(t.text, "What's in this image?") + } + _ => panic!("Expected InputText"), + } + match &parts[1] { + InputContent::InputImage(img) => { + assert!(img.image_url.as_ref().unwrap().starts_with("data:image/png;base64,")); + } + _ => panic!("Expected InputImage"), + } + } + _ => panic!("Expected ContentList variant"), + } } #[test]