Enhance read_image tool with magic byte detection and multi-image support

- Fix media type detection using magic bytes instead of file extension
  - Correctly identifies JPEG files with .png extension (and vice versa)
  - Supports PNG, JPEG, GIF, and WebP formats

- Add multi-image support with file_paths array parameter
  - Load multiple images in a single tool call
  - All images are queued for LLM analysis

- Enhanced CLI output:
  - Inline image preview via iTerm2 imgcat protocol (height=5)
  - Dimmed info line showing: path | dimensions | media type | file size
  - Proper │ prefix alignment with tool output boxing
  - Human-readable file sizes (bytes, KB, MB)

- Add image dimension extraction from file headers
  - PNG, JPEG, GIF, WebP dimension parsing

- Add comprehensive tests for magic byte detection and dimensions
This commit is contained in:
Dhanji R. Prasanna
2025-12-26 11:19:37 +11:00
parent 3ece02ff31
commit 3601cc0547
7 changed files with 521 additions and 9 deletions

View File

@@ -274,15 +274,29 @@ impl AnthropicProvider {
}
}
MessageRole::User => {
// Build content blocks - images first, then text
let mut content_blocks: Vec<AnthropicContent> = Vec::new();
// Add any images attached to this message
for image in &message.images {
content_blocks.push(AnthropicContent::Image {
source: AnthropicImageSource {
source_type: "base64".to_string(),
media_type: image.media_type.clone(),
data: image.data.clone(),
},
});
}
// Add text content
content_blocks.push(AnthropicContent::Text {
text: message.content.clone(),
cache_control: message.cache_control.as_ref().map(Self::convert_cache_control),
});
anthropic_messages.push(AnthropicMessage {
role: "user".to_string(),
content: vec![AnthropicContent::Text {
text: message.content.clone(),
cache_control: message
.cache_control
.as_ref()
.map(Self::convert_cache_control),
}],
content: content_blocks,
});
}
MessageRole::Assistant => {
@@ -924,6 +938,19 @@ enum AnthropicContent {
name: String,
input: serde_json::Value,
},
#[serde(rename = "image")]
Image {
source: AnthropicImageSource,
},
}
/// Image source for Anthropic API
///
/// Payload carried by the `AnthropicContent::Image` content block. The
/// user-message conversion in this file fills it from an attached image's
/// `media_type` and `data` fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AnthropicImageSource {
// Serialized under the key "type" (`type` is a Rust keyword, hence the rename).
#[serde(rename = "type")]
source_type: String, // The visible conversion code always sets this to "base64"
media_type: String, // Image media type, e.g., "image/png", "image/jpeg"
data: String, // Base64-encoded image data
}
#[derive(Debug, Deserialize)]

View File

@@ -88,6 +88,8 @@ pub struct Message {
pub role: MessageRole,
pub content: String,
#[serde(skip)]
pub images: Vec<ImageContent>,
#[serde(skip)]
pub id: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
@@ -101,6 +103,65 @@ pub enum MessageRole {
Assistant,
}
/// Image content for multimodal messages
///
/// Pairs a media type string with the image payload encoded as base64 text.
/// Neither field is validated by this type; both are stored as given.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageContent {
/// Media type (e.g., "image/png", "image/jpeg", "image/gif", "image/webp")
pub media_type: String,
/// Base64-encoded image data
pub data: String,
}
impl ImageContent {
    /// Creates an image attachment from a media type and base64-encoded data.
    ///
    /// `media_type` is copied into an owned `String`; `data` is stored as given
    /// and is neither validated nor re-encoded here.
    pub fn new(media_type: &str, data: String) -> Self {
        Self {
            media_type: media_type.to_string(),
            data,
        }
    }

    /// Maps a file extension to an image media type.
    ///
    /// Matching is case-insensitive. `ext` must be the bare extension
    /// (`"png"`, not `".png"`); a leading dot is not stripped. Returns `None`
    /// for anything other than `png`, `jpg`, `jpeg`, `gif`, or `webp`.
    pub fn media_type_from_extension(ext: &str) -> Option<&'static str> {
        const KNOWN_EXTENSIONS: [(&str, &str); 5] = [
            ("png", "image/png"),
            ("jpg", "image/jpeg"),
            ("jpeg", "image/jpeg"),
            ("gif", "image/gif"),
            ("webp", "image/webp"),
        ];

        // All recognised names are ASCII, so an ASCII case-insensitive compare
        // is sufficient and avoids allocating a lowercased copy of `ext`.
        KNOWN_EXTENSIONS
            .iter()
            .find(|(name, _)| ext.eq_ignore_ascii_case(name))
            .map(|&(_, media_type)| media_type)
    }

    /// Detects the media type from the leading magic bytes (file signature).
    ///
    /// This inspects actual content and is therefore more reliable than the
    /// file extension. Each signature is checked against only as many bytes as
    /// it needs, so a short header prefix is enough for PNG (8 bytes), JPEG
    /// (3 bytes) and GIF (4 bytes); WebP needs the first 12 bytes.
    ///
    /// Returns `None` when no known signature matches, including for empty or
    /// too-short input. A match only identifies the signature; it does not
    /// prove the rest of the data is a well-formed image.
    pub fn media_type_from_bytes(bytes: &[u8]) -> Option<&'static str> {
        // PNG: 89 50 4E 47 0D 0A 1A 0A
        const PNG_SIGNATURE: &[u8] = &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        // JPEG: FF D8 FF
        const JPEG_SIGNATURE: &[u8] = &[0xFF, 0xD8, 0xFF];
        // GIF: 47 49 46 38 ("GIF8", shared by the GIF87a and GIF89a headers)
        const GIF_SIGNATURE: &[u8] = b"GIF8";
        // WebP: "RIFF" at offset 0, four bytes in between, then "WEBP" at offset 8
        const RIFF_TAG: &[u8] = b"RIFF";
        const WEBP_TAG: &[u8] = b"WEBP";

        // `starts_with` is false when `bytes` is shorter than the signature,
        // so no up-front length guard is required.
        if bytes.starts_with(PNG_SIGNATURE) {
            Some("image/png")
        } else if bytes.starts_with(JPEG_SIGNATURE) {
            Some("image/jpeg")
        } else if bytes.starts_with(GIF_SIGNATURE) {
            Some("image/gif")
        } else if bytes.starts_with(RIFF_TAG) && bytes.get(8..12) == Some(WEBP_TAG) {
            // `get` yields `None` instead of panicking when fewer than 12 bytes exist.
            Some("image/webp")
        } else {
            None
        }
    }
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompletionResponse {
pub content: String,
@@ -174,6 +235,7 @@ impl Message {
Self {
role,
content,
images: Vec::new(),
id: Self::generate_id(),
cache_control: None,
}
@@ -188,6 +250,7 @@ impl Message {
Self {
role,
content,
images: Vec::new(),
id: Self::generate_id(),
cache_control: Some(cache_control),
}