Skip to content

Multimodal

ZenMux supports multimodal inputs and can be used via multiple API protocols:

  • OpenAI Chat Completion API: Uses the image_url, file (PDF/video), and input_audio (audio) content types
  • OpenAI Responses API: Uses the input_image and input_file (PDF only) content types
  • Anthropic Messages API: Uses the image, document, audio, and video content types; supports both base64 and URL
  • Google Vertex AI API: Uses Part objects to pass images, files, audio, and video

Supported input types:

  • Text input
  • Image input
  • PDF input
  • Audio input
  • Video input

OpenAI Chat Completion API

Image Input

Use an image URL

python
# Example: analyze an image (by URL) via the OpenAI Chat Completions protocol on ZenMux.
from openai import OpenAI

# Point the official OpenAI SDK at the ZenMux gateway.
client = OpenAI(
    base_url="https://zenmux.ai/api/v1", 
    api_key="<your ZENMUX_API_KEY>", 
)

response = client.chat.completions.create(
    model="google/gemini-2.5-pro",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please analyze the content of the image."
                },
                {
                    # The image is referenced by URL using an image_url content part.
                    "type": "image_url", 
                    "image_url": { 
                        "url": "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png"
                    }
                }
            ]
        }
    ]
)

print(response.choices[0].message.content)
typescript
// Example: analyze an image (by URL) via the OpenAI Chat Completions protocol on ZenMux.
import OpenAI from "openai";

// Point the official OpenAI SDK at the ZenMux gateway.
const client = new OpenAI({
  baseURL: "https://zenmux.ai/api/v1", 
  apiKey: "<your ZENMUX_API_KEY>", 
});

const response = await client.chat.completions.create({
  model: "google/gemini-2.5-pro",
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Please analyze the content of the image.",
        },
        {
          // The image is referenced by URL using an image_url content part.
          type: "image_url", 
          image_url: {
            url: "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png", 
          },
        },
      ],
    },
  ],
});

console.log(response.choices[0].message.content);

Use Base64-encoded image

python
# Example: analyze a local image by sending it base64-encoded as a data: URL.
import base64
from openai import OpenAI

def encode_image_to_base64(image_path):
    """Read the file at image_path and return its bytes as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

client = OpenAI(
    base_url="https://zenmux.ai/api/v1", 
    api_key="<your ZENMUX_API_KEY>", 
)

image_path = "path/to/your/image.jpg"
base64_image = encode_image_to_base64(image_path)
# Wrap the base64 payload in an RFC 2397 data: URL; the MIME type here is
# hard-coded to image/jpeg — adjust it to match the actual file type.
data_url = f"data:image/jpeg;base64,{base64_image}"

response = client.chat.completions.create(
    model="google/gemini-2.5-pro",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please analyze the content of the image."
                },
                {
                    # Same image_url part as the URL example, but carrying a data: URL.
                    "type": "image_url", 
                    "image_url": { 
                        "url": data_url 
                    }
                }
            ]
        }
    ]
)

print(response.choices[0].message.content)
typescript
// Example: analyze a local image by sending it base64-encoded as a data: URL.
import OpenAI from "openai";
import * as fs from "fs";

// Read the file and return a full data: URL. The MIME type is hard-coded to
// image/jpeg — adjust it to match the actual file type.
async function encodeImageToBase64(imagePath: string): Promise<string> {
  const imageBuffer = await fs.promises.readFile(imagePath);
  const base64Image = imageBuffer.toString("base64");
  return `data:image/jpeg;base64,${base64Image}`;
}

const client = new OpenAI({
  baseURL: "https://zenmux.ai/api/v1", 
  apiKey: "<your ZENMUX_API_KEY>", 
});

const imagePath = "path/to/your/image.jpg";
const base64Image = await encodeImageToBase64(imagePath);

const response = await client.chat.completions.create({
  model: "google/gemini-2.5-pro",
  messages: [
    {
      type: "text" && undefined, // (unreachable placeholder removed)
      role: "user",
      content: [
        {
          type: "text",
          text: "Please analyze the content of the image.",
        },
        {
          // Same image_url part as the URL example, but carrying a data: URL.
          type: "image_url", 
          image_url: {
            url: base64Image, 
          },
        },
      ],
    },
  ],
});

console.log(response.choices[0].message.content);

PDF Input

Use a PDF URL

python
# Example: analyze a PDF (by URL) via the OpenAI Chat Completions protocol on ZenMux.
from openai import OpenAI

client = OpenAI(
    base_url="https://zenmux.ai/api/v1", 
    api_key="<your ZENMUX_API_KEY>", 
)

response = client.chat.completions.create(
    model="google/gemini-2.5-pro",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please analyze the main content of the file."
                },
                {
                    # PDFs use the "file" content part; per this guide, ZenMux
                    # accepts a plain URL in file_data (not only base64 data).
                    "type": "file", 
                    "file": { 
                        "filename": "test.pdf", 
                        "file_data": "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/06/uyZbd8m/xiaoxingxingzhaopengyou.pdf"
                    }
                }
            ]
        }
    ]
)

print(response.choices[0].message.content)
typescript
// Example: analyze a PDF (by URL) via the OpenAI Chat Completions protocol on ZenMux.
import OpenAI from "openai";

const client = new OpenAI({
  baseURL: "https://zenmux.ai/api/v1", 
  apiKey: "<your ZENMUX_API_KEY>", 
});

const response = await client.chat.completions.create({
  model: "google/gemini-2.5-pro",
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Please analyze the main content of the file.",
        },
        {
          // PDFs use the "file" content part; per this guide, ZenMux accepts
          // a plain URL in file_data (not only base64 data).
          type: "file", 
          file: {
            filename: "test.pdf", 
            file_data:
              "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/06/uyZbd8m/xiaoxingxingzhaopengyou.pdf", 
          },
        },
      ],
    },
  ],
});

console.log(response.choices[0].message.content);

Use Base64-encoded PDF

python
# Example: analyze a local PDF by sending it base64-encoded as a data: URL.
import base64
from openai import OpenAI

def encode_pdf_to_base64(pdf_path):
    """Read the file at pdf_path and return its bytes as a base64 string."""
    with open(pdf_path, "rb") as pdf_file:
        return base64.b64encode(pdf_file.read()).decode("utf-8")

client = OpenAI(
    base_url="https://zenmux.ai/api/v1", 
    api_key="<your ZENMUX_API_KEY>", 
)

pdf_path = "path/to/your/test.pdf"
base64_pdf = encode_pdf_to_base64(pdf_path)
# Wrap the payload in a data: URL with the PDF MIME type.
data_url = f"data:application/pdf;base64,{base64_pdf}"

response = client.chat.completions.create(
    model="google/gemini-2.5-pro",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please analyze the main content of the file."
                },
                {
                    # Same "file" part as the URL example, but carrying a data: URL.
                    "type": "file", 
                    "file": { 
                        "filename": "test.pdf", 
                        "file_data": data_url 
                    }
                }
            ]
        }
    ]
)

print(response.choices[0].message.content)
typescript
// Example: analyze a local PDF by sending it base64-encoded as a data: URL.
import OpenAI from "openai";
import * as fs from "fs";

// Read the file and return a full data: URL with the PDF MIME type.
async function encodePDFToBase64(pdfPath: string): Promise<string> {
  const pdfBuffer = await fs.promises.readFile(pdfPath);
  const base64PDF = pdfBuffer.toString("base64");
  return `data:application/pdf;base64,${base64PDF}`;
}

const client = new OpenAI({
  baseURL: "https://zenmux.ai/api/v1", 
  apiKey: "<your ZENMUX_API_KEY>", 
});

const pdfPath = "path/to/your/test.pdf";
const base64PDF = await encodePDFToBase64(pdfPath);

const response = await client.chat.completions.create({
  model: "google/gemini-2.5-pro",
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Please analyze the main content of the file.",
        },
        {
          // Same "file" part as the URL example, but carrying a data: URL.
          type: "file", 
          file: {
            filename: "test.pdf", 
            file_data: base64PDF, 
          },
        },
      ],
    },
  ],
});

console.log(response.choices[0].message.content);

Audio Input

Use the input_audio type to pass an audio file; Base64 encoding is required.

python
# Example: send a local audio file via the input_audio content part.
# Audio must be base64-encoded raw data (no data: URL prefix) plus a format tag.
import base64
from openai import OpenAI

def encode_audio_to_base64(audio_path):
    """Read the file at audio_path and return its bytes as a base64 string."""
    with open(audio_path, "rb") as audio_file:
        return base64.b64encode(audio_file.read()).decode("utf-8")

client = OpenAI(
    base_url="https://zenmux.ai/api/v1", 
    api_key="<your ZENMUX_API_KEY>", 
)

audio_path = "path/to/your/audio.mp3"
base64_audio = encode_audio_to_base64(audio_path)

response = client.chat.completions.create(
    model="google/gemini-2.5-pro",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please describe the content of this audio clip."
                },
                {
                    # input_audio carries bare base64 in "data" and the codec
                    # name in "format" (here: mp3).
                    "type": "input_audio", 
                    "input_audio": { 
                        "data": base64_audio, 
                        "format": "mp3"
                    }
                }
            ]
        }
    ]
)

print(response.choices[0].message.content)
typescript
// Example: send a local audio file via the input_audio content part.
// Audio must be base64-encoded raw data (no data: URL prefix) plus a format tag.
import OpenAI from "openai";
import * as fs from "fs";

// Read the file and return bare base64 (no data: URL prefix).
async function encodeAudioToBase64(audioPath: string): Promise<string> {
  const audioBuffer = await fs.promises.readFile(audioPath);
  return audioBuffer.toString("base64");
}

const client = new OpenAI({
  baseURL: "https://zenmux.ai/api/v1", 
  apiKey: "<your ZENMUX_API_KEY>", 
});

const audioPath = "path/to/your/audio.mp3";
const base64Audio = await encodeAudioToBase64(audioPath);

const response = await client.chat.completions.create({
  model: "google/gemini-2.5-pro",
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Please describe the content of this audio clip.",
        },
        {
          // input_audio carries bare base64 in "data" and the codec name in "format".
          type: "input_audio", 
          input_audio: {
            data: base64Audio, 
            format: "mp3", 
          },
        },
      ],
    },
  ],
});

console.log(response.choices[0].message.content);

Video Input

Use the file type to pass a video file. Both URL and Base64 encoding are supported.

Use a video URL

python
# Example: analyze a video (by URL) via the "file" content part on ZenMux.
from openai import OpenAI

client = OpenAI(
    base_url="https://zenmux.ai/api/v1", 
    api_key="<your ZENMUX_API_KEY>", 
)

response = client.chat.completions.create(
    model="google/gemini-2.5-pro",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please describe the content of this video."
                },
                {
                    # Videos reuse the "file" content part; per this guide,
                    # file_data accepts a plain URL.
                    "type": "file", 
                    "file": { 
                        "filename": "video.mp4", 
                        "file_data": "https://example.com/video.mp4"
                    }
                }
            ]
        }
    ]
)

print(response.choices[0].message.content)
typescript
// Example: analyze a video (by URL) via the "file" content part on ZenMux.
import OpenAI from "openai";

const client = new OpenAI({
  baseURL: "https://zenmux.ai/api/v1", 
  apiKey: "<your ZENMUX_API_KEY>", 
});

const response = await client.chat.completions.create({
  model: "google/gemini-2.5-pro",
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Please describe the content of this video.",
        },
        {
          // Videos reuse the "file" content part; per this guide, file_data
          // accepts a plain URL.
          type: "file", 
          file: {
            filename: "video.mp4", 
            file_data: "https://example.com/video.mp4", 
          },
        },
      ],
    },
  ],
});

console.log(response.choices[0].message.content);

Use Base64-encoded video

python
# Example: analyze a local video by sending it base64-encoded as a data: URL.
import base64
from openai import OpenAI

def encode_video_to_base64(video_path):
    """Read the file at video_path and return its bytes as a base64 string."""
    with open(video_path, "rb") as video_file:
        return base64.b64encode(video_file.read()).decode("utf-8")

client = OpenAI(
    base_url="https://zenmux.ai/api/v1", 
    api_key="<your ZENMUX_API_KEY>", 
)

video_path = "path/to/your/video.mp4"
base64_video = encode_video_to_base64(video_path)
# Wrap the payload in a data: URL; the MIME type here is hard-coded to
# video/mp4 — adjust it to match the actual container format.
data_url = f"data:video/mp4;base64,{base64_video}"

response = client.chat.completions.create(
    model="google/gemini-2.5-pro",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please describe the content of this video."
                },
                {
                    # Same "file" part as the URL example, but carrying a data: URL.
                    "type": "file", 
                    "file": { 
                        "filename": "video.mp4", 
                        "file_data": data_url 
                    }
                }
            ]
        }
    ]
)

print(response.choices[0].message.content)
typescript
// Example: analyze a local video by sending it base64-encoded as a data: URL.
import OpenAI from "openai";
import * as fs from "fs";

// Read the file and return a full data: URL. The MIME type is hard-coded to
// video/mp4 — adjust it to match the actual container format.
async function encodeVideoToBase64(videoPath: string): Promise<string> {
  const videoBuffer = await fs.promises.readFile(videoPath);
  const base64Video = videoBuffer.toString("base64");
  return `data:video/mp4;base64,${base64Video}`;
}

const client = new OpenAI({
  baseURL: "https://zenmux.ai/api/v1", 
  apiKey: "<your ZENMUX_API_KEY>", 
});

const videoPath = "path/to/your/video.mp4";
const base64Video = await encodeVideoToBase64(videoPath);

const response = await client.chat.completions.create({
  model: "google/gemini-2.5-pro",
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Please describe the content of this video.",
        },
        {
          // Same "file" part as the URL example, but carrying a data: URL.
          type: "file", 
          file: {
            filename: "video.mp4", 
            file_data: base64Video, 
          },
        },
      ],
    },
  ],
});

console.log(response.choices[0].message.content);

OpenAI Responses API

The Responses API uses the input_image and input_file content types to handle multimodal inputs.

Note

The Responses API currently supports image and PDF inputs only, and does not support audio or video. To process audio or video, use the Chat Completion API or the Vertex AI API.

Image Input

python
# Example: image input via the OpenAI Responses API on ZenMux.
from openai import OpenAI

client = OpenAI(
    base_url="https://zenmux.ai/api/v1",
    api_key="<your ZENMUX_API_KEY>", 
)

response = client.responses.create(
    model="openai/gpt-5", 
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_text",
                    "text": "Please analyze the content of the image."
                },
                {
                    # In the Responses API the image part is input_image, and
                    # image_url is a plain string (not a nested object).
                    "type": "input_image", 
                    "image_url": "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png"
                }
            ]
        }
    ]
)

# Extract the response: walk output items and print every output_text part.
for item in response.output:
    if item.type == "message":
        for content in item.content:
            if content.type == "output_text":
                print(content.text)
typescript
// Example: image input via the OpenAI Responses API on ZenMux.
import OpenAI from "openai";

const client = new OpenAI({
  baseURL: "https://zenmux.ai/api/v1",
  apiKey: "<your ZENMUX_API_KEY>", 
});

const response = await client.responses.create({
  model: "openai/gpt-5", 
  input: [
    {
      role: "user",
      content: [
        {
          type: "input_text",
          text: "Please analyze the content of the image.",
        },
        {
          // In the Responses API the image part is input_image, and image_url
          // is a plain string (not a nested object).
          type: "input_image", 
          image_url:
            "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png", 
        },
      ],
    },
  ],
});

// Extract the response: walk output items and print every output_text part.
for (const item of response.output) {
  if (item.type === "message") {
    for (const content of item.content) {
      if (content.type === "output_text") {
        console.log(content.text);
      }
    }
  }
}
bash
# Example: image input via the ZenMux Responses endpoint using curl.
# Requires ZENMUX_API_KEY to be exported in the environment.
curl https://zenmux.ai/api/v1/responses \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $ZENMUX_API_KEY" \
  -d '{
    "model": "openai/gpt-5",
    "input": [
      {
        "role": "user",
        "content": [
          {
            "type": "input_text",
            "text": "请分析一下图片的内容"
          },
          {
            "type": "input_image",
            "image_url": "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png"
          }
        ]
      }
    ]
  }'

Use Base64 encoding

python
# Example: base64 image input via the Responses API.
# NOTE(review): `client` is not defined in this snippet — it reuses the OpenAI
# client constructed in the previous Responses example; this listing is not
# runnable standalone.
import base64

def encode_image_to_base64(image_path):
    """Read the file at image_path and return its bytes as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

base64_image = encode_image_to_base64("path/to/image.jpg")

response = client.responses.create(
    model="openai/gpt-5",
    input=[
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "Describe this image."},
                {
                    # Base64 images are passed as a data: URL in the image_url string.
                    "type": "input_image",
                    "image_url": f"data:image/jpeg;base64,{base64_image}"
                }
            ]
        }
    ]
)

PDF Input

python
# Example: PDF input via the OpenAI Responses API on ZenMux.
from openai import OpenAI

client = OpenAI(
    base_url="https://zenmux.ai/api/v1",
    api_key="<your ZENMUX_API_KEY>", 
)

response = client.responses.create(
    model="openai/gpt-5", 
    input=[
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "Please summarize the main content of this document."},
                {
                    # PDFs use the input_file part with a file_url string.
                    "type": "input_file", 
                    "file_url": "https://www.example.com/document.pdf"
                }
            ]
        }
    ]
)

# Extract the response: walk output items and print every output_text part.
for item in response.output:
    if item.type == "message":
        for content in item.content:
            if content.type == "output_text":
                print(content.text)
bash
# Example: PDF input via the ZenMux Responses endpoint using curl.
# Requires ZENMUX_API_KEY to be exported in the environment.
curl https://zenmux.ai/api/v1/responses \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $ZENMUX_API_KEY" \
  -d '{
    "model": "openai/gpt-5",
    "input": [
      {
        "role": "user",
        "content": [
          {
            "type": "input_text",
            "text": "请总结这个文档的主要内容"
          },
          {
            "type": "input_file",
            "file_url": "https://www.example.com/document.pdf"
          }
        ]
      }
    ]
  }'

Anthropic Messages API

The Anthropic Messages API supports multimodal inputs using the image, document, audio, and video content types, and supports both base64 encoding and URL input.

Tip

With ZenMux protocol conversion, the Anthropic protocol can be routed to models that support audio and video (such as Gemini). When using multimodal-capable models, all input types are available.

Note: Audio and video inputs must use Google Cloud Storage gs:// URLs (for example, gs://cloud-samples-data/generative-ai/audio/pixel.mp3) to be processed correctly by Gemini models. If you need to use local files or other URLs, we recommend using the Vertex AI API protocol.

Supported formats

| Type     | Supported formats              |
| -------- | ------------------------------ |
| Image    | JPEG, PNG, GIF, WebP           |
| Document | PDF                            |
| Audio    | WAV, MP3, AIFF, AAC, OGG, FLAC |
| Video    | MP4, AVI, MOV, MKV, WEBM, etc. |

Use an image URL

python
# Example: image input (by URL) via the Anthropic Messages API on ZenMux.
import anthropic

# Point the Anthropic SDK at the ZenMux Anthropic-compatible endpoint.
client = anthropic.Anthropic(
    api_key="<your ZENMUX_API_KEY>", 
    base_url="https://zenmux.ai/api/anthropic"
)

message = client.messages.create(
    model="anthropic/claude-sonnet-4.5", 
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    # Anthropic image blocks take a source object with type "url".
                    "type": "image", 
                    "source": { 
                        "type": "url", 
                        "url": "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png"
                    }
                },
                {
                    "type": "text",
                    "text": "Please analyze the content of the image."
                }
            ]
        }
    ]
)

print(message.content[0].text)
typescript
// Example: image input (by URL) via the Anthropic Messages API on ZenMux.
import Anthropic from "@anthropic-ai/sdk";

// Point the Anthropic SDK at the ZenMux Anthropic-compatible endpoint.
const client = new Anthropic({
  apiKey: "<your ZENMUX_API_KEY>", 
  baseURL: "https://zenmux.ai/api/anthropic", 
});

const message = await client.messages.create({
  model: "anthropic/claude-sonnet-4.5", 
  max_tokens: 1024,
  messages: [
    {
      role: "user",
      content: [
        {
          // Anthropic image blocks take a source object with type "url".
          type: "image", 
          source: {
            type: "url", 
            url: "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png", 
          },
        },
        {
          type: "text",
          text: "Please analyze the content of the image.",
        },
      ],
    },
  ],
});

console.log(message.content[0].text);
bash
# Example: image input via the ZenMux Anthropic-compatible endpoint using curl.
# Anthropic-style auth: x-api-key header plus anthropic-version.
curl https://zenmux.ai/api/anthropic/v1/messages \
  -H "x-api-key: $ZENMUX_API_KEY" \
  -H "anthropic-version: 2023-06-01" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-sonnet-4.5",
    "max_tokens": 1024,
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "image",
            "source": {
              "type": "url",
              "url": "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png"
            }
          },
          {
            "type": "text",
            "text": "请分析一下图片的内容"
          }
        ]
      }
    ]
  }'

Use Base64 encoding

python
# Example: base64 image input via the Anthropic Messages API on ZenMux.
import anthropic
import base64

def encode_image_to_base64(image_path):
    """Read the file at image_path and return its bytes as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

client = anthropic.Anthropic(
    api_key="<your ZENMUX_API_KEY>",
    base_url="https://zenmux.ai/api/anthropic"
)

base64_image = encode_image_to_base64("path/to/image.jpg")

message = client.messages.create(
    model="anthropic/claude-sonnet-4.5",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        # Anthropic base64 sources carry bare base64 in "data"
                        # plus an explicit media_type (no data: URL prefix).
                        "type": "base64", 
                        "media_type": "image/jpeg", 
                        "data": base64_image 
                    }
                },
                {
                    "type": "text",
                    "text": "Please analyze the content of the image."
                }
            ]
        }
    ]
)

print(message.content[0].text)
bash
# First, encode the image as base64 (bare base64, no data: URL prefix).
BASE64_IMAGE=$(base64 -i path/to/image.jpg)

# Send it as an Anthropic base64 image source via the ZenMux endpoint.
curl https://zenmux.ai/api/anthropic/v1/messages \
  -H "x-api-key: $ZENMUX_API_KEY" \
  -H "anthropic-version: 2023-06-01" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-sonnet-4.5",
    "max_tokens": 1024,
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "image",
            "source": {
              "type": "base64",
              "media_type": "image/jpeg",
              "data": "'"$BASE64_IMAGE"'"
            }
          },
          {
            "type": "text",
            "text": "请分析一下图片的内容"
          }
        ]
      }
    ]
  }'

Multiple images

Claude supports analyzing multiple images in a single request:

python
# Example: several image blocks in one request, followed by the question.
# NOTE(review): `client` is not defined in this snippet — it reuses the
# Anthropic client constructed in the previous example.
message = client.messages.create(
    model="anthropic/claude-sonnet-4.5",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {"type": "url", "url": "https://example.com/image1.jpg"}
                },
                {
                    "type": "image",
                    "source": {"type": "url", "url": "https://example.com/image2.jpg"}
                },
                {
                    "type": "text",
                    "text": "Please compare the similarities and differences between these two images."
                }
            ]
        }
    ]
)

PDF Input

Use a PDF URL

python
# Example: PDF input (by URL) via the Anthropic Messages API on ZenMux.
import anthropic

client = anthropic.Anthropic(
    api_key="<your ZENMUX_API_KEY>",
    base_url="https://zenmux.ai/api/anthropic"
)

message = client.messages.create(
    model="anthropic/claude-sonnet-4.5",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    # PDFs use the "document" block with a URL source.
                    "type": "document", 
                    "source": { 
                        "type": "url", 
                        "url": "https://example.com/document.pdf"
                    }
                },
                {
                    "type": "text",
                    "text": "Please summarize the main content of this document."
                }
            ]
        }
    ]
)

print(message.content[0].text)
typescript
// Example: PDF input (by URL) via the Anthropic Messages API on ZenMux.
import Anthropic from "@anthropic-ai/sdk";

const client = new Anthropic({
  apiKey: "<your ZENMUX_API_KEY>",
  baseURL: "https://zenmux.ai/api/anthropic",
});

const message = await client.messages.create({
  model: "anthropic/claude-sonnet-4.5",
  max_tokens: 1024,
  messages: [
    {
      role: "user",
      content: [
        {
          // PDFs use the "document" block with a URL source.
          type: "document", 
          source: {
            type: "url", 
            url: "https://example.com/document.pdf", 
          },
        },
        {
          type: "text",
          text: "Please summarize the main content of this document.",
        },
      ],
    },
  ],
});

console.log(message.content[0].text);
bash
# Example: PDF input via the ZenMux Anthropic-compatible endpoint using curl.
curl https://zenmux.ai/api/anthropic/v1/messages \
  -H "x-api-key: $ZENMUX_API_KEY" \
  -H "anthropic-version: 2023-06-01" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-sonnet-4.5",
    "max_tokens": 1024,
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "document",
            "source": {
              "type": "url",
              "url": "https://example.com/document.pdf"
            }
          },
          {
            "type": "text",
            "text": "请总结这个文档的主要内容"
          }
        ]
      }
    ]
  }'

Use Base64 encoding

python
# Example: base64 PDF input via the Anthropic Messages API on ZenMux.
import anthropic
import base64

def encode_pdf_to_base64(pdf_path):
    """Read the file at pdf_path and return its bytes as a base64 string."""
    with open(pdf_path, "rb") as pdf_file:
        return base64.b64encode(pdf_file.read()).decode("utf-8")

client = anthropic.Anthropic(
    api_key="<your ZENMUX_API_KEY>",
    base_url="https://zenmux.ai/api/anthropic"
)

base64_pdf = encode_pdf_to_base64("path/to/document.pdf")

message = client.messages.create(
    model="anthropic/claude-sonnet-4.5",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "document",
                    "source": {
                        # Bare base64 in "data" plus explicit media_type.
                        "type": "base64", 
                        "media_type": "application/pdf", 
                        "data": base64_pdf 
                    }
                },
                {
                    "type": "text",
                    "text": "Please summarize the main content of this document."
                }
            ]
        }
    ]
)

print(message.content[0].text)
bash
# First, encode the PDF as base64 (bare base64, no data: URL prefix).
BASE64_PDF=$(base64 -i path/to/document.pdf)

# Send it as an Anthropic base64 document source via the ZenMux endpoint.
curl https://zenmux.ai/api/anthropic/v1/messages \
  -H "x-api-key: $ZENMUX_API_KEY" \
  -H "anthropic-version: 2023-06-01" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-sonnet-4.5",
    "max_tokens": 1024,
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "document",
            "source": {
              "type": "base64",
              "media_type": "application/pdf",
              "data": "'"$BASE64_PDF"'"
            }
          },
          {
            "type": "text",
            "text": "请总结这个文档的主要内容"
          }
        ]
      }
    ]
  }'

Multiple documents

You can analyze multiple PDF documents in a single request:

python
# Example: several document blocks in one request, followed by the question.
# NOTE(review): `client` is not defined in this snippet — it reuses the
# Anthropic client constructed in the previous example.
message = client.messages.create(
    model="anthropic/claude-sonnet-4.5",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "document",
                    "source": {"type": "url", "url": "https://example.com/document1.pdf"}
                },
                {
                    "type": "document",
                    "source": {"type": "url", "url": "https://example.com/document2.pdf"}
                },
                {
                    "type": "text",
                    "text": "Please compare the content of these two documents."
                }
            ]
        }
    ]
)

Audio Input

Supports multiple audio formats: WAV, MP3, AIFF, AAC, OGG, FLAC

python
# Example: audio input via the Anthropic protocol, routed by ZenMux to a
# Gemini model (Claude models do not take audio; see the note above this
# section — audio URLs must be gs:// URIs).
import anthropic

client = anthropic.Anthropic(
    api_key="<your ZENMUX_API_KEY>",
    base_url="https://zenmux.ai/api/anthropic"
)

message = client.messages.create(
    model="google/gemini-2.5-pro", 
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    # ZenMux extension: "audio" block with a gs:// URL source.
                    "type": "audio", 
                    "source": { 
                        "type": "url", 
                        "url": "gs://cloud-samples-data/generative-ai/audio/pixel.mp3"
                    }
                },
                {
                    "type": "text",
                    "text": "Please describe the content of this audio clip."
                }
            ]
        }
    ]
)

print(message.content[0].text)
typescript
// Example: audio input via the Anthropic protocol, routed by ZenMux to a
// Gemini model (audio URLs must be gs:// URIs — see the note in this section).
import Anthropic from "@anthropic-ai/sdk";

const client = new Anthropic({
  apiKey: "<your ZENMUX_API_KEY>",
  baseURL: "https://zenmux.ai/api/anthropic",
});

const message = await client.messages.create({
  model: "google/gemini-2.5-pro", 
  max_tokens: 1024,
  messages: [
    {
      role: "user",
      content: [
        {
          // ZenMux extension: "audio" block with a gs:// URL source.
          type: "audio", 
          source: {
            type: "url", 
            url: "gs://cloud-samples-data/generative-ai/audio/pixel.mp3", 
          },
        },
        {
          type: "text",
          text: "Please describe the content of this audio clip.",
        },
      ],
    },
  ],
});

console.log(message.content[0].text);
bash
# Example: audio input via the ZenMux Anthropic-compatible endpoint (curl),
# routed to a Gemini model; the audio source is a gs:// URI.
curl https://zenmux.ai/api/anthropic/v1/messages \
  -H "x-api-key: $ZENMUX_API_KEY" \
  -H "anthropic-version: 2023-06-01" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "google/gemini-2.5-pro",
    "max_tokens": 1024,
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "audio",
            "source": {
              "type": "url",
              "url": "gs://cloud-samples-data/generative-ai/audio/pixel.mp3"
            }
          },
          {
            "type": "text",
            "text": "请描述这段音频的内容"
          }
        ]
      }
    ]
  }'

Video Input

Supports multiple video formats: MP4, AVI, MOV, MKV, WEBM, etc.

python
# Example: video input via the Anthropic protocol, routed by ZenMux to a
# Gemini model (video URLs must be gs:// URIs — see the note in this section).
import anthropic

client = anthropic.Anthropic(
    api_key="<your ZENMUX_API_KEY>",
    base_url="https://zenmux.ai/api/anthropic"
)

message = client.messages.create(
    model="google/gemini-2.5-pro", 
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    # ZenMux extension: "video" block with a gs:// URL source.
                    "type": "video", 
                    "source": { 
                        "type": "url", 
                        "url": "gs://cloud-samples-data/video/animals.mp4"
                    }
                },
                {
                    "type": "text",
                    "text": "Please describe the content of this video."
                }
            ]
        }
    ]
)

print(message.content[0].text)
typescript
// Example: video input via the Anthropic protocol, routed by ZenMux to a
// Gemini model (video URLs must be gs:// URIs — see the note in this section).
import Anthropic from "@anthropic-ai/sdk";

const client = new Anthropic({
  apiKey: "<your ZENMUX_API_KEY>",
  baseURL: "https://zenmux.ai/api/anthropic",
});

const message = await client.messages.create({
  model: "google/gemini-2.5-pro", 
  max_tokens: 1024,
  messages: [
    {
      role: "user",
      content: [
        {
          // ZenMux extension: "video" block with a gs:// URL source.
          type: "video", 
          source: {
            type: "url", 
            url: "gs://cloud-samples-data/video/animals.mp4", 
          },
        },
        {
          type: "text",
          text: "Please describe the content of this video.",
        },
      ],
    },
  ],
});

console.log(message.content[0].text);
bash
curl https://zenmux.ai/api/anthropic/v1/messages \
  -H "x-api-key: $ZENMUX_API_KEY" \
  -H "anthropic-version: 2023-06-01" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "google/gemini-2.5-pro",
    "max_tokens": 1024,
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "video",
            "source": {
              "type": "url",
              "url": "gs://cloud-samples-data/video/animals.mp4"
            }
          },
          {
            "type": "text",
            "text": "Please describe the content of this video."
          }
        ]
      }
    ]
  }'

Google Vertex AI API

Vertex AI’s Gemini models use Part objects to pass multimodal content, supporting images, PDFs, videos, and more.

Supported formats

| Type     | Supported formats              |
| -------- | ------------------------------ |
| Image    | PNG, JPEG, WebP, HEIC, HEIF    |
| Document | PDF                            |
| Audio    | WAV, MP3, AIFF, AAC, OGG, FLAC |
| Video    | MP4, AVI, MOV, MKV, WEBM, etc. |

Image Input

python
from google import genai
from google.genai import types

client = genai.Client(
    api_key="<your ZENMUX_API_KEY>", 
    vertexai=True,
    http_options=types.HttpOptions(
        api_version='v1',
        base_url='https://zenmux.ai/api/vertex-ai'
    ),
)

# Use an image URL
response = client.models.generate_content(
    model="google/gemini-2.5-pro", 
    contents=[
        types.Part.from_uri( 
            file_uri="https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png",
            mime_type="image/png"
        ),
        "Please analyze the content of the image."
    ]
)

print(response.text)
typescript
import { GoogleGenAI } from "@google/genai";

const client = new GoogleGenAI({
  apiKey: "<your ZENMUX_API_KEY>", 
  vertexai: true,
  httpOptions: {
    baseUrl: "https://zenmux.ai/api/vertex-ai", 
    apiVersion: "v1",
  },
});

const response = await client.models.generateContent({
  model: "google/gemini-2.5-pro", 
  contents: [
    {
      role: "user",
      parts: [
        {
          fileData: {
            fileUri:
              "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png", 
            mimeType: "image/png", 
          },
        },
        { text: "Please analyze the content of the image." },
      ],
    },
  ],
});

console.log(response.text);
bash
curl https://zenmux.ai/api/vertex-ai/v1/projects/PROJECT_ID/locations/LOCATION/publishers/google/models/gemini-2.5-pro:generateContent \
  -H "Authorization: Bearer $ZENMUX_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "contents": [{
      "role": "user",
      "parts": [
        {
          "fileData": {
            "fileUri": "https://cdn.marmot-cloud.com/storage/tbox-router/2025/08/05/e9445SU/shengchengtupian2025-04-09-19_31.png",
            "mimeType": "image/png"
          }
        },
        {"text": "Please analyze the content of the image."}
      ]
    }]
  }'

Use Base64 encoding

python
import base64
from google import genai
from google.genai import types

def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

client = genai.Client(
    api_key="<your ZENMUX_API_KEY>",
    vertexai=True,
    http_options=types.HttpOptions(
        api_version='v1',
        base_url='https://zenmux.ai/api/vertex-ai'
    ),
)

base64_image = encode_image_to_base64("path/to/image.jpg")

response = client.models.generate_content(
    model="google/gemini-2.5-pro",
    contents=[
        types.Part.from_bytes( 
            data=base64.b64decode(base64_image), 
            mime_type="image/jpeg"
        ),
        "Please analyze the content of the image."
    ]
)

print(response.text)

PDF Input

python
# Use a PDF URL
response = client.models.generate_content(
    model="google/gemini-2.5-pro",
    contents=[
        types.Part.from_uri(
            file_uri="https://example.com/document.pdf",
            mime_type="application/pdf"
        ),
        "Please summarize the main content of this document."
    ]
)

print(response.text)

Multiple images

python
response = client.models.generate_content(
    model="google/gemini-2.5-pro",
    contents=[
        types.Part.from_uri(
            file_uri="https://example.com/image1.jpg",
            mime_type="image/jpeg"
        ),
        types.Part.from_uri(
            file_uri="https://example.com/image2.jpg",
            mime_type="image/jpeg"
        ),
        "Please compare the similarities and differences between these two images."
    ]
)

Audio Input

Gemini supports multiple audio formats: WAV, MP3, AIFF, AAC, OGG, FLAC

python
from google import genai
from google.genai import types

client = genai.Client(
    api_key="<your ZENMUX_API_KEY>",
    vertexai=True,
    http_options=types.HttpOptions(
        api_version='v1',
        base_url='https://zenmux.ai/api/vertex-ai'
    ),
)

# Use an audio URL
response = client.models.generate_content(
    model="google/gemini-2.5-pro",
    contents=[
        types.Part.from_uri(
            file_uri="https://example.com/audio.mp3", 
            mime_type="audio/mp3"
        ),
        "Please describe the content of this audio clip."
    ]
)

print(response.text)
bash
curl https://zenmux.ai/api/vertex-ai/v1/projects/PROJECT_ID/locations/LOCATION/publishers/google/models/gemini-2.5-pro:generateContent \
  -H "Authorization: Bearer $ZENMUX_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "contents": [{
      "role": "user",
      "parts": [
        {
          "fileData": {
            "fileUri": "https://example.com/audio.mp3",
            "mimeType": "audio/mp3"
          }
        },
        {"text": "Please describe the content of this audio clip."}
      ]
    }]
  }'

Video Input

Gemini supports multiple video formats: MP4, AVI, MOV, MKV, WEBM, etc.

python
from google import genai
from google.genai import types

client = genai.Client(
    api_key="<your ZENMUX_API_KEY>",
    vertexai=True,
    http_options=types.HttpOptions(
        api_version='v1',
        base_url='https://zenmux.ai/api/vertex-ai'
    ),
)

# Use a video URL
response = client.models.generate_content(
    model="google/gemini-2.5-pro",
    contents=[
        types.Part.from_uri(
            file_uri="https://example.com/video.mp4", 
            mime_type="video/mp4"
        ),
        "Please describe the content of this video."
    ]
)

print(response.text)
bash
curl https://zenmux.ai/api/vertex-ai/v1/projects/PROJECT_ID/locations/LOCATION/publishers/google/models/gemini-2.5-pro:generateContent \
  -H "Authorization: Bearer $ZENMUX_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "contents": [{
      "role": "user",
      "parts": [
        {
          "fileData": {
            "fileUri": "https://example.com/video.mp4",
            "mimeType": "video/mp4"
          }
        },
        {"text": "Please describe the content of this video."}
      ]
    }]
  }'

Protocol Comparison

| Feature         | Chat Completion    | Responses API     | Anthropic Messages | Vertex AI    |
| --------------- | ------------------ | ----------------- | ------------------ | ------------ |
| Image type name | `image_url`        | `input_image`     | `image`            | `Part`       |
| URL support     | `url` field        | `image_url` field | `type: "url"`      | `file_uri`   |
| Base64 support  | ✅ data URL        | ✅ data URL       | `type: "base64"`   | `from_bytes` |
| PDF support     | `file` type        | `input_file`      | `document` type    | `mime_type`  |
| Audio support   | `input_audio` type | ❌ Not supported  | `audio` type       | `audio/*`    |
| Video support   | `file` type        | ❌ Not supported  | `video` type       | `video/*`    |
Multiple images✅ Up to 100 images✅ Up to 3000 images