Gemini 模型從一開始就建構於多模態的基礎上,因此可執行各種圖像處理和電腦視覺工作,包括但不限於生成圖像說明、分類和回答圖像問題,不必訓練專門的機器學習模型。
將圖片傳送給 Gemini
你可以透過下列兩種方式,將圖片做為 Gemini 的輸入內容:
- 傳遞內嵌圖片資料:適合較小的檔案 (包括提示在內,要求總大小小於 20 MB)。
- 使用 File API 上傳圖片:建議用於較大的檔案,或在多項要求中重複使用圖片。
傳遞內嵌圖片資料
您可以在對 generateContent
的要求中傳遞內嵌圖片資料。您可以提供 Base64 編碼字串形式的圖片資料,也可以直接讀取本機檔案 (視語言而定)。
以下範例說明如何從本機檔案讀取圖片,並傳遞至 generateContent
API 進行處理。
Python
from google.genai import types
with open('path/to/small-sample.jpg', 'rb') as f:
image_bytes = f.read()
response = client.models.generate_content(
model='gemini-2.5-flash',
contents=[
types.Part.from_bytes(
data=image_bytes,
mime_type='image/jpeg',
),
'Caption this image.'
]
)
print(response.text)
JavaScript
import { GoogleGenAI } from "@google/genai";
import * as fs from "node:fs";
const ai = new GoogleGenAI({});
const base64ImageFile = fs.readFileSync("path/to/small-sample.jpg", {
encoding: "base64",
});
const contents = [
{
inlineData: {
mimeType: "image/jpeg",
data: base64ImageFile,
},
},
{ text: "Caption this image." },
];
const response = await ai.models.generateContent({
model: "gemini-2.5-flash",
contents: contents,
});
console.log(response.text);
Go
bytes, _ := os.ReadFile("path/to/small-sample.jpg")
parts := []*genai.Part{
genai.NewPartFromBytes(bytes, "image/jpeg"),
genai.NewPartFromText("Caption this image."),
}
contents := []*genai.Content{
genai.NewContentFromParts(parts, genai.RoleUser),
}
result, _ := client.Models.GenerateContent(
ctx,
"gemini-2.5-flash",
contents,
nil,
)
fmt.Println(result.Text())
REST
IMG_PATH="/path/to/your/image1.jpg"
if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
B64FLAGS="--input"
else
B64FLAGS="-w0"
fi
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[
{
"inline_data": {
"mime_type":"image/jpeg",
"data": "'"$(base64 $B64FLAGS $IMG_PATH)"'"
}
},
{"text": "Caption this image."},
]
}]
}' 2> /dev/null
您也可以從網址擷取圖片、轉換為位元組,然後傳遞至 generateContent
,如以下範例所示。
Python
from google import genai
from google.genai import types
import requests
image_path = "https://goo.gle/instrument-img"
image_bytes = requests.get(image_path).content
image = types.Part.from_bytes(
data=image_bytes, mime_type="image/jpeg"
)
client = genai.Client()
response = client.models.generate_content(
model="gemini-2.5-flash",
contents=["What is this image?", image],
)
print(response.text)
JavaScript
import { GoogleGenAI } from "@google/genai";
async function main() {
const ai = new GoogleGenAI({});
const imageUrl = "https://goo.gle/instrument-img";
const response = await fetch(imageUrl);
const imageArrayBuffer = await response.arrayBuffer();
const base64ImageData = Buffer.from(imageArrayBuffer).toString('base64');
const result = await ai.models.generateContent({
model: "gemini-2.5-flash",
contents: [
{
inlineData: {
mimeType: 'image/jpeg',
data: base64ImageData,
},
},
{ text: "Caption this image." }
],
});
console.log(result.text);
}
main();
Go
package main
import (
"context"
"fmt"
"os"
"io"
"net/http"
"google.golang.org/genai"
)
func main() {
ctx := context.Background()
client, err := genai.NewClient(ctx, nil)
if err != nil {
log.Fatal(err)
}
// Download the image.
imageResp, _ := http.Get("https://goo.gle/instrument-img")
imageBytes, _ := io.ReadAll(imageResp.Body)
parts := []*genai.Part{
genai.NewPartFromBytes(imageBytes, "image/jpeg"),
genai.NewPartFromText("Caption this image."),
}
contents := []*genai.Content{
genai.NewContentFromParts(parts, genai.RoleUser),
}
result, _ := client.Models.GenerateContent(
ctx,
"gemini-2.5-flash",
contents,
nil,
)
fmt.Println(result.Text())
}
REST
IMG_URL="https://goo.gle/instrument-img"
MIME_TYPE=$(curl -sIL "$IMG_URL" | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1)
if [[ -z "$MIME_TYPE" || ! "$MIME_TYPE" == image/* ]]; then
MIME_TYPE="image/jpeg"
fi
# Check for macOS
if [[ "$(uname)" == "Darwin" ]]; then
IMAGE_B64=$(curl -sL "$IMG_URL" | base64 -b 0)
elif [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
IMAGE_B64=$(curl -sL "$IMG_URL" | base64)
else
IMAGE_B64=$(curl -sL "$IMG_URL" | base64 -w0)
fi
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[
{
"inline_data": {
"mime_type":"'"$MIME_TYPE"'",
"data": "'"$IMAGE_B64"'"
}
},
{"text": "Caption this image."}
]
}]
}' 2> /dev/null
使用 File API 上傳圖片
如要上傳大型檔案或重複使用同一張圖片,請使用 Files API。下列程式碼會上傳圖片檔案,然後在呼叫 generateContent
時使用該檔案。如需詳細資訊和範例,請參閱 Files API 指南。
Python
from google import genai
client = genai.Client()
my_file = client.files.upload(file="path/to/sample.jpg")
response = client.models.generate_content(
model="gemini-2.5-flash",
contents=[my_file, "Caption this image."],
)
print(response.text)
JavaScript
import {
GoogleGenAI,
createUserContent,
createPartFromUri,
} from "@google/genai";
const ai = new GoogleGenAI({});
async function main() {
const myfile = await ai.files.upload({
file: "path/to/sample.jpg",
config: { mimeType: "image/jpeg" },
});
const response = await ai.models.generateContent({
model: "gemini-2.5-flash",
contents: createUserContent([
createPartFromUri(myfile.uri, myfile.mimeType),
"Caption this image.",
]),
});
console.log(response.text);
}
await main();
Go
package main
import (
"context"
"fmt"
"os"
"google.golang.org/genai"
)
func main() {
ctx := context.Background()
client, err := genai.NewClient(ctx, nil)
if err != nil {
log.Fatal(err)
}
uploadedFile, _ := client.Files.UploadFromPath(ctx, "path/to/sample.jpg", nil)
parts := []*genai.Part{
genai.NewPartFromText("Caption this image."),
genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
}
contents := []*genai.Content{
genai.NewContentFromParts(parts, genai.RoleUser),
}
result, _ := client.Models.GenerateContent(
ctx,
"gemini-2.5-flash",
contents,
nil,
)
fmt.Println(result.Text())
}
REST
IMAGE_PATH="path/to/sample.jpg"
MIME_TYPE=$(file -b --mime-type "${IMAGE_PATH}")
NUM_BYTES=$(wc -c < "${IMAGE_PATH}")
DISPLAY_NAME=IMAGE
tmp_header_file=upload-header.tmp
# Initial resumable request defining metadata.
# The upload url is in the response headers dump them to a file.
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-D upload-header.tmp \
-H "X-Goog-Upload-Protocol: resumable" \
-H "X-Goog-Upload-Command: start" \
-H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
-H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
-H "Content-Type: application/json" \
-d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null
upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file}"
# Upload the actual bytes.
curl "${upload_url}" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-H "Content-Length: ${NUM_BYTES}" \
-H "X-Goog-Upload-Offset: 0" \
-H "X-Goog-Upload-Command: upload, finalize" \
--data-binary "@${IMAGE_PATH}" 2> /dev/null > file_info.json
file_uri=$(jq -r ".file.uri" file_info.json)
echo file_uri=$file_uri
# Now generate content using that file
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[
{"file_data":{"mime_type": "'"${MIME_TYPE}"'", "file_uri": "'"${file_uri}"'"}},
{"text": "Caption this image."}]
}]
}' 2> /dev/null > response.json
cat response.json
echo
jq ".candidates[].content.parts[].text" response.json
使用多張圖片撰寫提示
您可以在 contents
陣列中加入多個圖片 Part
物件,在單一提示中提供多張圖片。這些可以是內嵌資料 (本機檔案或網址) 和 File API 參照的組合。
Python
from google import genai
from google.genai import types
client = genai.Client()
# Upload the first image
image1_path = "path/to/image1.jpg"
uploaded_file = client.files.upload(file=image1_path)
# Prepare the second image as inline data
image2_path = "path/to/image2.png"
with open(image2_path, 'rb') as f:
img2_bytes = f.read()
# Create the prompt with text and multiple images
response = client.models.generate_content(
model="gemini-2.5-flash",
contents=[
"What is different between these two images?",
uploaded_file, # Use the uploaded file reference
types.Part.from_bytes(
data=img2_bytes,
mime_type='image/png'
)
]
)
print(response.text)
JavaScript
import {
GoogleGenAI,
createUserContent,
createPartFromUri,
} from "@google/genai";
import * as fs from "node:fs";
const ai = new GoogleGenAI({});
async function main() {
// Upload the first image
const image1_path = "path/to/image1.jpg";
const uploadedFile = await ai.files.upload({
file: image1_path,
config: { mimeType: "image/jpeg" },
});
// Prepare the second image as inline data
const image2_path = "path/to/image2.png";
const base64Image2File = fs.readFileSync(image2_path, {
encoding: "base64",
});
// Create the prompt with text and multiple images
const response = await ai.models.generateContent({
model: "gemini-2.5-flash",
contents: createUserContent([
"What is different between these two images?",
createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
{
inlineData: {
mimeType: "image/png",
data: base64Image2File,
},
},
]),
});
console.log(response.text);
}
await main();
Go
// Upload the first image
image1Path := "path/to/image1.jpg"
uploadedFile, _ := client.Files.UploadFromPath(ctx, image1Path, nil)
// Prepare the second image as inline data
image2Path := "path/to/image2.jpeg"
imgBytes, _ := os.ReadFile(image2Path)
parts := []*genai.Part{
genai.NewPartFromText("What is different between these two images?"),
genai.NewPartFromBytes(imgBytes, "image/jpeg"),
genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
}
contents := []*genai.Content{
genai.NewContentFromParts(parts, genai.RoleUser),
}
result, _ := client.Models.GenerateContent(
ctx,
"gemini-2.5-flash",
contents,
nil,
)
fmt.Println(result.Text())
REST
# Upload the first image
IMAGE1_PATH="path/to/image1.jpg"
MIME1_TYPE=$(file -b --mime-type "${IMAGE1_PATH}")
NUM1_BYTES=$(wc -c < "${IMAGE1_PATH}")
DISPLAY_NAME1=IMAGE1
tmp_header_file1=upload-header1.tmp
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-D upload-header1.tmp \
-H "X-Goog-Upload-Protocol: resumable" \
-H "X-Goog-Upload-Command: start" \
-H "X-Goog-Upload-Header-Content-Length: ${NUM1_BYTES}" \
-H "X-Goog-Upload-Header-Content-Type: ${MIME1_TYPE}" \
-H "Content-Type: application/json" \
-d "{'file': {'display_name': '${DISPLAY_NAME1}'}}" 2> /dev/null
upload_url1=$(grep -i "x-goog-upload-url: " "${tmp_header_file1}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file1}"
curl "${upload_url1}" \
-H "Content-Length: ${NUM1_BYTES}" \
-H "X-Goog-Upload-Offset: 0" \
-H "X-Goog-Upload-Command: upload, finalize" \
--data-binary "@${IMAGE1_PATH}" 2> /dev/null > file_info1.json
file1_uri=$(jq ".file.uri" file_info1.json)
echo file1_uri=$file1_uri
# Prepare the second image (inline)
IMAGE2_PATH="path/to/image2.png"
MIME2_TYPE=$(file -b --mime-type "${IMAGE2_PATH}")
if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
B64FLAGS="--input"
else
B64FLAGS="-w0"
fi
IMAGE2_BASE64=$(base64 $B64FLAGS $IMAGE2_PATH)
# Now generate content using both images
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[
{"text": "What is different between these two images?"},
{"file_data":{"mime_type": "'"${MIME1_TYPE}"'", "file_uri": '$file1_uri'}},
{
"inline_data": {
"mime_type":"'"${MIME2_TYPE}"'",
"data": "'"$IMAGE2_BASE64"'"
}
}
]
}]
}' 2> /dev/null > response.json
cat response.json
echo
jq ".candidates[].content.parts[].text" response.json
物件偵測
從 Gemini 2.0 開始,模型會進一步訓練,以偵測圖片中的物件並取得定界框座標。座標會根據圖片尺寸縮放至 [0, 1000]。您需要根據原始圖片大小,縮放這些座標。
Python
from google import genai
from google.genai import types
from PIL import Image
import json
client = genai.Client()
prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000."
image = Image.open("/path/to/image.png")
config = types.GenerateContentConfig(
response_mime_type="application/json"
)
response = client.models.generate_content(model="gemini-2.5-flash",
contents=[image, prompt],
config=config
)
width, height = image.size
bounding_boxes = json.loads(response.text)
converted_bounding_boxes = []
for bounding_box in bounding_boxes:
abs_y1 = int(bounding_box["box_2d"][0]/1000 * height)
abs_x1 = int(bounding_box["box_2d"][1]/1000 * width)
abs_y2 = int(bounding_box["box_2d"][2]/1000 * height)
abs_x2 = int(bounding_box["box_2d"][3]/1000 * width)
converted_bounding_boxes.append([abs_x1, abs_y1, abs_x2, abs_y2])
print("Image size: ", width, height)
print("Bounding boxes:", converted_bounding_boxes)
如需更多範例,請參閱 Gemini 教戰手冊中的下列筆記本:
區隔
從 Gemini 2.5 開始,模型不僅能偵測項目,還能將項目區隔開來,並提供輪廓遮罩。
模型會預測 JSON 清單,其中每個項目代表一個區隔遮罩。每個項目都有定界框 (「box_2d
」),格式為 [y0, x0, y1, x1]
,其中包含介於 0 到 1000 之間的標準化座標、識別物件的標籤 (「label
」),以及定界框內的區隔遮罩 (以 Base64 編碼的 PNG,是值介於 0 到 255 之間的機率地圖)。
遮罩必須調整大小,與周框方塊的尺寸相符,然後在可信度門檻 (中點為 127) 進行二元化。
Python
from google import genai
from google.genai import types
from PIL import Image, ImageDraw
import io
import base64
import json
import numpy as np
import os
client = genai.Client()
def parse_json(json_output: str):
# Parsing out the markdown fencing
lines = json_output.splitlines()
for i, line in enumerate(lines):
if line == "```json":
json_output = "\n".join(lines[i+1:]) # Remove everything before "```json"
output = json_output.split("```")[0] # Remove everything after the closing "```"
break # Exit the loop once "```json" is found
return json_output
def extract_segmentation_masks(image_path: str, output_dir: str = "segmentation_outputs"):
# Load and resize image
im = Image.open(image_path)
im.thumbnail([1024, 1024], Image.Resampling.LANCZOS)
prompt = """
Give the segmentation masks for the wooden and glass items.
Output a JSON list of segmentation masks where each entry contains the 2D
bounding box in the key "box_2d", the segmentation mask in key "mask", and
the text label in the key "label". Use descriptive labels.
"""
config = types.GenerateContentConfig(
thinking_config=types.ThinkingConfig(thinking_budget=0) # set thinking_budget to 0 for better results in object detection
)
response = client.models.generate_content(
model="gemini-2.5-flash",
contents=[prompt, im], # Pillow images can be directly passed as inputs (which will be converted by the SDK)
config=config
)
# Parse JSON response
items = json.loads(parse_json(response.text))
# Create output directory
os.makedirs(output_dir, exist_ok=True)
# Process each mask
for i, item in enumerate(items):
# Get bounding box coordinates
box = item["box_2d"]
y0 = int(box[0] / 1000 * im.size[1])
x0 = int(box[1] / 1000 * im.size[0])
y1 = int(box[2] / 1000 * im.size[1])
x1 = int(box[3] / 1000 * im.size[0])
# Skip invalid boxes
if y0 >= y1 or x0 >= x1:
continue
# Process mask
png_str = item["mask"]
if not png_str.startswith("data:image/png;base64,"):
continue
# Remove prefix
png_str = png_str.removeprefix("data:image/png;base64,")
mask_data = base64.b64decode(png_str)
mask = Image.open(io.BytesIO(mask_data))
# Resize mask to match bounding box
mask = mask.resize((x1 - x0, y1 - y0), Image.Resampling.BILINEAR)
# Convert mask to numpy array for processing
mask_array = np.array(mask)
# Create overlay for this mask
overlay = Image.new('RGBA', im.size, (0, 0, 0, 0))
overlay_draw = ImageDraw.Draw(overlay)
# Create overlay for the mask
color = (255, 255, 255, 200)
for y in range(y0, y1):
for x in range(x0, x1):
if mask_array[y - y0, x - x0] > 128: # Threshold for mask
overlay_draw.point((x, y), fill=color)
# Save individual mask and its overlay
mask_filename = f"{item['label']}_{i}_mask.png"
overlay_filename = f"{item['label']}_{i}_overlay.png"
mask.save(os.path.join(output_dir, mask_filename))
# Create and save overlay
composite = Image.alpha_composite(im.convert('RGBA'), overlay)
composite.save(os.path.join(output_dir, overlay_filename))
print(f"Saved mask and overlay for {item['label']} to {output_dir}")
# Example usage
if __name__ == "__main__":
extract_segmentation_masks("path/to/image.png")
如需更詳細的範例,請參閱食譜指南中的區隔範例。

支援的圖片格式
Gemini 支援下列圖片格式的 MIME 類型:
- PNG -
image/png
- JPEG -
image/jpeg
- WebP -
image/webp
- HEIC -
image/heic
- HEIF -
image/heif
功能
所有 Gemini 模型版本都是多模態,可用於各種圖像處理和電腦視覺工作,包括但不限於圖像說明、圖像問題與回答、圖像分類、物件偵測和分割。
視品質和效能需求而定,Gemini 可減少使用專業機器學習模型的需求。
除了通用功能外,部分後續模型版本還經過特別訓練,可提升特定工作的準確度:
限制和重要技術資訊
檔案限制
Gemini 2.5 Pro/Flash、2.0 Flash、1.5 Pro 和 1.5 Flash 支援每個要求最多 3,600 個圖片檔案。
代幣計算
- Gemini 1.5 Flash 和 Gemini 1.5 Pro:如果兩個維度都 <= 384 像素,則為 258 個權杖。較大的圖片會以圖塊顯示 (最小圖塊 256 像素,最大 768 像素,調整大小為 768x768),每個圖塊的費用為 258 個權杖。
- Gemini 2.0 Flash 和 Gemini 2.5 Flash/Pro:如果兩個維度都 <= 384 像素,則為 258 個權杖。 較大的圖片會分割成 768x768 像素的圖塊,每個圖塊需支付 258 個權杖。
提示與最佳做法
- 確認圖片已正確旋轉。
- 使用清晰的圖片,不要模糊不清。
- 使用含有文字的單一圖片時,請將文字提示放在
contents
陣列的圖片部分之後。
後續步驟
本指南說明如何上傳圖片檔案,並從圖片輸入內容生成文字輸出內容。如要進一步瞭解相關內容,請參閱下列資源: