package gemini

import (
	"bufio"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"

	"github.com/gin-gonic/gin"

	"github.com/songquanpeng/one-api/common"
	"github.com/songquanpeng/one-api/common/config"
	"github.com/songquanpeng/one-api/common/helper"
	"github.com/songquanpeng/one-api/common/image"
	"github.com/songquanpeng/one-api/common/logger"
	"github.com/songquanpeng/one-api/common/random"
	"github.com/songquanpeng/one-api/common/render"
	"github.com/songquanpeng/one-api/relay/adaptor/openai"
	"github.com/songquanpeng/one-api/relay/constant"
	"github.com/songquanpeng/one-api/relay/model"
)

// https://ai.google.dev/docs/gemini_api_overview?hl=zh-cn

const (
	VisionMaxImageNum = 16
)

var mimeTypeMap = map[string]string{
	"json_object": "application/json",
	"text":        "text/plain",
}
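
// Illustrative mapping (shape only): an OpenAI request carrying
// response_format {"type": "json_object"} produces
// generationConfig.responseMimeType = "application/json" in the Gemini payload.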

// sanitizeSchema recursively removes JSON Schema keywords unsupported by Gemini
// (e.g. "const", "$schema", "additionalProperties") from a schema map.
func sanitizeSchema(v interface{}) interface{} {
	switch val := v.(type) {
	case map[string]interface{}:
		// Only remove fields Gemini explicitly rejects; leave others intact
		unsupported := []string{"const", "$schema", "additionalProperties"}
		for _, key := range unsupported {
			delete(val, key)
		}
		for k, child := range val {
			val[k] = sanitizeSchema(child)
		}
		return val
	case []interface{}:
		for i, item := range val {
			val[i] = sanitizeSchema(item)
		}
		return val
	}
	return v
}
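
// For example (illustrative schema, not taken from the test suite), the input
//
//	{"type": "object", "additionalProperties": false, "properties": {"id": {"const": "x"}}}
//
// is rewritten to
//
//	{"type": "object", "properties": {"id": {}}}
//
// Note that sanitizeSchema mutates the map it is given rather than copying it.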

// Setting safety to the lowest possible values since Gemini is already powerless enough
func ConvertRequest(textRequest model.GeneralOpenAIRequest) *ChatRequest {
	geminiRequest := ChatRequest{
		Contents: make([]ChatContent, 0, len(textRequest.Messages)),
		SafetySettings: []ChatSafetySettings{
			{
				Category:  "HARM_CATEGORY_HARASSMENT",
				Threshold: config.GeminiSafetySetting,
			},
			{
				Category:  "HARM_CATEGORY_HATE_SPEECH",
				Threshold: config.GeminiSafetySetting,
			},
			{
				Category:  "HARM_CATEGORY_SEXUALLY_EXPLICIT",
				Threshold: config.GeminiSafetySetting,
			},
			{
				Category:  "HARM_CATEGORY_DANGEROUS_CONTENT",
				Threshold: config.GeminiSafetySetting,
			},
			{
				Category:  "HARM_CATEGORY_CIVIC_INTEGRITY",
				Threshold: config.GeminiSafetySetting,
			},
		},
		GenerationConfig: ChatGenerationConfig{
			Temperature:     textRequest.Temperature,
			TopP:            textRequest.TopP,
			MaxOutputTokens: textRequest.MaxTokens,
		},
	}
	if textRequest.ResponseFormat != nil {
		if mimeType, ok := mimeTypeMap[textRequest.ResponseFormat.Type]; ok {
			geminiRequest.GenerationConfig.ResponseMimeType = mimeType
		}
		if textRequest.ResponseFormat.JsonSchema != nil {
			geminiRequest.GenerationConfig.ResponseSchema = textRequest.ResponseFormat.JsonSchema.Schema
			geminiRequest.GenerationConfig.ResponseMimeType = mimeTypeMap["json_object"]
		}
	}
	// For models that support image generation (e.g. gemini-2.5-flash-image),
	// request both TEXT and IMAGE modalities so the model returns inline images.
	if strings.Contains(strings.ToLower(textRequest.Model), "image") {
		geminiRequest.GenerationConfig.ResponseModalities = []string{"TEXT", "IMAGE"}
	}
	// Enable thinking when the client explicitly requests it via enable_thinking=true.
	// Use thinkingBudget=-1 (dynamic) so Gemini decides the appropriate budget.
	if textRequest.EnableThinking {
		geminiRequest.GenerationConfig.ThinkingConfig = &GeminiThinkingConfig{ThinkingBudget: -1}
	}
	if textRequest.Tools != nil {
		functions := make([]model.Function, 0, len(textRequest.Tools))
		for _, tool := range textRequest.Tools {
			fn := tool.Function
			if fn.Parameters != nil {
				fn.Parameters = sanitizeSchema(fn.Parameters)
			}
			functions = append(functions, fn)
		}
		geminiRequest.Tools = []ChatTools{
			{
				FunctionDeclarations: functions,
			},
		}
	} else if textRequest.Functions != nil {
		geminiRequest.Tools = []ChatTools{
			{
				FunctionDeclarations: textRequest.Functions,
			},
		}
	}
	// Build a map from tool_call_id → function name for resolving tool result names
	toolCallIdToName := map[string]string{}
	for _, message := range textRequest.Messages {
		if message.Role == "assistant" {
			for _, tc := range message.ToolCalls {
				if tc.Id != "" && tc.Function.Name != "" {
					toolCallIdToName[tc.Id] = tc.Function.Name
				}
			}
		}
	}
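	// Gemini matches a tool result to its call by function name rather than by id,
	// so role=tool messages below (which carry only tool_call_id) are resolved
	// through this map before being emitted as functionResponse parts.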
	shouldAddDummyModelMessage := false
	for _, message := range textRequest.Messages {
		// --- tool result: role=tool → Gemini functionResponse (user role) ---
		if message.Role == "tool" {
			toolName := message.ToolCallId
			if name, ok := toolCallIdToName[message.ToolCallId]; ok {
				toolName = name
			} else if message.Name != nil && *message.Name != "" {
				toolName = *message.Name
			}
			if toolName == "" {
				toolName = "unknown_tool"
			}
			geminiRequest.Contents = append(geminiRequest.Contents, ChatContent{
				Role: "user",
				Parts: []Part{
					{
						FunctionResponse: &FunctionResponse{
							Name:     toolName,
							Response: map[string]any{"content": message.StringContent()},
						},
					},
				},
			})
			continue
		}
		content := ChatContent{
			Role: message.Role,
			Parts: []Part{
				{
					Text: message.StringContent(),
				},
			},
		}
		openaiContent := message.ParseContent()
		var parts []Part
		imageNum := 0
		for _, part := range openaiContent {
			if part.Type == model.ContentTypeText {
				parts = append(parts, Part{
					Text: part.Text,
				})
			} else if part.Type == model.ContentTypeImageURL {
				mimeType, data, _ := image.GetImageFromUrl(part.ImageURL.Url)
				// Only count images toward the image limit; video/audio have no such limit
				isImage := strings.HasPrefix(mimeType, "image/")
				if isImage {
					imageNum += 1
					if imageNum > VisionMaxImageNum {
						continue
					}
				}
				parts = append(parts, Part{
					InlineData: &InlineData{
						MimeType: mimeType,
						Data:     data,
					},
				})
			} else if part.Type == model.ContentTypeVideoURL {
				mimeType, data, _ := image.GetImageFromUrl(part.VideoURL.Url)
				if data != "" {
					parts = append(parts, Part{
						InlineData: &InlineData{
							MimeType: mimeType,
							Data:     data,
						},
					})
				}
			} else if part.Type == model.ContentTypeInputAudio {
				// input_audio: { data: "base64...", format: "mp3" }
				// Convert directly to Gemini inlineData — bypasses Zenmux fileUri conversion
				// that occurs when audio is embedded in image_url.
				if part.InputAudio != nil && part.InputAudio.Data != "" {
					mimeType := "audio/" + part.InputAudio.Format
					if part.InputAudio.Format == "" {
						mimeType = "audio/webm"
					}
					parts = append(parts, Part{
						InlineData: &InlineData{
							MimeType: mimeType,
							Data:     part.InputAudio.Data,
						},
					})
				}
			}
		}
		// --- assistant with tool_calls → Gemini functionCall parts ---
		if message.Role == "assistant" && len(message.ToolCalls) > 0 {
			var fcParts []Part
			// Include any text content first
			for _, p := range parts {
				if p.Text != "" {
					fcParts = append(fcParts, p)
				}
			}
			for _, tc := range message.ToolCalls {
				var args any
				if argStr, ok := tc.Function.Arguments.(string); ok && argStr != "" {
					if err := json.Unmarshal([]byte(argStr), &args); err != nil {
						args = map[string]any{}
					}
				} else {
					args = map[string]any{}
				}
				fcParts = append(fcParts, Part{
					FunctionCall: &FunctionCall{
						FunctionName: tc.Function.Name,
						Arguments:    args,
					},
				})
			}
			content.Role = "model"
			content.Parts = fcParts
			geminiRequest.Contents = append(geminiRequest.Contents, content)
			continue
		}
		content.Parts = parts
		// there's no assistant role in gemini and API shall vomit if Role is not user or model
		if content.Role == "assistant" {
			content.Role = "model"
		}
		// Converting system prompt to prompt from user for the same reason
		if content.Role == "system" {
			shouldAddDummyModelMessage = true
			if IsModelSupportSystemInstruction(textRequest.Model) {
				geminiRequest.SystemInstruction = &content
				geminiRequest.SystemInstruction.Role = ""
				continue
			} else {
				content.Role = "user"
			}
		}
		geminiRequest.Contents = append(geminiRequest.Contents, content)
		// If a system message is the last message, we need to add a dummy model message to make gemini happy
		if shouldAddDummyModelMessage {
			geminiRequest.Contents = append(geminiRequest.Contents, ChatContent{
				Role: "model",
				Parts: []Part{
					{
						Text: "Okay",
					},
				},
			})
			shouldAddDummyModelMessage = false
		}
	}
	return &geminiRequest
}
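
// ConvertEmbeddingRequest converts an OpenAI embedding request into a Gemini
// batch embedding request, emitting one EmbeddingRequest per parsed input string.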
func ConvertEmbeddingRequest(request model.GeneralOpenAIRequest) *BatchEmbeddingRequest {
	inputs := request.ParseInput()
	requests := make([]EmbeddingRequest, len(inputs))
	// Named modelName to avoid shadowing the imported model package.
	modelName := fmt.Sprintf("models/%s", request.Model)
	for i, input := range inputs {
		requests[i] = EmbeddingRequest{
			Model: modelName,
			Content: ChatContent{
				Parts: []Part{
					{
						Text: input,
					},
				},
			},
		}
	}
	return &BatchEmbeddingRequest{
		Requests: requests,
	}
}

type UsageMetadata struct {
	PromptTokenCount     int `json:"promptTokenCount"`
	CandidatesTokenCount int `json:"candidatesTokenCount"`
	TotalTokenCount      int `json:"totalTokenCount"`
}

type ChatResponse struct {
	Candidates     []ChatCandidate    `json:"candidates"`
	PromptFeedback ChatPromptFeedback `json:"promptFeedback"`
	UsageMetadata  *UsageMetadata     `json:"usageMetadata"`
}
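
// GetResponseText returns the text of the first part of the first candidate,
// or an empty string when there is no candidate text. Safe to call on a nil receiver.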
func (g *ChatResponse) GetResponseText() string {
	if g == nil {
		return ""
	}
	if len(g.Candidates) > 0 && len(g.Candidates[0].Content.Parts) > 0 {
		return g.Candidates[0].Content.Parts[0].Text
	}
	return ""
}

type ChatCandidate struct {
	Content       ChatContent        `json:"content"`
	FinishReason  string             `json:"finishReason"`
	Index         int64              `json:"index"`
	SafetyRatings []ChatSafetyRating `json:"safetyRatings"`
}

type ChatSafetyRating struct {
	Category    string `json:"category"`
	Probability string `json:"probability"`
}

type ChatPromptFeedback struct {
	SafetyRatings []ChatSafetyRating `json:"safetyRatings"`
}
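
// getToolCalls converts the functionCall in the first part of a candidate into
// an OpenAI-style tool call with a synthetic id and JSON-encoded arguments.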
func getToolCalls(candidate *ChatCandidate) []model.Tool {
	var toolCalls []model.Tool
	item := candidate.Content.Parts[0]
	if item.FunctionCall == nil {
		return toolCalls
	}
	argsBytes, err := json.Marshal(item.FunctionCall.Arguments)
	if err != nil {
		logger.FatalLog("getToolCalls failed: " + err.Error())
		return toolCalls
	}
	toolCall := model.Tool{
		Id:   fmt.Sprintf("call_%s", random.GetUUID()),
		Type: "function",
		Function: model.Function{
			Arguments: string(argsBytes),
			Name:      item.FunctionCall.FunctionName,
		},
	}
	toolCalls = append(toolCalls, toolCall)
	return toolCalls
}
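
// responseGeminiChat2OpenAI maps a non-streaming Gemini ChatResponse onto an
// OpenAI chat.completion response, translating functionCall parts into tool
// calls and joining multi-part text with newlines.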
func responseGeminiChat2OpenAI(response *ChatResponse) *openai.TextResponse {
	fullTextResponse := openai.TextResponse{
		Id:      fmt.Sprintf("chatcmpl-%s", random.GetUUID()),
		Object:  "chat.completion",
		Created: helper.GetTimestamp(),
		Choices: make([]openai.TextResponseChoice, 0, len(response.Candidates)),
	}
	for i, candidate := range response.Candidates {
		choice := openai.TextResponseChoice{
			Index: i,
			Message: model.Message{
				Role: "assistant",
			},
			FinishReason: constant.StopFinishReason,
		}
		if len(candidate.Content.Parts) > 0 {
			if candidate.Content.Parts[0].FunctionCall != nil {
				choice.Message.ToolCalls = getToolCalls(&candidate)
			} else {
				var builder strings.Builder
				// Separate parts with newlines; the check must use the part index,
				// not the candidate index.
				for j, part := range candidate.Content.Parts {
					if j > 0 {
						builder.WriteString("\n")
					}
					builder.WriteString(part.Text)
				}
				choice.Message.Content = builder.String()
			}
		} else {
			choice.Message.Content = ""
			choice.FinishReason = candidate.FinishReason
		}
		fullTextResponse.Choices = append(fullTextResponse.Choices, choice)
	}
	return &fullTextResponse
}
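
// streamResponseGeminiChat2OpenAI maps one streamed Gemini chunk onto an OpenAI
// chat.completion.chunk, routing thought parts to reasoning_content and inline
// image data to a markdown data URI in the text delta.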
func streamResponseGeminiChat2OpenAI(geminiResponse *ChatResponse) *openai.ChatCompletionsStreamResponse {
	var choice openai.ChatCompletionsStreamResponseChoice
	if len(geminiResponse.Candidates) > 0 {
		var textBuilder strings.Builder
		var thinkingBuilder strings.Builder
		for _, part := range geminiResponse.Candidates[0].Content.Parts {
			if part.Thought {
				// Thinking/reasoning content — route to reasoning_content field
				thinkingBuilder.WriteString(part.Text)
			} else if part.Text != "" {
				textBuilder.WriteString(part.Text)
			} else if part.InlineData != nil && part.InlineData.Data != "" {
				// Inline image — embed as markdown data-URI so it passes through the SSE pipeline
				mimeType := part.InlineData.MimeType
				if mimeType == "" {
					mimeType = "image/png"
				}
				textBuilder.WriteString(fmt.Sprintf("![generated](data:%s;base64,%s)", mimeType, part.InlineData.Data))
			}
		}
		if textBuilder.Len() > 0 {
			choice.Delta.Content = textBuilder.String()
		}
		if thinkingBuilder.Len() > 0 {
			choice.Delta.ReasoningContent = thinkingBuilder.String()
		}
	}
	var response openai.ChatCompletionsStreamResponse
	response.Id = fmt.Sprintf("chatcmpl-%s", random.GetUUID())
	response.Created = helper.GetTimestamp()
	response.Object = "chat.completion.chunk"
	response.Model = "gemini"
	response.Choices = []openai.ChatCompletionsStreamResponseChoice{choice}
	return &response
}
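
// embeddingResponseGemini2OpenAI maps a Gemini batch embedding response onto
// the OpenAI embedding list format.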
func embeddingResponseGemini2OpenAI(response *EmbeddingResponse) *openai.EmbeddingResponse {
	openAIEmbeddingResponse := openai.EmbeddingResponse{
		Object: "list",
		Data:   make([]openai.EmbeddingResponseItem, 0, len(response.Embeddings)),
		Model:  "gemini-embedding",
		Usage:  model.Usage{TotalTokens: 0},
	}
	// Each item must carry its own index, matching the order of the inputs.
	for i, item := range response.Embeddings {
		openAIEmbeddingResponse.Data = append(openAIEmbeddingResponse.Data, openai.EmbeddingResponseItem{
			Object:    "embedding",
			Index:     i,
			Embedding: item.Values,
		})
	}
	return &openAIEmbeddingResponse
}
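
// StreamHandler relays a Gemini SSE stream to the client as OpenAI-style chunks.
// Each upstream line has the form "data: {json}"; lines without that prefix are
// skipped. Usage is taken from usageMetadata when present, otherwise estimated
// from the accumulated text.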
func StreamHandler(c *gin.Context, resp *http.Response) (*model.ErrorWithStatusCode, *model.Usage) {
	var usage *model.Usage
	responseText := ""
	scanner := bufio.NewScanner(resp.Body)
	// bufio.Scanner's default max token size (bufio.MaxScanTokenSize) is 64KB,
	// which is too small for inline image data (base64). Allocate 20MB to handle
	// large image payloads from Gemini image-generation models.
	const maxScanTokenSize = 20 * 1024 * 1024
	scanner.Buffer(make([]byte, maxScanTokenSize), maxScanTokenSize)
	scanner.Split(bufio.ScanLines)
	common.SetEventStreamHeaders(c)
	for scanner.Scan() {
		data := scanner.Text()
		data = strings.TrimSpace(data)
		if !strings.HasPrefix(data, "data: ") {
			continue
		}
		data = strings.TrimPrefix(data, "data: ")
		data = strings.TrimSuffix(data, "\"")
		var geminiResponse ChatResponse
		err := json.Unmarshal([]byte(data), &geminiResponse)
		if err != nil {
			logger.SysError("error unmarshalling stream response: " + err.Error())
			continue
		}
		// Extract usageMetadata from the last chunk that carries it.
		// This includes image/video/audio generation costs that cannot be
		// estimated from text tokenisation alone.
		if geminiResponse.UsageMetadata != nil {
			usage = &model.Usage{
				PromptTokens:     geminiResponse.UsageMetadata.PromptTokenCount,
				CompletionTokens: geminiResponse.UsageMetadata.CandidatesTokenCount,
				TotalTokens:      geminiResponse.UsageMetadata.TotalTokenCount,
			}
		}
		response := streamResponseGeminiChat2OpenAI(&geminiResponse)
		if response == nil {
			continue
		}
		// Accumulate text for fallback token estimation (used only when
		// usageMetadata is absent from the stream).
		if len(geminiResponse.Candidates) > 0 {
			for _, part := range geminiResponse.Candidates[0].Content.Parts {
				if part.InlineData == nil {
					responseText += part.Text
				}
			}
		}
		err = render.ObjectData(c, response)
		if err != nil {
			logger.SysError(err.Error())
		}
	}
	if err := scanner.Err(); err != nil {
		logger.SysError("error reading stream: " + err.Error())
	}
	render.Done(c)
	err := resp.Body.Close()
	if err != nil {
		return openai.ErrorWrapper(err, "close_response_body_failed", http.StatusInternalServerError), nil
	}
	// If upstream provided usageMetadata, use it (accurate, includes image costs).
	// Otherwise fall back to local tiktoken estimation on text-only content.
	if usage != nil {
		return nil, usage
	}
	return nil, openai.ResponseText2Usage(responseText, "", 0)
}
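
// Handler relays a non-streaming Gemini response, converting it to the OpenAI
// format and computing completion tokens locally from the returned text.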
func Handler(c *gin.Context, resp *http.Response, promptTokens int, modelName string) (*model.ErrorWithStatusCode, *model.Usage) {
	responseBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return openai.ErrorWrapper(err, "read_response_body_failed", http.StatusInternalServerError), nil
	}
	err = resp.Body.Close()
	if err != nil {
		return openai.ErrorWrapper(err, "close_response_body_failed", http.StatusInternalServerError), nil
	}
	var geminiResponse ChatResponse
	err = json.Unmarshal(responseBody, &geminiResponse)
	if err != nil {
		return openai.ErrorWrapper(err, "unmarshal_response_body_failed", http.StatusInternalServerError), nil
	}
	if len(geminiResponse.Candidates) == 0 {
		return &model.ErrorWithStatusCode{
			Error: model.Error{
				Message: "No candidates returned",
				Type:    "server_error",
				Param:   "",
				Code:    500,
			},
			StatusCode: resp.StatusCode,
		}, nil
	}
	fullTextResponse := responseGeminiChat2OpenAI(&geminiResponse)
	fullTextResponse.Model = modelName
	completionTokens := openai.CountTokenText(geminiResponse.GetResponseText(), modelName)
	usage := model.Usage{
		PromptTokens:     promptTokens,
		CompletionTokens: completionTokens,
		TotalTokens:      promptTokens + completionTokens,
	}
	fullTextResponse.Usage = usage
	jsonResponse, err := json.Marshal(fullTextResponse)
	if err != nil {
		return openai.ErrorWrapper(err, "marshal_response_body_failed", http.StatusInternalServerError), nil
	}
	c.Writer.Header().Set("Content-Type", "application/json")
	c.Writer.WriteHeader(resp.StatusCode)
	if _, err = c.Writer.Write(jsonResponse); err != nil {
		logger.SysError("error writing response: " + err.Error())
	}
	return nil, &usage
}
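
// EmbeddingHandler relays a Gemini embedding response, surfacing upstream
// errors and converting successful results to the OpenAI embedding format.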
func EmbeddingHandler(c *gin.Context, resp *http.Response) (*model.ErrorWithStatusCode, *model.Usage) {
	var geminiEmbeddingResponse EmbeddingResponse
	responseBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return openai.ErrorWrapper(err, "read_response_body_failed", http.StatusInternalServerError), nil
	}
	err = resp.Body.Close()
	if err != nil {
		return openai.ErrorWrapper(err, "close_response_body_failed", http.StatusInternalServerError), nil
	}
	err = json.Unmarshal(responseBody, &geminiEmbeddingResponse)
	if err != nil {
		return openai.ErrorWrapper(err, "unmarshal_response_body_failed", http.StatusInternalServerError), nil
	}
	if geminiEmbeddingResponse.Error != nil {
		return &model.ErrorWithStatusCode{
			Error: model.Error{
				Message: geminiEmbeddingResponse.Error.Message,
				Type:    "gemini_error",
				Param:   "",
				Code:    geminiEmbeddingResponse.Error.Code,
			},
			StatusCode: resp.StatusCode,
		}, nil
	}
	fullTextResponse := embeddingResponseGemini2OpenAI(&geminiEmbeddingResponse)
	jsonResponse, err := json.Marshal(fullTextResponse)
	if err != nil {
		return openai.ErrorWrapper(err, "marshal_response_body_failed", http.StatusInternalServerError), nil
	}
	c.Writer.Header().Set("Content-Type", "application/json")
	c.Writer.WriteHeader(resp.StatusCode)
	if _, err = c.Writer.Write(jsonResponse); err != nil {
		logger.SysError("error writing response: " + err.Error())
	}
	return nil, &fullTextResponse.Usage
}