diff --git a/src/api/providers/fetchers/__tests__/ollama.test.ts b/src/api/providers/fetchers/__tests__/ollama.test.ts index 59663bc4959..6fe3f090cb5 100644 --- a/src/api/providers/fetchers/__tests__/ollama.test.ts +++ b/src/api/providers/fetchers/__tests__/ollama.test.ts @@ -114,6 +114,123 @@ describe("Ollama Fetcher", () => { expect(parsedModel!.supportsImages).toBe(true) expect(parsedModel!.contextWindow).toBeGreaterThan(0) }) + + it("should detect vision via details.families when capabilities omits vision", () => { + const modelData = { + ...ollamaModelsData["qwen3-2to16:latest"], + details: { + ...ollamaModelsData["qwen3-2to16:latest"].details, + families: ["gemma4", "clip"], + }, + capabilities: ["completion", "tools"], // no "vision" + } + + const parsedModel = parseOllamaModel(modelData as any) + + expect(parsedModel).not.toBeNull() + expect(parsedModel!.supportsImages).toBe(true) + }) + + it("should detect vision via model_info keys when capabilities and families lack vision indicators", () => { + const modelData = { + ...ollamaModelsData["qwen3-2to16:latest"], + details: { + ...ollamaModelsData["qwen3-2to16:latest"].details, + families: ["gemma4"], + }, + model_info: { + ...ollamaModelsData["qwen3-2to16:latest"].model_info, + "gemma4_vision_encoder.block_count": 27, + "gemma4_vision_encoder.embedding_length": 1152, + }, + capabilities: ["completion", "tools"], // no "vision" + } + + const parsedModel = parseOllamaModel(modelData as any) + + expect(parsedModel).not.toBeNull() + expect(parsedModel!.supportsImages).toBe(true) + }) + + it("should detect vision via siglip family in details.families", () => { + const modelData = { + ...ollamaModelsData["qwen3-2to16:latest"], + details: { + ...ollamaModelsData["qwen3-2to16:latest"].details, + families: ["gemma4", "siglip"], + }, + capabilities: ["completion", "tools"], + } + + const parsedModel = parseOllamaModel(modelData as any) + + expect(parsedModel).not.toBeNull() + 
expect(parsedModel!.supportsImages).toBe(true) + }) + + it("should detect vision via mmproj family in details.families", () => { + const modelData = { + ...ollamaModelsData["qwen3-2to16:latest"], + details: { + ...ollamaModelsData["qwen3-2to16:latest"].details, + families: ["llama", "mmproj"], + }, + capabilities: ["completion", "tools"], + } + + const parsedModel = parseOllamaModel(modelData as any) + + expect(parsedModel).not.toBeNull() + expect(parsedModel!.supportsImages).toBe(true) + }) + + it("should detect vision via mllama family in details.families", () => { + const modelData = { + ...ollamaModelsData["qwen3-2to16:latest"], + details: { + ...ollamaModelsData["qwen3-2to16:latest"].details, + families: ["llama", "mllama"], + }, + capabilities: ["completion", "tools"], + } + + const parsedModel = parseOllamaModel(modelData as any) + + expect(parsedModel).not.toBeNull() + expect(parsedModel!.supportsImages).toBe(true) + }) + + it("should not detect vision when no indicators are present", () => { + const modelData = { + ...ollamaModelsData["qwen3-2to16:latest"], + details: { + ...ollamaModelsData["qwen3-2to16:latest"].details, + families: ["qwen3"], + }, + capabilities: ["completion", "tools"], + } + + const parsedModel = parseOllamaModel(modelData as any) + + expect(parsedModel).not.toBeNull() + expect(parsedModel!.supportsImages).toBe(false) + }) + + it("should handle case-insensitive family matching for vision detection", () => { + const modelData = { + ...ollamaModelsData["qwen3-2to16:latest"], + details: { + ...ollamaModelsData["qwen3-2to16:latest"].details, + families: ["gemma4", "CLIP"], + }, + capabilities: ["completion", "tools"], + } + + const parsedModel = parseOllamaModel(modelData as any) + + expect(parsedModel).not.toBeNull() + expect(parsedModel!.supportsImages).toBe(true) + }) }) describe("getOllamaModels", () => { diff --git a/src/api/providers/fetchers/ollama.ts b/src/api/providers/fetchers/ollama.ts index ba5b1c1d5d9..45e9cc1e3b0 100644 --- 
a/src/api/providers/fetchers/ollama.ts +++ b/src/api/providers/fetchers/ollama.ts @@ -37,6 +37,46 @@ type OllamaModelsResponse = z.infer type OllamaModelInfoResponse = z.infer +/** + * Known vision-related family names that appear in `details.families` for + * multimodal models in Ollama. When a model's `capabilities` array omits + * "vision" (as happens with some third-party quants like unsloth), we fall + * back to checking these families. Entries are lowercase; lookups lowercase first. + */ +const VISION_FAMILIES = new Set(["clip", "siglip", "mmproj", "mllama"]) + +/** + * Case-insensitive regex matched against `model_info` keys to detect a vision + * encoder even when `capabilities` and `details.families` are both silent. NOTE(review): unanchored substring match; confirm it cannot hit non-vision keys. + */ +const VISION_MODEL_INFO_PATTERN = /vision|clip|siglip|mmproj|image_encoder/i + +/** + * Determines whether the model supports images (first match wins, else false) by checking: + * 1. The authoritative `capabilities` array (preferred). + * 2. `details.families` for known vision encoder families. + * 3. `model_info` keys for vision-related architecture indicators. + */ +const detectVisionSupport = (rawModel: OllamaModelInfoResponse): boolean => { + // 1. Authoritative check: capabilities lists "vision" + if (rawModel.capabilities?.includes("vision")) { + return true + } + + // 2. Families check (case-insensitive; families may be absent) + const families = rawModel.details.families + if (families?.some((f) => VISION_FAMILIES.has(f.toLowerCase()))) { + return true + } + + // 3. 
model_info key check + if (Object.keys(rawModel.model_info).some((k) => VISION_MODEL_INFO_PATTERN.test(k))) { + return true + } + + return false +} + export const parseOllamaModel = (rawModel: OllamaModelInfoResponse): ModelInfo | null => { const contextKey = Object.keys(rawModel.model_info).find((k) => k.includes("context_length")) const contextWindow = @@ -52,7 +92,7 @@ export const parseOllamaModel = (rawModel: OllamaModelInfoResponse): ModelInfo | description: `Family: ${rawModel.details.family}, Context: ${contextWindow}, Size: ${rawModel.details.parameter_size}`, contextWindow: contextWindow || ollamaDefaultModelInfo.contextWindow, supportsPromptCache: true, - supportsImages: rawModel.capabilities?.includes("vision"), + supportsImages: detectVisionSupport(rawModel), maxTokens: contextWindow || ollamaDefaultModelInfo.contextWindow, })