{
  "openapi": "3.0.3",
  "info": {
    "title": "vllm-gemma4-26b API",
    "description": "Gemma 4 26B-A4B FP8 Dynamic inference via vLLM.\n\nMoE architecture (128 experts, 8 active + 1 shared per token, 25.2B total / 3.8B active).\nSupports vision (image + video), tool calling (gemma4 parser), and thinking mode.\n256K context with only 5.2 GB KV cache at full context.\n\nThinking is ON by default. Disable with: `chat_template_kwargs: {enable_thinking: false}`\n\nKnown bugs: #39392 (pad tokens), #39468 (tool call corruption).\n",
    "version": "1.0.0",
    "contact": {
      "name": "Haiven Infrastructure"
    }
  },
  "servers": [
    {
      "url": "https://gemma4-26b.haiven.site",
      "description": "Direct access (via Traefik)"
    },
    {
      "url": "https://llm.haiven.site/v1",
      "description": "Via LiteLLM gateway (recommended)"
    }
  ],
  "paths": {
    "/health": {
      "get": {
        "summary": "Health check",
        "operationId": "healthCheck",
        "responses": {
          "200": {
            "description": "Service healthy"
          }
        }
      }
    },
    "/v1/models": {
      "get": {
        "summary": "List models",
        "operationId": "listModels",
        "responses": {
          "200": {
            "description": "Available models"
          }
        }
      }
    },
    "/v1/chat/completions": {
      "post": {
        "summary": "Chat completions",
        "operationId": "chatCompletions",
        "description": "OpenAI-compatible chat completions endpoint.\nSupports text, vision (image/video), tool calling, streaming, and JSON mode.\nModel name: gemma4-26b\n",
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "type": "object",
                "required": [
                  "model",
                  "messages"
                ],
                "properties": {
                  "model": {
                    "type": "string",
                    "example": "gemma4-26b"
                  },
                  "messages": {
                    "type": "array",
                    "items": {
                      "type": "object"
                    }
                  },
                  "temperature": {
                    "type": "number",
                    "default": 1.0
                  },
                  "max_tokens": {
                    "type": "integer"
                  },
                  "stream": {
                    "type": "boolean",
                    "default": false
                  },
                  "tools": {
                    "type": "array",
                    "description": "Function definitions for tool calling",
                    "items": {
                      "type": "object"
                    }
                  },
                  "response_format": {
                    "type": "object",
                    "description": "JSON mode (type json_object)"
                  },
                  "chat_template_kwargs": {
                    "type": "object",
                    "description": "Pass {enable_thinking: false} to disable thinking"
                  }
                }
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Chat completion response"
          }
        }
      }
    },
    "/metrics": {
      "get": {
        "summary": "Prometheus metrics",
        "operationId": "getMetrics",
        "responses": {
          "200": {
            "description": "vLLM metrics in Prometheus format"
          }
        }
      }
    }
  }
}