{"openapi":"3.0.3","info":{"title":"vllm-gemma4-e4b API","description":"Gemma 4 E4B BF16 inference via vLLM.\n\nDense model with Per-Layer Embeddings (4.5B effective / 8B total).\nThe only Gemma 4 model with audio input support.\n\nAudio specs: 16 kHz, mono, 32-bit float, max 30 seconds, ~25 tokens per second of audio.\nAlso supports image and video (max 60 seconds) input.\n\nNo reasoning parser, no tool calling.\n","version":"1.0.0","contact":{"name":"Haiven Infrastructure"}},"servers":[{"url":"https://gemma4-e4b.haiven.site","description":"Direct access (via Traefik)"},{"url":"https://llm.haiven.site/v1","description":"Via LiteLLM gateway (recommended)"}],"paths":{"/health":{"get":{"summary":"Health check","operationId":"healthCheck","responses":{"200":{"description":"Service healthy"}}}},"/v1/models":{"get":{"summary":"List models","operationId":"listModels","responses":{"200":{"description":"Available models"}}}},"/v1/chat/completions":{"post":{"summary":"Chat completions","operationId":"chatCompletions","description":"OpenAI-compatible chat completions endpoint.\nSupports text, vision (image/video), and audio input.\nAudio must be 16 kHz mono WAV, max 30 seconds.\nModel name: gemma4-e4b\n","requestBody":{"required":true,"content":{"application/json":{"schema":{"type":"object","required":["model","messages"],"properties":{"model":{"type":"string","example":"gemma4-e4b"},"messages":{"type":"array","items":{"type":"object"}},"temperature":{"type":"number","default":1.0},"max_tokens":{"type":"integer"},"stream":{"type":"boolean","default":false}}}}}},"responses":{"200":{"description":"Chat completion response"}}}},"/metrics":{"get":{"summary":"Prometheus metrics","operationId":"getMetrics","responses":{"200":{"description":"vLLM metrics in Prometheus format"}}}}}}