{
  "openapi": "3.1.0",
  "info": {
    "title": "LLM Gateway (llama-swap)",
    "description": "OpenAI-compatible chat completions API with 50+ GGUF models",
    "version": "1.0.0",
    "contact": {
      "name": "Haiven Server",
      "url": "https://haiven.site"
    },
    "license": {
      "name": "MIT"
    }
  },
  "servers": [
    {
      "url": "http://vllm-medgemma:8000",
      "description": "Internal Docker network endpoint"
    },
    {
      "url": "https://medgemma.haiven.site",
      "description": "Public HTTPS endpoint"
    }
  ],
  "tags": [
    {
      "name": "Chat",
      "description": "Chat completion endpoints"
    },
    {
      "name": "Models",
      "description": "Model management"
    },
    {
      "name": "Health",
      "description": "Service health checks"
    }
  ],
  "paths": {
    "/v1/chat/completions": {
      "post": {
        "summary": "Create chat completion",
        "description": "Generate a response from the model given a conversation.\nSupports both streaming and non-streaming responses.\n",
        "operationId": "createChatCompletion",
        "tags": ["Chat"],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/ChatCompletionRequest"
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Successful response",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ChatCompletion"
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ChatCompletionChunk"
                }
              }
            }
          },
          "400": {
            "description": "Invalid request",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                }
              }
            }
          },
          "404": {
            "description": "Model not found",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                }
              }
            }
          },
          "500": {
            "description": "Internal server error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                }
              }
            }
          }
        }
      }
    },
    "/v1/models": {
      "get": {
        "summary": "List available models",
        "description": "Retrieve a list of all available models loaded on the gateway",
        "operationId": "listModels",
        "tags": ["Models"],
        "responses": {
          "200": {
            "description": "List of models",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ModelList"
                }
              }
            }
          },
          "500": {
            "description": "Internal server error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                }
              }
            }
          }
        }
      }
    },
    "/health": {
      "get": {
        "summary": "Health check",
        "description": "Check if the service is healthy and ready to serve requests",
        "operationId": "healthCheck",
        "tags": ["Health"],
        "responses": {
          "200": {
            "description": "Service is healthy",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/HealthResponse"
                }
              }
            }
          },
          "503": {
            "description": "Service is not healthy",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/HealthResponse"
                }
              }
            }
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "ChatCompletionRequest": {
        "type": "object",
        "required": ["model", "messages"],
        "properties": {
          "model": {
            "type": "string",
            "description": "ID of the model to use (e.g., qwen2.5-coder-32b)",
            "example": "qwen2.5-coder-32b"
          },
          "messages": {
            "type": "array",
            "description": "List of messages in the conversation",
            "minItems": 1,
            "items": {
              "$ref": "#/components/schemas/ChatMessage"
            }
          },
          "temperature": {
            "type": "number",
            "description": "Controls randomness (0-2). Higher values = more random",
            "default": 0.7,
            "minimum": 0,
            "maximum": 2
          },
          "top_p": {
            "type": "number",
            "description": "Nucleus sampling parameter (0-1)",
            "default": 0.9,
            "minimum": 0,
            "maximum": 1
          },
          "top_k": {
            "type": "integer",
            "description": "Top-k sampling parameter",
            "default": 40,
            "minimum": 0
          },
          "max_tokens": {
            "type": "integer",
            "description": "Maximum tokens in the response",
            "default": 2048,
            "minimum": 1
          },
          "stream": {
            "type": "boolean",
            "description": "If true, returns a stream of tokens",
            "default": false
          },
          "stop": {
            "oneOf": [
              {
                "type": "string"
              },
              {
                "type": "array",
                "items": {
                  "type": "string"
                }
              }
            ],
            "description": "Stop sequence(s) where the model should stop generating"
          },
          "presence_penalty": {
            "type": "number",
            "description": "Presence penalty (-2 to 2)",
            "default": 0,
            "minimum": -2,
            "maximum": 2
          },
          "frequency_penalty": {
            "type": "number",
            "description": "Frequency penalty (-2 to 2)",
            "default": 0,
            "minimum": -2,
            "maximum": 2
          }
        }
      },
      "ChatMessage": {
        "type": "object",
        "required": ["role", "content"],
        "properties": {
          "role": {
            "type": "string",
            "enum": ["system", "user", "assistant"],
            "description": "The role of the message author"
          },
          "content": {
            "type": "string",
            "description": "The content of the message",
            "minLength": 1
          }
        }
      },
      "ChatCompletion": {
        "type": "object",
        "description": "Chat completion response",
        "properties": {
          "id": {
            "type": "string",
            "description": "Unique ID for this response",
            "example": "chatcmpl-123456789"
          },
          "object": {
            "type": "string",
            "enum": ["chat.completion"],
            "description": "Object type"
          },
          "created": {
            "type": "integer",
            "description": "Unix timestamp of response creation",
            "example": 1677652288
          },
          "model": {
            "type": "string",
            "description": "Model used",
            "example": "qwen2.5-coder-32b"
          },
          "choices": {
            "type": "array",
            "minItems": 1,
            "items": {
              "$ref": "#/components/schemas/ChatChoice"
            }
          },
          "usage": {
            "$ref": "#/components/schemas/Usage"
          }
        }
      },
      "ChatChoice": {
        "type": "object",
        "description": "Chat completion choice",
        "properties": {
          "index": {
            "type": "integer",
            "description": "Choice index"
          },
          "message": {
            "$ref": "#/components/schemas/ChatMessage"
          },
          "finish_reason": {
            "type": "string",
            "enum": ["stop", "length", "content_filter", "tool_calls"],
            "description": "Reason the model stopped generating"
          }
        }
      },
      "ChatCompletionChunk": {
        "type": "object",
        "description": "Streaming chat completion chunk",
        "properties": {
          "id": {
            "type": "string",
            "description": "Unique ID for this response"
          },
          "object": {
            "type": "string",
            "enum": ["chat.completion.chunk"],
            "description": "Object type"
          },
          "created": {
            "type": "integer",
            "description": "Unix timestamp of response creation"
          },
          "model": {
            "type": "string",
            "description": "Model used"
          },
          "choices": {
            "type": "array",
            "items": {
              "type": "object",
              "properties": {
                "index": {
                  "type": "integer"
                },
                "delta": {
                  "type": "object",
                  "properties": {
                    "role": {
                      "type": "string"
                    },
                    "content": {
                      "type": "string"
                    }
                  }
                },
                "finish_reason": {
                  "type": ["string", "null"]
                }
              }
            }
          }
        }
      },
      "ModelList": {
        "type": "object",
        "description": "List of available models",
        "properties": {
          "object": {
            "type": "string",
            "enum": ["list"],
            "description": "Object type"
          },
          "data": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Model"
            }
          }
        }
      },
      "Model": {
        "type": "object",
        "description": "Model information",
        "properties": {
          "id": {
            "type": "string",
            "description": "Model identifier",
            "example": "qwen2.5-coder-32b"
          },
          "object": {
            "type": "string",
            "enum": ["model"],
            "description": "Object type"
          },
          "created": {
            "type": "integer",
            "description": "Unix timestamp of model load"
          },
          "owned_by": {
            "type": "string",
            "description": "Organization that owns the model",
            "example": "haiven"
          },
          "permission": {
            "type": "array",
            "items": {
              "type": "object"
            },
            "description": "Permission information"
          }
        }
      },
      "Usage": {
        "type": "object",
        "description": "Token usage information",
        "properties": {
          "prompt_tokens": {
            "type": "integer",
            "description": "Number of tokens in the prompt"
          },
          "completion_tokens": {
            "type": "integer",
            "description": "Number of tokens in the completion"
          },
          "total_tokens": {
            "type": "integer",
            "description": "Total number of tokens used"
          }
        }
      },
      "HealthResponse": {
        "type": "object",
        "description": "Service health status",
        "properties": {
          "status": {
            "type": "string",
            "enum": ["healthy", "degraded", "unhealthy"],
            "description": "Overall health status"
          },
          "timestamp": {
            "type": "string",
            "format": "date-time",
            "description": "Health check timestamp"
          },
          "uptime_seconds": {
            "type": "number",
            "description": "Service uptime in seconds"
          },
          "models_loaded": {
            "type": "integer",
            "description": "Number of models currently loaded"
          },
          "gpu_memory_available": {
            "type": "integer",
            "description": "Available GPU memory in bytes"
          },
          "checks": {
            "type": "object",
            "properties": {
              "database": {
                "type": "string",
                "enum": ["ok", "failed"]
              },
              "models": {
                "type": "string",
                "enum": ["ok", "failed"]
              },
              "gpu": {
                "type": "string",
                "enum": ["ok", "failed"]
              }
            }
          }
        }
      },
      "ErrorResponse": {
        "type": "object",
        "description": "Error response",
        "properties": {
          "error": {
            "type": "object",
            "properties": {
              "message": {
                "type": "string",
                "description": "Error message"
              },
              "type": {
                "type": "string",
                "description": "Error type",
                "example": "invalid_request_error"
              },
              "param": {
                "type": ["string", "null"],
                "description": "Parameter that caused the error"
              },
              "code": {
                "type": ["string", "null"],
                "description": "Error code"
              }
            }
          }
        }
      }
    }
  }
}