{"openapi":"3.1.0","info":{"title":"haiven-transcribe API","description":"Tri-engine speech-to-text service with NVIDIA Canary-1b-v2, Parakeet-TDT-0.6B-v2, and Whisper Large v3 Turbo. Provides OpenAI-compatible audio transcription and translation endpoints with automatic 3-tier failover.","version":"1.0.0","contact":{"name":"Haiven Infrastructure"},"license":{"name":"MIT","url":"https://opensource.org/licenses/MIT"}},"servers":[{"url":"http://localhost:8000","description":"Local development"},{"url":"http://haiven-transcribe:8000","description":"Internal Docker network"},{"url":"https://transcribe.haiven.site","description":"Production (Traefik reverse proxy)"},{"url":"https://ai.haiven.site","description":"AI gateway (STT routes only)"}],"tags":[{"name":"Health","description":"Service health and status checks"},{"name":"Transcription","description":"Speech-to-text conversion"},{"name":"Translation","description":"Audio translation to English"},{"name":"Models","description":"Model listing and information"}],"x-implementation-notes":{"audio-preprocessing":"All uploaded audio is automatically normalized to mono 16kHz PCM_16 WAV before being passed to any engine. Supported input: MP3, MP4, WAV, FLAC, OGG, WebM at any sample rate or channel configuration.","timestamp-support":"Only Whisper Turbo returns segment-level timestamps. Canary and Parakeet return empty segments arrays. Word-level timestamps are not implemented.","temperature-support":"The temperature parameter is accepted for API compatibility but not forwarded to any engine.","auto-model-selection":"The auto model uses fixed failover order (not language-based routing). Transcription: Canary → Parakeet → Whisper Turbo. Translation: Canary → Whisper Turbo (Parakeet skipped).","diarization":"Speaker diarization uses pyannote.audio 3.1 as a post-processing step. It runs on the same audio file after transcription, producing speaker turns that are merged with transcript segments via midpoint matching. 
Only available when ENABLE_DIARIZATION=true. Models are loaded locally from /models/pyannote/ (no network calls). Requires ~2GB of additional VRAM."},"paths":{"/":{"get":{"summary":"Root info endpoint","operationId":"getRootInfo","tags":["Health"],"description":"Service information endpoint showing version, status, and configured models.","responses":{"200":{"description":"Service information","content":{"application/json":{"schema":{"type":"object","properties":{"service":{"type":"string","example":"haiven-transcribe"},"version":{"type":"string","example":"1.0.0"},"status":{"type":"string","example":"running"},"models":{"type":"object","properties":{"primary":{"type":"string","example":"canary"},"secondary":{"type":"string","example":"parakeet"},"tertiary":{"type":"string","example":"whisper-turbo"}}}}}}}}}}},"/health":{"get":{"summary":"Liveness probe","operationId":"getHealth","tags":["Health"],"description":"Basic liveness check for Kubernetes/Docker health monitoring.","responses":{"200":{"description":"Service is alive","content":{"application/json":{"schema":{"type":"object","properties":{"status":{"type":"string","example":"ok"}}}}}}}}},"/ready":{"get":{"summary":"Readiness probe with model loading status","operationId":"getReadiness","tags":["Health"],"description":"Readiness check showing which models are loaded and service readiness state. 
Returns \"ready\" if all models loaded, \"degraded\" if some models unavailable.","responses":{"200":{"description":"Service readiness status","content":{"application/json":{"schema":{"type":"object","properties":{"canary":{"type":"string","enum":["loaded","not_loaded"],"example":"loaded"},"parakeet":{"type":"string","enum":["loaded","not_loaded"],"example":"loaded"},"whisper_turbo":{"type":"string","enum":["loaded","not_loaded"],"example":"loaded"},"models_loaded":{"type":"integer","example":3,"description":"Number of models successfully loaded"},"status":{"type":"string","enum":["ready","degraded"],"example":"ready"}}}}}}}}},"/metrics":{"get":{"summary":"Prometheus metrics endpoint","operationId":"getMetrics","tags":["Health"],"description":"Prometheus-compatible metrics in exposition format.","responses":{"200":{"description":"Prometheus metrics","content":{"text/plain":{"schema":{"type":"string","example":"# HELP transcribe_requests_total Total transcription/translation requests\n# TYPE transcribe_requests_total counter\ntranscribe_requests_total{model=\"canary\",language=\"en\",status=\"success\"} 42\n"}}}}}}},"/v1/audio/transcriptions":{"post":{"summary":"Transcribe audio to text","operationId":"transcribeAudio","tags":["Transcription"],"description":"OpenAI-compatible audio transcription endpoint. Supports three transcription engines (Canary, Parakeet, Whisper Turbo) with automatic failover. The \"auto\" model (default) tries all engines in order: Canary → Parakeet → Whisper Turbo. 
Supports multiple audio formats and response formats.","requestBody":{"required":true,"content":{"multipart/form-data":{"schema":{"$ref":"#/components/schemas/TranscriptionRequest"},"examples":{"auto_json":{"summary":"Auto-select engine (JSON response)","value":{"model":"auto","response_format":"json"}},"canary_verbose":{"summary":"Canary engine with verbose output","value":{"model":"canary","language":"en","response_format":"verbose_json"}},"parakeet_text":{"summary":"Parakeet engine (text response)","value":{"model":"parakeet","response_format":"text"}},"diarized_verbose":{"summary":"Diarization with verbose output","value":{"model":"whisper-turbo","response_format":"verbose_json","diarize":true,"max_speakers":4}}}}}},"responses":{"200":{"description":"Transcription successful","content":{"application/json":{"schema":{"oneOf":[{"$ref":"#/components/schemas/TranscriptionResponseJson"},{"$ref":"#/components/schemas/TranscriptionResponseVerbose"}]},"examples":{"json_response":{"summary":"JSON response format","value":{"text":"This is the transcribed text from the audio file."}},"verbose_json_response":{"summary":"Verbose JSON response format (segments are populated only by Whisper Turbo)","value":{"task":"transcribe","language":"en","duration":12.5,"text":"This is the transcribed text from the audio file.","segments":[{"id":0,"start":0.0,"end":5.0,"text":"This is the transcribed text"},{"id":1,"start":5.0,"end":12.5,"text":"from the audio file."}],"engine":"whisper-turbo"}}}},"text/plain":{"schema":{"type":"string","example":"This is the transcribed text from the audio file."}}}},"400":{"$ref":"#/components/responses/BadRequest"},"500":{"description":"All transcription engines failed","content":{"application/json":{"schema":{"type":"object","properties":{"error":{"type":"string","example":"All transcription engines failed"}}}}}},"503":{"description":"Requested model not 
available","content":{"application/json":{"schema":{"type":"object","properties":{"error":{"type":"string","example":"Model not available"}}}}}}}}},"/v1/audio/translations":{"post":{"summary":"Translate audio to English","operationId":"translateAudio","tags":["Translation"],"description":"Transcribe and translate audio to English using Canary or Whisper Turbo. Parakeet is explicitly rejected as it is an English-only model. The \"auto\" model (default) tries: Canary → Whisper Turbo.","requestBody":{"required":true,"content":{"multipart/form-data":{"schema":{"$ref":"#/components/schemas/TranslationRequest"},"examples":{"auto_translate":{"summary":"Auto-select engine for translation","value":{"model":"auto","response_format":"json"}},"canary_translate":{"summary":"Canary engine translation","value":{"model":"canary","response_format":"verbose_json"}}}}}},"responses":{"200":{"description":"Translation successful","content":{"application/json":{"schema":{"oneOf":[{"$ref":"#/components/schemas/TranscriptionResponseJson"},{"$ref":"#/components/schemas/TranscriptionResponseVerbose"}]}},"text/plain":{"schema":{"type":"string"}}}},"400":{"description":"Invalid model or Parakeet requested","content":{"application/json":{"schema":{"type":"object","properties":{"error":{"type":"string","example":"Parakeet does not support translation (English-only model)"}}}}}},"500":{"$ref":"#/components/responses/ServerError"}}}},"/v1/models":{"get":{"summary":"List available transcription models","operationId":"listModels","tags":["Models"],"description":"List all available transcription models with their loading status. 
Compatible with OpenAI models API format.","responses":{"200":{"description":"List of available models","content":{"application/json":{"schema":{"type":"object","properties":{"object":{"type":"string","example":"list"},"data":{"type":"array","items":{"$ref":"#/components/schemas/ModelInfo"}}}},"example":{"object":"list","data":[{"id":"canary","object":"model","created":1704067200,"owned_by":"haiven-transcribe","name":"NVIDIA Canary-1b-v2","loaded":true},{"id":"parakeet","object":"model","created":1704067200,"owned_by":"haiven-transcribe","name":"NVIDIA Parakeet-TDT-0.6B-v2","loaded":true},{"id":"whisper-turbo","object":"model","created":1704067200,"owned_by":"haiven-transcribe","name":"Whisper Large v3 Turbo","loaded":true},{"id":"auto","object":"model","created":1704067200,"owned_by":"haiven-transcribe","name":"Auto (3-tier failover)","loaded":true}]}}}}}}}},"components":{"schemas":{"TranscriptionRequest":{"type":"object","required":["file"],"properties":{"file":{"type":"string","format":"binary","description":"Audio file to transcribe (mp3, mp4, wav, flac, ogg, webm)"},"model":{"type":"string","enum":["canary","parakeet","whisper-turbo","auto"],"default":"auto","description":"Transcription engine to use (auto tries all in order)"},"language":{"type":"string","description":"ISO 639-1 language code (e.g., 'en', 'es', 'fr')","example":"en"},"response_format":{"type":"string","enum":["json","text","verbose_json"],"default":"json","description":"Format of the transcription response"},"temperature":{"type":"number","minimum":0.0,"maximum":1.0,"default":0.0,"description":"Accepted for OpenAI API compatibility but not currently forwarded to any engine. All models use default inference settings regardless of this value."},"timestamp_granularities":{"type":"array","items":{"type":"string","enum":["word","segment"]},"description":"Accepted for OpenAI API compatibility but not currently implemented. This parameter has no effect on the response. 
Whisper Turbo always returns segment-level timestamps regardless; Canary and Parakeet return empty segments arrays. Word-level timestamps are not supported.","example":["segment"]},"diarize":{"type":"boolean","default":false,"description":"Enable speaker diarization. When true, the response includes speaker_turns and speaker labels on segments (verbose_json only). Requires pyannote.audio to be loaded (ENABLE_DIARIZATION=true)."},"min_speakers":{"type":"integer","minimum":1,"maximum":20,"description":"Minimum expected number of speakers. Providing this hint improves diarization accuracy. Only used when diarize=true."},"max_speakers":{"type":"integer","minimum":1,"maximum":20,"description":"Maximum expected number of speakers. Providing this hint improves diarization accuracy. Only used when diarize=true."}}},"TranslationRequest":{"type":"object","required":["file"],"properties":{"file":{"type":"string","format":"binary","description":"Audio file to translate"},"model":{"type":"string","enum":["canary","whisper-turbo","auto"],"default":"auto","description":"Translation engine (Parakeet not supported)"},"response_format":{"type":"string","enum":["json","text","verbose_json"],"default":"json","description":"Format of the translation response"},"temperature":{"type":"number","minimum":0.0,"maximum":1.0,"default":0.0,"description":"Accepted for OpenAI API compatibility but not currently forwarded to any engine. 
All models use default inference settings regardless of this value."}}},"TranscriptionResponseJson":{"type":"object","description":"Basic JSON transcription response","properties":{"text":{"type":"string","description":"Transcribed text","example":"This is the transcribed text from the audio file."}}},"TranscriptionResponseVerbose":{"type":"object","description":"Verbose JSON response with metadata and segments","properties":{"task":{"type":"string","enum":["transcribe","translate"],"example":"transcribe"},"language":{"type":"string","description":"Detected or specified language code","example":"en"},"duration":{"type":"number","description":"Audio duration in seconds","example":12.5},"text":{"type":"string","description":"Full transcribed text","example":"This is the transcribed text from the audio file."},"segments":{"type":"array","description":"Time-aligned transcript segments. Only populated by Whisper Turbo. Canary and Parakeet return an empty array.","items":{"type":"object","properties":{"id":{"type":"integer","example":0},"start":{"type":"number","description":"Segment start time (seconds)","example":0.0},"end":{"type":"number","description":"Segment end time (seconds)","example":5.0},"text":{"type":"string","example":"This is the transcribed text"},"speaker":{"type":"string","description":"Speaker identifier (only present when diarize=true)","example":"SPEAKER_00"}}}},"engine":{"type":"string","description":"Engine that performed the transcription","example":"canary"},"speaker_turns":{"type":"array","description":"Raw speaker turns from diarization. Only present when diarize=true.","items":{"$ref":"#/components/schemas/SpeakerTurn"}},"speakers_count":{"type":"integer","description":"Number of unique speakers detected. 
Only present when diarize=true.","example":2}}},"ModelInfo":{"type":"object","description":"Model information object","properties":{"id":{"type":"string","description":"Model identifier","example":"canary"},"object":{"type":"string","example":"model"},"created":{"type":"integer","description":"Unix timestamp of model creation","example":1704067200},"owned_by":{"type":"string","example":"haiven-transcribe"},"name":{"type":"string","description":"Human-readable model name","example":"NVIDIA Canary-1b-v2"},"loaded":{"type":"boolean","description":"Whether model is currently loaded","example":true}}},"SpeakerTurn":{"type":"object","description":"A speaker turn from diarization","properties":{"speaker":{"type":"string","description":"Speaker identifier (e.g., SPEAKER_00, SPEAKER_01)","example":"SPEAKER_00"},"start":{"type":"number","description":"Turn start time (seconds)","example":0.0},"end":{"type":"number","description":"Turn end time (seconds)","example":5.2}}}},"responses":{"BadRequest":{"description":"Bad request - invalid parameters or missing required fields","content":{"application/json":{"schema":{"type":"object","properties":{"error":{"type":"string","example":"Invalid model specified"},"message":{"type":"string","example":"Model must be one of: canary, parakeet, whisper-turbo, auto"}}}}}},"ServerError":{"description":"Server error","content":{"application/json":{"schema":{"type":"object","properties":{"error":{"type":"string","example":"Internal server error"},"message":{"type":"string","example":"Transcription engine failed"}}}}}}}}}