{"openapi":"3.0.3","info":{"title":"F5-TTS API","description":"High-quality text-to-speech API with zero-shot voice cloning using flow matching (Diffusion Transformer).\nClone any voice from 5-15 seconds of reference audio without training.\n\n## Features\n- Zero-shot voice cloning from short audio samples\n- OpenAI-compatible speech endpoint\n- Voice management (upload, list, delete)\n- Adjustable speed, silence removal, and cross-fade settings\n\n## Quick Start\n```bash\ncurl -X POST https://f5tts.haiven.site/v1/audio/speech \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"input\": \"Hello world\", \"reference_audio\": \"rosie-perez.wav\"}' \\\n  --output speech.wav\n```\n","version":"1.0.0","contact":{"name":"Haiven Infrastructure"},"license":{"name":"MIT","url":"https://opensource.org/licenses/MIT"}},"servers":[{"url":"https://f5tts.haiven.site","description":"Internal access (Haiven network)"},{"url":"https://f5tts.haiven.site","description":"External access (Let's Encrypt SSL)"},{"url":"http://localhost:5005","description":"Direct access (development)"}],"tags":[{"name":"Speech","description":"Text-to-speech generation"},{"name":"Voices","description":"Voice management for cloning"},{"name":"Models","description":"Model information"},{"name":"Health","description":"Service health and status"}],"paths":{"/":{"get":{"tags":["Health"],"summary":"Web UI","description":"Serves the F5-TTS web interface for interactive voice synthesis","operationId":"getWebUI","responses":{"200":{"description":"HTML page","content":{"text/html":{"schema":{"type":"string"}}}}}}},"/health":{"get":{"tags":["Health"],"summary":"Health check","description":"Returns the health status of the F5-TTS service","operationId":"getHealth","responses":{"200":{"description":"Service health 
status","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HealthResponse"},"example":{"status":"ok","model_loaded":true,"device":"cuda","gpu_available":true}}}}}}},"/v1/audio/speech":{"post":{"tags":["Speech"],"summary":"Generate speech","description":"Generate speech from text with optional voice cloning. This endpoint is OpenAI-compatible.\n\nIf `reference_audio` is provided, the generated speech will clone that voice.\nIf `reference_text` is not provided but a `.txt` file with the same name exists\nin the reference audio directory, it will be used automatically.\n","operationId":"generateSpeech","requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/SpeechRequest"},"examples":{"simple":{"summary":"Simple TTS","value":{"input":"Hello, this is a test of F5 TTS."}},"voice_cloning":{"summary":"Voice cloning","value":{"input":"Hello, this is a test with voice cloning.","reference_audio":"rosie-perez.wav","speed":1.0}},"full_options":{"summary":"All options","value":{"input":"This demonstrates all available options for speech generation.","reference_audio":"rosie-perez.wav","reference_text":"say maybe 95% of the dancers there were not...","speed":0.95,"remove_silence":true,"cross_fade_duration":0.2}}}}}},"responses":{"200":{"description":"Generated audio file","content":{"audio/wav":{"schema":{"type":"string","format":"binary"}}}},"400":{"description":"Bad request - missing or invalid parameters","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"No input text provided"}}}},"404":{"description":"Reference audio not found","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Reference audio voice.wav not found"}}}},"500":{"description":"Internal server 
error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"}}}}}}},"/v1/voices":{"get":{"tags":["Voices"],"summary":"List voices","description":"Returns a list of all available reference audio files for voice cloning","operationId":"listVoices","responses":{"200":{"description":"List of available voices","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VoiceListResponse"},"example":{"voices":[{"id":"rosie-perez","name":"rosie-perez","path":"rosie-perez.wav","size_bytes":720078,"created_at":1766090772.702}]}}}}}},"post":{"tags":["Voices"],"summary":"Upload voice","description":"Upload a new reference audio file for voice cloning.\nThe audio will be converted to optimal format (24kHz, mono, 16-bit WAV).\n","operationId":"createVoice","requestBody":{"required":true,"content":{"multipart/form-data":{"schema":{"$ref":"#/components/schemas/VoiceUploadRequest"}}}},"responses":{"201":{"description":"Voice created successfully","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VoiceCreateResponse"},"example":{"success":true,"voice":{"id":"my-voice","name":"my-voice","path":"my-voice.wav","duration":12.5,"has_transcription":true}}}}},"400":{"description":"Bad request - missing audio file or invalid name","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"}}}},"409":{"description":"Conflict - voice already exists","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Voice 'my-voice' already exists"}}}},"500":{"description":"Internal server error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"}}}}}}},"/v1/voices/{voice_id}":{"get":{"tags":["Voices"],"summary":"Get voice details","description":"Returns detailed information about a specific voice","operationId":"getVoice","parameters":[{"$ref":"#/components/parameters/VoiceId"}],"responses":{"200":{"description":"Voice 
details","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VoiceDetailResponse"},"example":{"id":"rosie-perez","name":"rosie-perez","path":"rosie-perez.wav","size_bytes":720078,"duration":15.0,"created_at":1766090772.702,"transcription":"say maybe 95% of the dancers there were not..."}}}},"404":{"description":"Voice not found","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"}}}}}},"delete":{"tags":["Voices"],"summary":"Delete voice","description":"Delete a voice and its associated files (audio and transcription)","operationId":"deleteVoice","parameters":[{"$ref":"#/components/parameters/VoiceId"}],"responses":{"200":{"description":"Voice deleted successfully","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VoiceDeleteResponse"},"example":{"success":true,"deleted":"my-voice"}}}},"404":{"description":"Voice not found","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"}}}}}}},"/v1/voices/{voice_id}/audio":{"get":{"tags":["Voices"],"summary":"Download voice audio","description":"Download the reference audio file for a voice","operationId":"getVoiceAudio","parameters":[{"$ref":"#/components/parameters/VoiceId"}],"responses":{"200":{"description":"Audio file","content":{"audio/wav":{"schema":{"type":"string","format":"binary"}}}},"404":{"description":"Voice not found","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"}}}}}}},"/v1/models":{"get":{"tags":["Models"],"summary":"List models","description":"Returns information about available F5-TTS models","operationId":"listModels","responses":{"200":{"description":"List of available models","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ModelListResponse"},"example":{"models":[{"id":"f5-tts-base","name":"F5-TTS Base","description":"F5-TTS base model with 100K hours 
training","parameters":"335M","context_length":"variable","loaded":true}]}}}}}}}},"components":{"parameters":{"VoiceId":{"name":"voice_id","in":"path","required":true,"description":"Voice identifier (filename without extension)","schema":{"type":"string","example":"rosie-perez"}}},"schemas":{"HealthResponse":{"type":"object","properties":{"status":{"type":"string","description":"Service status","enum":["ok","initializing"],"example":"ok"},"model_loaded":{"type":"boolean","description":"Whether the F5-TTS model is loaded","example":true},"device":{"type":"string","description":"Compute device (cuda or cpu)","example":"cuda"},"gpu_available":{"type":"boolean","description":"Whether GPU is available","example":true}},"required":["status","model_loaded","device","gpu_available"]},"SpeechRequest":{"type":"object","properties":{"input":{"type":"string","description":"Text to synthesize (max 5000 characters)","example":"Hello, this is a test of F5 TTS.","maxLength":5000},"reference_audio":{"type":"string","description":"Filename of voice sample in reference-audio directory.\nIf not provided, uses default synthesis without voice cloning.\n","example":"rosie-perez.wav"},"reference_text":{"type":"string","description":"Transcription of reference audio. 
Improves voice matching quality.\nIf not provided, the system will look for a .txt file with the same name.\n","example":"say maybe 95% of the dancers there were not..."},"speed":{"type":"number","description":"Speaking rate (0.5 = slow, 2.0 = fast)","default":1.0,"minimum":0.5,"maximum":2.0,"example":1.0},"remove_silence":{"type":"boolean","description":"Remove leading and trailing silence","default":false,"example":false},"cross_fade_duration":{"type":"number","description":"Cross-fade duration for segment blending (0.0-1.0)","default":0.15,"minimum":0.0,"maximum":1.0,"example":0.15}},"required":["input"]},"VoiceListResponse":{"type":"object","properties":{"voices":{"type":"array","items":{"$ref":"#/components/schemas/VoiceSummary"}}}},"VoiceSummary":{"type":"object","properties":{"id":{"type":"string","description":"Voice identifier","example":"rosie-perez"},"name":{"type":"string","description":"Voice display name","example":"rosie-perez"},"path":{"type":"string","description":"Filename in reference-audio directory","example":"rosie-perez.wav"},"size_bytes":{"type":"integer","description":"File size in bytes","example":720078},"created_at":{"type":"number","description":"Creation timestamp (Unix epoch seconds)","example":1766090772.702}}},"VoiceDetailResponse":{"allOf":[{"$ref":"#/components/schemas/VoiceSummary"},{"type":"object","properties":{"duration":{"type":"number","description":"Audio duration in seconds","example":15.0},"transcription":{"type":"string","description":"Transcription of the reference audio (if available)","example":"say maybe 95% of the dancers there were not..."}}}]},"VoiceUploadRequest":{"type":"object","properties":{"audio":{"type":"string","format":"binary","description":"Audio file (WAV or MP3)"},"name":{"type":"string","description":"Voice name (uses filename if not provided)","example":"my-voice"},"text":{"type":"string","description":"Transcription of the audio (stored as .txt file)","example":"This is exactly what I said in the 
recording."}},"required":["audio"]},"VoiceCreateResponse":{"type":"object","properties":{"success":{"type":"boolean","example":true},"voice":{"type":"object","properties":{"id":{"type":"string","example":"my-voice"},"name":{"type":"string","example":"my-voice"},"path":{"type":"string","example":"my-voice.wav"},"duration":{"type":"number","example":12.5},"has_transcription":{"type":"boolean","example":true}}}}},"VoiceDeleteResponse":{"type":"object","properties":{"success":{"type":"boolean","example":true},"deleted":{"type":"string","description":"ID of deleted voice","example":"my-voice"}}},"ModelListResponse":{"type":"object","properties":{"models":{"type":"array","items":{"$ref":"#/components/schemas/ModelInfo"}}}},"ModelInfo":{"type":"object","properties":{"id":{"type":"string","description":"Model identifier","example":"f5-tts-base"},"name":{"type":"string","description":"Model display name","example":"F5-TTS Base"},"description":{"type":"string","description":"Model description","example":"F5-TTS base model with 100K hours training"},"parameters":{"type":"string","description":"Number of model parameters","example":"335M"},"context_length":{"type":"string","description":"Context length capability","example":"variable"},"loaded":{"type":"boolean","description":"Whether model is currently loaded","example":true}}},"ErrorResponse":{"type":"object","properties":{"error":{"type":"string","description":"Error message","example":"No input text provided"}},"required":["error"]}}}}