{"openapi":"3.1.0","info":{"title":"haiven-reranker","description":"Cross-encoder reranking service for the Haiven knowledge pipeline. Accepts a query and a list of candidate passages, scores each (query, passage) pair using the Qwen3-Reranker-4B-seq-cls model via sentence-transformers CrossEncoder, and returns candidates sorted by relevance score.\nInternal service only — no public domain. Accessed by haiven-knowledge (port 8022) on the Docker web network at http://haiven-reranker:8460.\n","version":"1.0.0","contact":{"name":"Haiven Infrastructure"}},"servers":[{"url":"http://localhost:8460","description":"Local host (direct port mapping)"},{"url":"http://haiven-reranker:8460","description":"Docker internal (web network)"}],"tags":[{"name":"health","description":"Service health and model status"},{"name":"rerank","description":"Cross-encoder relevance reranking"}],"paths":{"/health":{"get":{"tags":["health"],"summary":"Health check","description":"Returns model load status. Always returns HTTP 200. Check the `model_loaded` field to determine if the service is ready to handle rerank requests. If `model_loaded` is false, POST /rerank will return 503.\n","operationId":"health","responses":{"200":{"description":"Health status (returned regardless of model load state)","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HealthResponse"},"examples":{"healthy":{"summary":"Model loaded and ready","value":{"status":"ok","model_loaded":true,"device":"cuda","model_name":"tomaarsen/Qwen3-Reranker-4B-seq-cls","load_error":null}},"degraded":{"summary":"Model failed to load","value":{"status":"degraded","model_loaded":false,"device":"cuda","model_name":"tomaarsen/Qwen3-Reranker-4B-seq-cls","load_error":"CUDA out of memory. Tried to allocate ..."}}}}}}}}},"/rerank":{"post":{"tags":["rerank"],"summary":"Rerank candidates","description":"Scores each candidate passage against the query using the cross-encoder model. 
Returns candidates sorted by relevance score descending, truncated to `top_n` results.\nCandidates are capped at `RERANKER_MAX_CANDIDATES` (default: 20). If more candidates than this cap are submitted, only the first `RERANKER_MAX_CANDIDATES` are scored; the extras are dropped and a warning is logged server-side.\nLatency is typically 20-80ms for 5-20 candidates on the Delta GPU. Acceptance criterion: p99 < 500ms for 20 candidates.\n","operationId":"rerank","requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RerankRequest"},"examples":{"basic":{"summary":"Basic rerank request","value":{"query":"how does GPU memory allocation work in vLLM?","candidates":[{"id":"chunk-001","text":"vLLM uses paged attention to manage GPU KV cache memory. It pre-allocates based on gpu-memory-utilization at startup."},{"id":"chunk-002","text":"Python garbage collection frees memory when objects are no longer referenced."},{"id":"chunk-003","text":"vLLM max-num-seqs controls concurrent request slots and limits peak VRAM usage."}],"top_n":2}}}}}},"responses":{"200":{"description":"Reranked results sorted by score descending","content":{"application/json":{"schema":{"$ref":"#/components/schemas/RerankResponse"},"examples":{"success":{"summary":"Successful reranking","value":{"results":[{"id":"chunk-001","text":"vLLM uses paged attention to manage GPU KV cache memory...","score":0.9423},{"id":"chunk-003","text":"vLLM max-num-seqs controls concurrent request slots...","score":0.7891}],"latency_ms":47.3}}}}}},"422":{"description":"Validation error (missing required fields, empty query, empty candidates list)","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ValidationError"}}}},"500":{"description":"Inference failed (CUDA error, model error)","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPError"},"example":{"detail":"Reranking failed: CUDA error: device-side assert triggered"}}}},"503":{"description":"Model not loaded — service started but model failed to 
initialize","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPError"},"example":{"detail":"Reranker model not loaded. Error: CUDA out of memory."}}}}}}}},"components":{"schemas":{"HealthResponse":{"type":"object","required":["status","model_loaded","device","model_name"],"properties":{"status":{"type":"string","enum":["ok","degraded"],"description":"\"ok\" if the model is loaded and ready. \"degraded\" if the model failed to load at startup.\n"},"model_loaded":{"type":"boolean","description":"True if the CrossEncoder model is in GPU memory and ready for inference."},"device":{"type":"string","description":"Inference device in use.","example":"cuda"},"model_name":{"type":"string","description":"HuggingFace model identifier.","example":"tomaarsen/Qwen3-Reranker-4B-seq-cls"},"load_error":{"type":["string","null"],"description":"Exception message from model load failure, or null if load succeeded."}}},"Candidate":{"type":"object","required":["id","text"],"properties":{"id":{"type":"string","description":"Caller-assigned identifier for this candidate (echoed back in results)."},"text":{"type":"string","description":"Passage text to score against the query."}}},"RerankRequest":{"type":"object","required":["query","candidates"],"properties":{"query":{"type":"string","minLength":1,"description":"Search query to score candidates against."},"candidates":{"type":"array","minItems":1,"items":{"$ref":"#/components/schemas/Candidate"},"description":"List of passages to score. At most `RERANKER_MAX_CANDIDATES` candidates (default: 20) are scored; the cap is enforced server-side. 
Additional candidates beyond the cap are dropped without an error response; the server only logs a warning.\n"},"top_n":{"type":"integer","default":10,"minimum":1,"description":"Number of top results to return after scoring all candidates."}}},"RankedResult":{"type":"object","required":["id","text","score"],"properties":{"id":{"type":"string","description":"Candidate ID as provided in the request."},"text":{"type":"string","description":"Candidate text as provided in the request."},"score":{"type":"number","format":"float","description":"CrossEncoder relevance score for this (query, passage) pair. Higher is more relevant. Range is approximately -10 to +10 (raw logits) depending on the model.\n"}}},"RerankResponse":{"type":"object","required":["results","latency_ms"],"properties":{"results":{"type":"array","items":{"$ref":"#/components/schemas/RankedResult"},"description":"Candidates sorted by score descending, truncated to top_n."},"latency_ms":{"type":"number","format":"float","description":"Wall-clock time for CrossEncoder.predict() only, in milliseconds."}}},"HTTPError":{"type":"object","required":["detail"],"properties":{"detail":{"type":"string","description":"Human-readable error message."}}},"ValidationError":{"type":"object","required":["detail"],"properties":{"detail":{"type":"array","items":{"type":"object","properties":{"loc":{"type":"array","items":{"type":"string"}},"msg":{"type":"string"},"type":{"type":"string"}}}}}}}}}