Commit bf7cc1246686
Changed files (1)
tools
ollama-exporter
tools/ollama-exporter/ollama_exporter.py
@@ -43,7 +43,11 @@ def extract_and_record_metrics(response_data, model):
if not isinstance(response_data, dict):
return
+ # Support both native Ollama API and OpenAI-compatible v1 API
+ # The native API exposes timing data and token counts at the top level; the v1 API may report token counts under `usage` instead
# https://github.com/ollama/ollama/blob/main/docs/api.md#response
+
+ # Try to extract from native Ollama format first
total_duration = response_data.get("total_duration", 0) # total time spent in nanoseconds generating the response
load_duration = response_data.get("load_duration", 0) # time spent in nanoseconds loading the model
prompt_eval_duration = response_data.get("prompt_eval_duration", 0) # time spent in nanoseconds evaluating the prompt
@@ -51,6 +55,13 @@ def extract_and_record_metrics(response_data, model):
eval_duration = response_data.get("eval_duration", 0) # time spent in nanoseconds generating the response
eval_count = response_data.get("eval_count", 0) # number of tokens in the response
+ # For the v1 API, fall back to the token counts in the `usage` field when the native counters are missing or zero
+ usage = response_data.get("usage", {})
+ if usage and not prompt_eval_count:
+ prompt_eval_count = usage.get("prompt_tokens", 0)
+ if usage and not eval_count:
+ eval_count = usage.get("completion_tokens", 0)
+
if total_duration > 0:
total_duration_seconds = total_duration / 1_000_000_000
OLLAMA_TOTAL_DURATION.labels(model=model).observe(total_duration_seconds)
@@ -85,6 +96,8 @@ def metrics():
@app.post("/api/chat")
@app.post("/api/generate")
+@app.post("/v1/chat/completions")
+@app.post("/v1/completions")
async def chat_with_metrics(request: Request):
"""Handle chat and generate requests with streaming support and metrics extraction."""
body = await request.json()