Commit f7645511052b
Changed files (2)
docs/emacs-ollama-config.el
@@ -0,0 +1,109 @@
+;;; emacs-ollama-config.el --- Ollama integration for Emacs -*- lexical-binding: t -*-
+
+;; Copyright (C) 2025 Vincent Demeester
+;; Author: Vincent Demeester <vincent@sbr.pm>
+
+;; This file is NOT part of GNU Emacs.
+
+;;; Commentary:
+;; Configuration for integrating Ollama LLM with Emacs using gptel.
+;; Add this to your init.el after the gptel use-package declaration.
+
+;;; Code:
+
+;; Add this inside your (use-package gptel :config ...) block
+
+;; Ollama backend with metrics tracking (RECOMMENDED - default choice)
+;; Measured overhead: ~25ms (negligible - 0.5-1.25% on small models, <0.1% on larger models)
+(gptel-make-ollama "Ollama (with metrics)"
+  :stream t
+  :host "192.168.1.23:8000"            ; Goes through the exporter so Prometheus sees the request
+  :models '(;; Tool calling / OpenCode
+            "llama3.1:8b"              ; Strongest at tool calling
+            "mistral-nemo:latest"      ; Quicker tool calling
+
+            ;; Code generation
+            "qwen2.5-coder:7b"         ; Top coding performance
+            "codestral:latest"         ; Bigger coding model (22B)
+            "qwen-opencode:latest"     ; Custom model for OpenCode
+
+            ;; Reasoning
+            "deepseek-r1:7b"           ; Light-weight reasoning
+            "phi4-reasoning:latest"    ; 14B reasoning model
+
+            ;; Vision / multimodal
+            "qwen2.5vl:7b"             ; Accepts images
+
+            ;; Short, quick tasks
+            "phi3.5:3.8b"))            ; Fastest of the set
+
+;; Direct Ollama backend (no metrics, saves ~25ms per request)
+;; Only use for benchmarking or when you explicitly don't want metrics
+(gptel-make-ollama "Ollama (direct)"
+  :stream t
+  :host "192.168.1.23:11434"           ; Talks to Ollama itself; the exporter never sees it
+  :models '("llama3.1:8b"
+            "qwen2.5-coder:7b"
+            "phi3.5:3.8b"
+            "deepseek-r1:7b"))
+
+;; Optional: Set Ollama as default backend
+;; Uncomment these lines to use Ollama by default instead of Gemini:
+;;
+;; (setq gptel-model "llama3.1:8b"
+;; gptel-backend (gptel-make-ollama "Ollama"
+;; :host "192.168.1.23:8000"
+;; :stream t
+;; :models '("llama3.1:8b"
+;; "qwen2.5-coder:7b"
+;; "phi3.5:3.8b")))
+
+;;; Usage:
+;;
+;; 1. Open a buffer and start gptel:
+;; M-x gptel
+;; or C-c a g (if you have the binding)
+;;
+;; 2. Switch backend using the transient menu:
+;; C-c C-m (gptel-menu)
+;; Then select backend and model
+;;
+;; 3. Send requests:
+;; C-c C-c (gptel-send)
+;;
+;; 4. Abort requests:
+;; C-c C-k (gptel-abort)
+;;
+;; 5. Quick selection:
+;; In any buffer, select text and call:
+;; M-x gptel-send
+;; This will send the selected region to the LLM
+
+;;; Performance Notes:
+;;
+;; - CPU-only models: First request 30-90s, subsequent 15-45s
+;; - Model stays loaded: 10 minutes after last request
+;; - Fastest model: phi3.5:3.8b (~2-5 seconds with warm model)
+;; - Best coding: qwen2.5-coder:7b (~30-60 seconds)
+;; - Exporter overhead: ~25ms (negligible - use "with metrics" backend by default)
+;; - Only use "direct" backend for synthetic benchmarks or troubleshooting
+
+;;; Recommended Models by Task:
+;;
+;; Coding:
+;; - qwen2.5-coder:7b (best performance)
+;; - codestral:latest (larger, more capable but slower)
+;;
+;; Quick queries:
+;; - phi3.5:3.8b (fastest)
+;;
+;; Reasoning:
+;; - deepseek-r1:7b (good reasoning)
+;; - phi4-reasoning:latest (better but slower)
+;;
+;; Tool calling (for agent-like behavior):
+;; - llama3.1:8b
+;; - mistral-nemo:latest
+
+(provide 'emacs-ollama-config)
+;;; emacs-ollama-config.el ends here
docs/ollama-usage-guide.md
@@ -0,0 +1,332 @@
+# Ollama Usage Guide
+
+This guide covers how to interact with the Ollama LLM service running on aomi (192.168.1.23).
+
+## Service Architecture
+
+```
+┌─────────────┐          ┌──────────────────┐          ┌─────────────┐
+│ Application │ ────────>│ Ollama Exporter  │ ────────>│   Ollama    │
+│ (OpenCode,  │  :8000   │  (Prometheus)    │  :11434  │   Service   │
+│  Emacs)     │          │  Metrics Proxy   │          │             │
+└─────────────┘          └──────────────────┘          └─────────────┘
+                                  │
+                                  v
+                          ┌────────────────┐
+                          │   Prometheus   │
+                          │   (sakhalin)   │
+                          └────────────────┘
+```
+
+## Endpoints
+
+### With Metrics (Recommended) - Port 8000
+Use these URLs to have your requests tracked in Prometheus/Grafana:
+- **Native API**: `http://192.168.1.23:8000/api/generate`
+- **Native Chat**: `http://192.168.1.23:8000/api/chat`
+- **OpenAI-compatible**: `http://192.168.1.23:8000/v1/chat/completions`
+- **OpenAI completions**: `http://192.168.1.23:8000/v1/completions`
+
+### Direct Ollama (No Metrics) - Port 11434
+Use these URLs to bypass metrics collection (faster, no tracking):
+- **Native API**: `http://192.168.1.23:11434/api/generate`
+- **Native Chat**: `http://192.168.1.23:11434/api/chat`
+- **OpenAI-compatible**: `http://192.168.1.23:11434/v1/chat/completions`
+- **OpenAI completions**: `http://192.168.1.23:11434/v1/completions`
+
+### VPN URLs (From Any Machine)
+- **With metrics**: `https://ollama.sbr.pm/` or `https://llm.sbr.pm/` (via Traefik, port 443)
+- **Direct**: Not exposed via VPN (local network only)
+
+## Available Models
+
+```bash
+# List all models
+curl http://192.168.1.23:11434/api/tags
+
+# List via OpenAI-compatible API
+curl http://192.168.1.23:11434/v1/models
+```
+
+**Current models:**
+- `llama3.1:8b` - Best for tool calling (OpenCode)
+- `mistral-nemo` - Fast tool calling
+- `qwen2.5-coder:7b` - Best coding performance
+- `codestral:latest` - Large coding model (22B)
+- `deepseek-r1:7b` - Reasoning
+- `phi4-reasoning:latest` - 14B reasoning
+- `phi3.5:3.8b` - Fastest, smallest
+- `qwen2.5vl:7b` - Vision/multimodal
+
+## Usage Examples
+
+### 1. Native Ollama API (Simple)
+
+**Non-streaming request:**
+```bash
+curl http://192.168.1.23:8000/api/generate \
+ -d '{
+ "model": "phi3.5:3.8b",
+ "prompt": "Why is the sky blue?",
+ "stream": false
+ }'
+```
+
+**Streaming request:**
+```bash
+curl http://192.168.1.23:8000/api/generate \
+ -d '{
+ "model": "phi3.5:3.8b",
+ "prompt": "Write a haiku about coding",
+ "stream": true
+ }'
+```
+
+**Chat format:**
+```bash
+curl http://192.168.1.23:8000/api/chat \
+ -d '{
+ "model": "llama3.1:8b",
+ "messages": [
+ {"role": "system", "content": "You are a helpful coding assistant."},
+ {"role": "user", "content": "How do I reverse a string in Python?"}
+ ],
+ "stream": false
+ }'
+```
+
+### 2. OpenAI-Compatible API (For Compatibility)
+
+**Chat completions:**
+```bash
+curl http://192.168.1.23:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "llama3.1:8b",
+ "messages": [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "Explain quantum computing in simple terms"}
+ ],
+ "temperature": 0.7,
+ "max_tokens": 500
+ }'
+```
+
+**With streaming:**
+```bash
+curl http://192.168.1.23:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "qwen2.5-coder:7b",
+ "messages": [
+ {"role": "user", "content": "Write a bubble sort in Rust"}
+ ],
+ "stream": true
+ }'
+```
+
+### 3. Direct Ollama (Bypass Metrics)
+
+Use port 11434 instead of 8000 for direct access:
+
+```bash
+# Same as above, but faster (no metrics overhead)
+curl http://192.168.1.23:11434/api/generate \
+ -d '{
+ "model": "phi3.5:3.8b",
+ "prompt": "Quick test",
+ "stream": false
+ }'
+```
+
+### 4. Advanced Options
+
+**Control generation parameters:**
+```bash
+curl http://192.168.1.23:8000/api/generate \
+ -d '{
+ "model": "llama3.1:8b",
+ "prompt": "Write a function to calculate fibonacci",
+ "stream": false,
+ "options": {
+ "temperature": 0.7,
+ "top_p": 0.9,
+ "top_k": 40,
+ "num_ctx": 8192,
+ "num_predict": 1024
+ }
+ }'
+```
+
+**System prompt + context:**
+```bash
+curl http://192.168.1.23:8000/api/generate \
+ -d '{
+ "model": "qwen2.5-coder:7b",
+ "prompt": "Add error handling to this function",
+ "system": "You are an expert code reviewer focusing on robustness and error handling.",
+ "context": "This is a production application handling financial transactions",
+ "stream": false
+ }'
+```
+
+## Programming Language Examples
+
+### Python
+
+```python
+import requests
+import json
+
+def ask_ollama(prompt, model="phi3.5:3.8b", stream=False):
+ """Query Ollama with metrics tracking."""
+ url = "http://192.168.1.23:8000/api/generate"
+ payload = {
+ "model": model,
+ "prompt": prompt,
+ "stream": stream
+ }
+
+ response = requests.post(url, json=payload)
+
+ if stream:
+ for line in response.iter_lines():
+ if line:
+ chunk = json.loads(line)
+ print(chunk.get("response", ""), end="", flush=True)
+ else:
+ return response.json()["response"]
+
+# Usage
+result = ask_ollama("What is Python?")
+print(result)
+```
+
+### Bash/Shell Script
+
+```bash
+#!/bin/bash
+# ollama-query.sh - Query Ollama from shell
+
+MODEL="${1:-phi3.5:3.8b}"
+PROMPT="${2:-Hello}"
+
+curl -s http://192.168.1.23:8000/api/generate \
+ -d "{\"model\":\"$MODEL\",\"prompt\":\"$PROMPT\",\"stream\":false}" \
+ | jq -r '.response'
+```
+
+### Emacs Lisp (see `docs/emacs-ollama-config.el`)
+
+## Proxy Overhead Benchmarks
+
+**Measured overhead of exporter proxy: ~25ms average**
+
+Test results (5 runs with phi3.5:3.8b):
+```
+Run 1: 17ms overhead
+Run 2: 19ms overhead
+Run 3: 36ms overhead
+Run 4: 36ms overhead
+Run 5: 15ms overhead
+Average: 25ms (0.025s)
+```
+
+**Impact on typical requests:**
+- Small model (2-5s): 0.5-1.25% overhead
+- Medium model (30-60s): 0.04-0.08% overhead
+- Large model (60-120s): 0.02-0.04% overhead
+
+**Conclusion: The overhead is negligible for all practical purposes.**
+
+## When to Use Direct vs Metrics
+
+### Use Metrics Endpoint (Port 8000) - RECOMMENDED
+- ✅ **Default choice** - overhead is negligible (~25ms)
+- ✅ You want to track usage in Grafana
+- ✅ You need to monitor performance
+- ✅ Running production workloads
+- ✅ Debugging slow responses
+- ✅ Want to see token counts and costs
+
+### Use Direct Endpoint (Port 11434) - RARE CASES ONLY
+- ✅ Running synthetic benchmarks where 25ms matters
+- ✅ Troubleshooting the exporter itself
+- ✅ Explicitly don't want metrics for privacy/compliance
+
+## Monitoring
+
+### View Metrics
+```bash
+# Prometheus metrics endpoint
+curl http://192.168.1.23:8000/metrics | grep ollama_
+
+# Query Prometheus directly
+curl -s "http://192.168.1.70:9001/api/v1/query?query=ollama_requests_total"
+```
+
+### Grafana Dashboards
+- Navigate to Grafana: `http://grafana.sbr.pm`
+- Look for "Ollama Metrics" and "Ollama Performance" dashboards
+
+## Troubleshooting
+
+### Check Service Status
+```bash
+# Ollama service
+ssh aomi.sbr.pm "systemctl status ollama.service"
+
+# Exporter service
+ssh aomi.sbr.pm "systemctl status ollama-exporter.service"
+
+# Exporter container
+ssh aomi.sbr.pm "docker ps | grep ollama"
+```
+
+### Test Connectivity
+```bash
+# Test direct Ollama
+curl http://192.168.1.23:11434/api/version
+
+# Test exporter
+curl http://192.168.1.23:8000/api/version
+
+# Test models endpoint
+curl http://192.168.1.23:11434/api/tags
+```
+
+### View Logs
+```bash
+# Ollama service logs
+ssh aomi.sbr.pm "journalctl -u ollama.service -f"
+
+# Exporter logs
+ssh aomi.sbr.pm "journalctl -u ollama-exporter.service -f"
+
+# Docker exporter logs
+ssh aomi.sbr.pm "docker logs -f ollama-exporter"
+```
+
+## Performance Notes
+
+- **CPU-only**: Models run on CPU (no GPU acceleration)
+- **First request**: 30-90 seconds (model loading)
+- **Subsequent requests**: 15-45 seconds (model cached)
+- **Model stays loaded**: 10 minutes after last request (configurable)
+- **Fastest model**: `phi3.5:3.8b` (~2-5 seconds with a warm model)
+- **Best coding**: `qwen2.5-coder:7b` (~30-60 seconds)
+- **Tool calling**: `llama3.1:8b`, `mistral-nemo`
+
+## Security
+
+- **Network access**: Local network (192.168.1.0/24) and VPN (10.100.0.0/24) only
+- **Authentication**: None (trusted network)
+- **HTTPS**: Available via Traefik (`https://ollama.sbr.pm`)
+- **Firewall**: Neither port is exposed publicly; port 11434 is local-network only, and VPN clients reach only the metrics proxy (port 8000) via Traefik
+
+## See Also
+
+- [Ollama API Documentation](https://github.com/ollama/ollama/blob/main/docs/api.md)
+- [OpenAI API Compatibility](https://github.com/ollama/ollama/blob/main/docs/openai.md)
+- Emacs configuration: `docs/emacs-ollama-config.el`
+- OpenCode configuration: `~/.config/opencode/opencode.json`