Commit 0edf092f2bc0
Changed files (14)
modules
nix-flake-updater
pkgs
systems
tools
nix-flake-update
modules/nix-flake-updater/default.nix
@@ -1,297 +0,0 @@
-{
- config,
- lib,
- pkgs,
- ...
-}:
-
-with lib;
-
-let
- cfg = config.services.nix-flake-updater;
-
- instanceOpts =
- { config, ... }:
- {
- options = {
- enable = mkEnableOption "this flake updater instance";
-
- repoPath = mkOption {
- type = types.str;
- example = "/home/user/nixos-config";
- description = "Path to the git repository containing the flake";
- };
-
- flakePath = mkOption {
- type = types.str;
- default = config.repoPath;
- example = "/home/user/nixos-config";
- description = "Path to the flake (usually same as repoPath)";
- };
-
- gitRemote = mkOption {
- type = types.str;
- default = "origin";
- description = "Git remote name to push to";
- };
-
- mainBranch = mkOption {
- type = types.str;
- default = "main";
- description = "Main branch name (for auto-merge)";
- };
-
- branchPrefix = mkOption {
- type = types.str;
- default = "flake-update-";
- description = "Prefix for update branches";
- };
-
- flakeInputs = mkOption {
- type = types.listOf types.str;
- default = [ ];
- example = [
- "chick-group"
- "chapeau-rouge"
- ];
- description = "List of specific flake inputs to update (empty = all)";
- };
-
- autoMerge = mkOption {
- type = types.bool;
- default = false;
- description = "If true, automatically merge to main branch on successful build";
- };
-
- inboxOrg = mkOption {
- type = types.str;
- default = "/home/${config.user}/desktop/org/inbox.org";
- example = "/home/user/org/inbox.org";
- description = "Path to org-mode inbox file for TODO entries on failure";
- };
-
- buildSystems = mkOption {
- type = types.listOf types.str;
- default = [ ];
- example = [
- "aomi"
- "sakhalin"
- ];
- description = "List of NixOS systems to build for verification";
- };
-
- schedule = mkOption {
- type = types.str;
- default = "weekly";
- example = "Mon *-*-* 02:00:00";
- description = "Systemd timer schedule (OnCalendar format or 'weekly'/'daily')";
- };
-
- ntfyTopic = mkOption {
- type = types.str;
- default = "nix-updates";
- description = "ntfy topic for notifications";
- };
-
- ntfyServer = mkOption {
- type = types.str;
- default = "https://ntfy.sh";
- example = "http://ntfy.sbr.pm";
- description = "ntfy server URL";
- };
-
- ntfyTokenFile = mkOption {
- type = types.nullOr types.path;
- default = null;
- description = "Path to file containing ntfy authentication token (optional)";
- };
-
- dryRun = mkOption {
- type = types.bool;
- default = false;
- description = "If true, don't push to remote (testing mode)";
- };
-
- user = mkOption {
- type = types.str;
- default = "root";
- description = "User to run the update as";
- };
-
- # Passed straight through to the timer's RandomizedDelaySec setting.
- randomizedDelaySec = mkOption {
- type = types.ints.unsigned;
- default = 3600;
- description = "Upper bound, in seconds, of the random delay added before each scheduled run (the actual delay is picked between 0 and this value; 0 disables it)";
- };
-
- sshKeyFile = mkOption {
- type = types.str;
- default = "/home/${config.user}/.ssh/id_ed25519";
- example = "/home/user/.ssh/id_passage";
- description = "Path to the SSH private key for git push (must be authorized on the remote)";
- };
-
- autoFix = {
- enable = mkEnableOption "AI-powered auto-fix on build failure";
-
- command = mkOption {
- type = types.str;
- default = "pi";
- description = "Agent command to invoke (must support -p for non-interactive mode)";
- };
-
- extraArgs = mkOption {
- type = types.listOf types.str;
- default = [
- "--provider"
- "google-vertex-claude"
- "--no-session"
- "--no-themes"
- "--no-skills"
- ];
- description = "Extra arguments passed to the agent command (note: do not use --no-extensions if the provider is an extension)";
- };
-
- maxAttempts = mkOption {
- type = types.int;
- default = 3;
- description = "Maximum agent invocations per failing host before giving up";
- };
-
- envFile = mkOption {
- type = types.nullOr types.path;
- default = null;
- description = "Optional file to source before running the agent (for API keys, credentials)";
- };
-
- environment = mkOption {
- type = types.attrsOf types.str;
- default = { };
- example = {
- GOOGLE_CLOUD_PROJECT = "my-project";
- GOOGLE_CLOUD_LOCATION = "global";
- };
- description = "Environment variables to set when running the agent";
- };
- };
- };
- };
-
-  # Generate the per-instance launcher script.
-  #
-  # The script exports the instance's settings as environment variables and
-  # then execs the packaged nix-flake-update tool, which reads them.
-  #
-  # Arguments:
-  #   name        - instance name; only used in the script's store name
-  #   instanceCfg - evaluated submodule configuration of that instance
-  #
-  # Every value goes through escapeShellArg, so quotes, `$`, backticks or
-  # spaces in an option value are passed literally and can neither break nor
-  # inject into the generated script. Values are converted with toString
-  # first: lists become space-separated words, booleans become "1"/"", and
-  # path values are referenced in place instead of being copied into the
-  # Nix store (which matters for the token/credential files).
-  mkUpdateScript =
-    name: instanceCfg:
-    let
-      # Render a single `export KEY=<shell-quoted value>` line.
-      exportVar = key: value: "export ${key}=${escapeShellArg (toString value)}";
-      autoFix = instanceCfg.autoFix;
-    in
-    pkgs.writeShellScript "nix-flake-update-${name}" ''
-      ${exportVar "REPO_PATH" instanceCfg.repoPath}
-      ${exportVar "FLAKE_PATH" instanceCfg.flakePath}
-      ${exportVar "GIT_REMOTE" instanceCfg.gitRemote}
-      ${exportVar "MAIN_BRANCH" instanceCfg.mainBranch}
-      ${exportVar "BRANCH_PREFIX" instanceCfg.branchPrefix}
-      ${exportVar "NTFY_TOPIC" instanceCfg.ntfyTopic}
-      ${exportVar "NTFY_SERVER" instanceCfg.ntfyServer}
-      ${exportVar "BUILD_SYSTEMS" instanceCfg.buildSystems}
-      ${exportVar "DRY_RUN" instanceCfg.dryRun}
-      ${exportVar "FLAKE_INPUTS" instanceCfg.flakeInputs}
-      ${exportVar "AUTO_MERGE" instanceCfg.autoMerge}
-      ${exportVar "INBOX_ORG" instanceCfg.inboxOrg}
-      ${optionalString (instanceCfg.ntfyTokenFile != null) (
-        exportVar "NTFY_TOKEN_FILE" instanceCfg.ntfyTokenFile
-      )}
-      ${exportVar "AUTO_FIX" autoFix.enable}
-      ${exportVar "AUTO_FIX_COMMAND" autoFix.command}
-      ${exportVar "AUTO_FIX_EXTRA_ARGS" autoFix.extraArgs}
-      ${exportVar "AUTO_FIX_MAX_ATTEMPTS" autoFix.maxAttempts}
-      ${optionalString (autoFix.envFile != null) (exportVar "AUTO_FIX_ENV_FILE" autoFix.envFile)}
-      ${concatStringsSep "\n" (mapAttrsToList exportVar autoFix.environment)}
-
-      # Execute the packaged update script (already has tools in PATH)
-      exec ${pkgs.nix-flake-update}/bin/nix-flake-update
-    '';
-
-  # Build the systemd service for one updater instance, returned as a
-  # name/value pair so the caller can assemble systemd.services from it.
-  #
-  # Arguments:
-  #   name        - instance name; becomes the unit name suffix
-  #   instanceCfg - evaluated submodule configuration of that instance
-  mkService =
-    name: instanceCfg:
-    let
-      home = "/home/${instanceCfg.user}";
-    in
-    nameValuePair "nix-flake-updater-${name}" {
-      description = "Automated Nix flake.lock updater (${name})";
-
-      serviceConfig = {
-        Type = "oneshot";
-        User = instanceCfg.user;
-        ExecStart = "${mkUpdateScript name instanceCfg}";
-        Environment = ''"GIT_SSH_COMMAND=ssh -F /dev/null -o IdentitiesOnly=yes -i ${instanceCfg.sshKeyFile} -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${home}/.ssh/known_hosts"'';
-
-        # Don't fail if update fails (e.g., no changes, build failures)
-        SuccessExitStatus = "0 1";
-
-        # Security hardening
-        PrivateTmp = true;
-        ProtectSystem = "strict";
-        ProtectHome = "read-only";
-        ReadWritePaths = [
-          instanceCfg.repoPath
-          "/var/log/nix-flake-updater"
-          # Worktree location (script creates worktrees in ~/tmp)
-          "${home}/tmp"
-          # Nix cache for flake fetcher. The leading "-" tells systemd to
-          # ignore the entry when the directory does not exist instead of
-          # failing the unit during namespace setup.
-          "-${home}/.cache/nix"
-          # Org inbox for TODOs
-          (dirOf instanceCfg.inboxOrg)
-          # Pi agent session/config directory (needed for auto-fix); optional
-          # for the same reason as the Nix cache above.
-          "-${home}/.pi"
-        ];
-        NoNewPrivileges = true;
-
-        # Logging
-        StandardOutput = "journal";
-        StandardError = "journal";
-        SyslogIdentifier = "nix-flake-updater-${name}";
-      };
-    };
-
-  # Build the systemd timer for one updater instance. The unit name matches
-  # the service produced by mkService, and the result is a name/value pair
-  # suitable for listToAttrs.
-  mkTimer =
-    name: instanceCfg:
-    {
-      name = "nix-flake-updater-${name}";
-      value = {
-        description = "Timer for automated Nix flake.lock updates (${name})";
-        wantedBy = [ "timers.target" ];
-
-        timerConfig = {
-          Persistent = true;
-          OnCalendar = instanceCfg.schedule;
-          RandomizedDelaySec = instanceCfg.randomizedDelaySec;
-        };
-      };
-    };
-
-in
-{
-  # One submodule per named updater instance.
-  options.services.nix-flake-updater = mkOption {
-    description = "Automated Nix flake.lock updater instances";
-    type = types.attrsOf (types.submodule instanceOpts);
-    default = { };
-  };
-
-  config = mkIf (cfg != { }) (
-    let
-      # Only instances with enable = true produce units.
-      enabledInstances = filterAttrs (_: instance: instance.enable) cfg;
-
-      # Collect all unique users from enabled instances
-      runAsUsers = unique (mapAttrsToList (_: instance: instance.user) enabledInstances);
-
-      # tmpfiles rule handing the shared log directory to one user.
-      logOwnershipRule = user: "Z /var/log/nix-flake-updater - ${user} - -";
-    in
-    {
-      systemd.services = mapAttrs' mkService enabledInstances;
-
-      systemd.timers = mapAttrs' mkTimer enabledInstances;
-
-      # Ensure log directory exists (shared by all instances)
-      # Create with permissions for all users that need access
-      systemd.tmpfiles.rules = [
-        "d /var/log/nix-flake-updater 0775 root users -"
-      ]
-      ++ map logOwnershipRule runAsUsers;
-    }
-  );
-}
modules/nix-flake-updater/README.md
@@ -1,147 +0,0 @@
-# Nix Flake Updater Module
-
-Automated NixOS module for updating `flake.lock` with build verification, notifications,
-and optional AI-powered auto-fix.
-
-## Overview
-
-This module provides automated, unattended flake.lock updates that:
-
-- Run on a configurable schedule via systemd timers
-- Verify builds across multiple systems before committing
-- Optionally use a coding agent (pi) to auto-fix build failures
-- Create git branches for review workflow
-- Send notifications via ntfy
-- Support multiple named instances (e.g., daily, biweekly)
-- Support dry-run mode for testing
-
-## Files
-
-- `default.nix` - NixOS module definition
-- `../../tools/nix-flake-update/` - Update script package (wrapped with dependencies)
-
-## Usage
-
-Import the module and configure instances:
-
-```nix
-{
- imports = [ ../../modules/nix-flake-updater ];
-
- services.nix-flake-updater = {
- # Bi-weekly full update with auto-fix
- biweekly = {
- enable = true;
- repoPath = "/home/vincent/src/home";
- buildSystems = [ "okinawa" "kyushu" "rhea" "athena" ];
- schedule = "Sun *-*-1..7,15..21 02:00:00";
- ntfyServer = "https://ntfy.sbr.pm";
- user = "vincent";
-
- autoFix = {
- enable = true;
- command = "pir";
- extraArgs = [ "--model" "claude-opus-4-6" "--no-session" "--no-extensions" "--no-themes" ];
- maxAttempts = 3;
- };
- };
-
- # Daily update for specific inputs with auto-merge
- daily = {
- enable = true;
- repoPath = "/home/vincent/src/home";
- flakeInputs = [ "chick-group" "chapeau-rouge" ];
- autoMerge = true;
- buildSystems = [ "okinawa" "kyushu" ];
- schedule = "*-*-* 04:00:00";
- user = "vincent";
- };
- };
-}
-```
-
-## Auto-Fix
-
-When `autoFix.enable = true`, build failures trigger a coding agent to attempt fixes:
-
-1. Build error stderr is captured (last 200 lines)
-2. The agent is invoked in non-interactive mode (`-p`) with the error context
-3. The agent reads AGENTS.md files in the repo for channel-awareness rules
-4. If the fix works, it's committed separately from the flake.lock update
-5. A regression check rebuilds all hosts after fixes are applied
-6. Up to `maxAttempts` retries per failing host
-
-### Agent Authentication
-
-The `pir` agent command (used in the example above; the module's own default for
-`autoFix.command` is `pi`) uses `passage` for API key retrieval. For headless
-systemd execution, ensure the password store is accessible without interactive auth,
-or use `autoFix.envFile` to source credentials:
-
-```nix
-autoFix = {
- enable = true;
- envFile = config.age.secrets."vertex-ai-credentials".path;
-};
-```
-
-## Manual Trigger
-
-```bash
-# Run the bi-weekly update manually
-sudo systemctl start nix-flake-updater-biweekly
-
-# View logs
-journalctl -u nix-flake-updater-biweekly -f
-
-# Check timer schedule
-systemctl list-timers 'nix-flake-updater-*'
-```
-
-## Configuration Options
-
-### Core
-- `enable` - Enable this instance
-- `repoPath` - Git repository path
-- `buildSystems` - List of NixOS systems to build for verification
-- `schedule` - Systemd OnCalendar schedule
-- `flakeInputs` - Specific inputs to update (empty = all)
-- `user` - User to run as (needs git push access)
-
-### Git
-- `gitRemote` - Remote to push to (default: `origin`)
-- `mainBranch` - Main branch name (default: `main`)
-- `branchPrefix` - Prefix for update branches
-- `autoMerge` - Auto-merge to main on success (default: `false`)
-
-### Notifications
-- `ntfyServer` / `ntfyTopic` - ntfy notification settings
-- `ntfyTokenFile` - Authentication token file
-- `inboxOrg` - Org-mode inbox for TODO entries on failure
-
-### Auto-Fix
-- `autoFix.enable` - Enable AI-powered auto-fix
-- `autoFix.command` - Agent command (module default: `pi`; the standalone script falls back to `pir`)
-- `autoFix.extraArgs` - Extra agent CLI arguments
-- `autoFix.maxAttempts` - Max retries per host (default: `3`)
-- `autoFix.envFile` - Source file for API credentials
-
-### Other
-- `dryRun` - Don't push to remote
-- `randomizedDelaySec` - Random delay before start
-
-## Architecture
-
-The update script:
-1. Creates an isolated git worktree from main
-2. Updates flake.lock (all or specific inputs)
-3. Builds all specified systems
-4. On failure with auto-fix: invokes coding agent → rebuilds → regression check
-5. Commits flake.lock update + any fixes (separate commits)
-6. Pushes branch (or auto-merges to main)
-7. Sends ntfy notification with results
-8. Cleans up worktree
-
-## Documentation
-
-See:
-- `/docs/nix-flake-updater-guide.md` - Complete implementation guide
pkgs/default.nix
@@ -33,7 +33,6 @@ in
jellyfin-manage-playlist = pkgs.callPackage ../tools/jellyfin-manage-playlist { };
music-playlist-dl = pkgs.callPackage ../tools/music-playlist-dl { };
readwise-reader = pkgs.callPackage ../tools/readwise-reader { };
- nix-flake-update = pkgs.callPackage ../tools/nix-flake-update { pi = pkgs.llm-agents.pi; };
slack-archive = pkgs.callPackage ../tools/slack-archive { };
gcal-to-org = pkgs.callPackage ../tools/gcal-to-org { };
daily-plan = pkgs.callPackage ../tools/daily-plan { };
systems/foo/boot.nix
@@ -1,41 +0,0 @@
-# Boot and kernel settings for this host: module blacklist, modules to load,
-# kernel command-line parameters and the kernel package set.
-{ pkgs, ... }:
-{
- boot = {
- # extraModprobeConfig = ''
- # options snd_hda_intel power_save=1
- # '';
-
- # Modem-related modules that should never be loaded on this machine.
- blacklistedKernelModules = [
- "sierra_net" # sierra wireless modules
- "cdc_mbim" # modem mobile broadband modules
- "cdc_ncm" # similar
- ];
- kernelModules = [
- "ahci" # sata controller, might not be needed
- "nvme" # required for nvme disks
- "thunderbolt" # required for thunderbolt (dock, …)
- # from thinkpad x1 gen 9
- "dm-mod"
- "cryptd" # required for encryption
- "xhci_pci" # usb controller related
- "usb_storage" # usb storage related
- "sd_mod" # block device related
- "sdhci_pci" # block device related as well
- "aesni-intel" # advanced encryption for intel
- "kvm_intel"
- ];
-
- kernelParams = [
- # Kernel GPU Savings Options (NOTE i915 chipset only)
- # "i915.enable_rc6=1"
- # "i915.enable_fbc=1"
- # "i915.lvds_use_ssc=0"
- # "drm.debug=0"
- # "drm.vblankoffdelay=1"
- "kvm_intel.nested=1"
- "intel_iommu=on"
- ];
-
- # Track the newest kernel package set available in pkgs.
- kernelPackages = pkgs.linuxPackages_latest;
- };
-}
systems/foo/disks.nix
@@ -1,55 +0,0 @@
-_: {
- disko.devices = {
- disk = {
- # 512GB root/boot drive. Configured with:
- # - A FAT32 ESP partition for systemd-boot
- # - A LUKS container which contains an EXT4 filesystem
- root = {
- type = "disk";
- device = "/dev/nvme0n1";
- content = {
- type = "gpt";
- partitions = {
- ESP = {
- size = "1G";
- type = "EF00";
- content = {
- type = "filesystem";
- format = "vfat";
- mountpoint = "/boot";
- mountOptions = [ "umask=0077" ];
- };
- };
- root = {
- size = "100%";
- content = {
- # LUKS passphrase will be prompted interactively only
- type = "luks";
- name = "cryptroot";
- askPassword = true;
- settings = {
- # Make sure there is no trailing newline in keyfile if used for interactive unlock.
- # Use `echo -n "password" > /tmp/data.keyfile`
- # keyFile = "/dev/disk/by-id/usb-_USB_DISK_2.0_070D375D84327E87-0:0";
- # keyFileOffset = 30992883712;
- # keyFileSize = 4096;
- allowDiscards = true;
- };
- content = {
- type = "filesystem";
- format = "ext4";
- mountpoint = "/";
- mountOptions = [
- "noatime"
- "nodiratime"
- "discard"
- ];
- };
- };
- };
- };
- };
- };
- };
- };
-}
systems/foo/hardware.nix
@@ -1,10 +0,0 @@
-# Hardware configuration for this host: pulls in the shared acpid profile.
-{ ... }:
-{
- imports = [
- ../common/hardware/acpid.nix
- ];
-
- # Intentionally empty; the only entry is a commented-out GPU package list.
- hardware = {
- # opengl.extraPackages = with pkgs; [ vaapiIntel libvdpau-va-gl vaapiVdpau intel-ocl intel-media-driver ];
- };
-}
systems/okinawa/extra.nix
@@ -385,39 +385,6 @@
# System packages for LLM, gaming, and tools
environment.systemPackages = with pkgs; [
- # nix-flake-update wrapper with pre-baked config for the home repo
- # Run manually: nix-flake-update-home [--dry-run] [--no-auto-fix]
- (pkgs.writeShellScriptBin "nix-flake-update-home" ''
- export REPO_PATH="/home/vincent/src/home"
- export FLAKE_PATH="/home/vincent/src/home"
- export GIT_REMOTE="origin"
- export MAIN_BRANCH="main"
- export BRANCH_PREFIX="flake-update-"
- export NTFY_TOPIC="nix-updates"
- export NTFY_SERVER="https://ntfy.sbr.pm"
- export NTFY_TOKEN_FILE="/run/agenix/ntfy-token"
- export INBOX_ORG="/home/vincent/desktop/org/inbox.org"
- export BUILD_SYSTEMS="okinawa kyushu sakhalin carthage rhea aion athena demeter aix"
- export GIT_SSH_COMMAND="ssh -F /dev/null -o IdentitiesOnly=yes -i /home/vincent/.ssh/id_passage -o StrictHostKeyChecking=yes -o UserKnownHostsFile=/home/vincent/.ssh/known_hosts"
-
- # AI-powered auto-fix
- export AUTO_FIX="true"
- export AUTO_FIX_COMMAND="pi"
- export AUTO_FIX_EXTRA_ARGS="--provider google-vertex-claude --no-session --no-themes --no-skills"
- export AUTO_FIX_MAX_ATTEMPTS="3"
- export GOOGLE_CLOUD_PROJECT="itpc-gcp-pnd-pe-eng-claude"
- export GOOGLE_CLOUD_LOCATION="global"
-
- # Allow overriding via CLI args
- for arg in "$@"; do
- case "$arg" in
- --dry-run) export DRY_RUN="true" ;;
- --no-auto-fix) export AUTO_FIX="false" ;;
- esac
- done
-
- exec ${pkgs.nix-flake-update}/bin/nix-flake-update
- '')
# LLM tools (same package as the service, for CLI use)
(llama-cpp.override {
vulkanSupport = true;
systems/new.nix
@@ -1,93 +0,0 @@
-{
- config,
- hostname,
- inputs,
- lib,
- stateVersion,
- ...
-}:
-{
- imports = [
- (./. + "/${hostname}/boot.nix")
- (./. + "/${hostname}/hardware.nix")
- ]
- ++ lib.optional (builtins.pathExists (./. + "/${hostname}/extra.nix")) ./${hostname}/extra.nix;
-
- nixpkgs.config.allowUnfree = true;
-
- nix = {
-
- # This will add each flake input as a registry
- # To make nix3 commands consistent with your flake
- registry = lib.mkForce (lib.mapAttrs (_: value: { flake = value; }) inputs);
-
- # This will additionally add your inputs to the system's legacy channels
- # Making legacy nix commands consistent as well, awesome!
- nixPath = lib.mkForce (
- lib.mapAttrsToList (key: value: "${key}=${value.to.path}") config.nix.registry
- );
-
- optimise = {
- automatic = true;
- dates = [
- "01:10"
- "12:10"
- ];
- };
-
- settings = {
- auto-optimise-store = true;
- experimental-features = [
- "nix-command"
- "flakes"
- ];
- sandbox = true;
- allowed-users = [
- "@wheel"
- ];
- trusted-users = [
- "root"
- "@wheel"
- ];
- # See https://nixos.org/manual/nix/stable/command-ref/conf-file#conf-use-xdg-base-directories
- use-xdg-base-directories = true;
-
- # Add some "caches" (substituters)
- substituters = [
- "https://cache.nixos.org/"
- "https://r-ryantm.cachix.org"
- "https://shortbrain.cachix.org"
- "https://vdemeester.cachix.org"
- "https://chapeau-rouge.cachix.org"
- "https://nixos-raspberrypi.cachix.org"
- ];
- trusted-public-keys = [
- "r-ryantm.cachix.org-1:gkUbLkouDAyvBdpBX0JOdIiD2/DP1ldF3Z3Y6Gqcc4c="
- "shortbrain.cachix.org-1:dqXcXzM0yXs3eo9ChmMfmob93eemwNyhTx7wCR4IjeQ="
- "chapeau-rouge.cachix.org-1:r34IG766Ez4Eeanr7Zx+egzXLE2Zgvc+XRspYZPDAn8="
- "vdemeester.cachix.org-1:eZWNOrLR9A9szeMahn9ENaoT9DB3WgOos8va+d2CU44="
- "nixos-raspberrypi.cachix.org-1:4iMO9LXa8BqhU+Rpg6LQKiGa2lsNh/j2oiYLNOQ5sPI="
- ];
- };
-
- extraOptions = ''
- connect-timeout = 20
- build-cores = 0
- keep-outputs = true
- keep-derivations = true
- builders-use-substitutes = true
- '';
-
- # On laptops at least, make the daemon and builders low priority
- # to have a responding system while building
- daemonIOSchedClass = "idle";
- daemonCPUSchedPolicy = "idle";
- };
-
- # `nix-daemon` will hit the stack limit when using `nixFlakes`.
- systemd.services.nix-daemon.serviceConfig."LimitSTACK" = "infinity";
-
- system = {
- inherit stateVersion;
- };
-}
tools/nix-flake-update/default.nix
@@ -1,48 +0,0 @@
-# Package the nix-flake-update.sh script as an executable whose PATH is
-# prefixed with every tool it shells out to.
-{
-  lib,
-  stdenv,
-  makeWrapper,
-  git,
-  nix,
-  jq,
-  curl,
-  openssh,
-  pi,
-}:
-
-let
-  # Runtime tools placed on PATH by the wrapper.
-  runtimeTools = [
-    git
-    nix
-    jq
-    curl
-    openssh
-    pi
-  ];
-in
-stdenv.mkDerivation {
-  pname = "nix-flake-update";
-  version = "0.2.0";
-
-  src = ./.;
-
-  nativeBuildInputs = [ makeWrapper ];
-
-  installPhase = ''
-    runHook preInstall
-
-    install -Dm755 nix-flake-update.sh $out/bin/nix-flake-update
-
-    wrapProgram $out/bin/nix-flake-update \
-      --prefix PATH : ${lib.makeBinPath runtimeTools}
-
-    runHook postInstall
-  '';
-
-  meta = {
-    description = "Automated NixOS flake.lock updater with build verification";
-    license = lib.licenses.mit;
-    platforms = lib.platforms.linux;
-  };
-}
tools/nix-flake-update/nix-flake-update.sh
@@ -1,607 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Automated NixOS flake.lock updater with optional AI-powered auto-fix
-# This script updates flake.lock, builds verification systems, optionally
-# uses a coding agent to fix build failures, and pushes to remote.
-
-# Configuration from environment or defaults
-# Each setting keeps the caller's value when the variable is set and
-# non-empty, and otherwise falls back to the default after ":-".
-REPO_PATH="${REPO_PATH:-/home/vincent/src/home}"
-FLAKE_PATH="${FLAKE_PATH:-$REPO_PATH}"
-GIT_REMOTE="${GIT_REMOTE:-origin}"
-MAIN_BRANCH="${MAIN_BRANCH:-main}"
-BRANCH_PREFIX="${BRANCH_PREFIX:-flake-update-}"
-NTFY_TOPIC="${NTFY_TOPIC:-nix-updates}"
-NTFY_SERVER="${NTFY_SERVER:-https://ntfy.sh}"
-NTFY_TOKEN_FILE="${NTFY_TOKEN_FILE:-}"
-BUILD_SYSTEMS="${BUILD_SYSTEMS:-}"
-DRY_RUN="${DRY_RUN:-false}"
-FLAKE_INPUTS="${FLAKE_INPUTS:-}" # Space-separated list of inputs to update (empty = all)
-AUTO_MERGE="${AUTO_MERGE:-false}" # If true, merge to main on success
-INBOX_ORG="${INBOX_ORG:-$HOME/desktop/org/inbox.org}" # Path to org-mode inbox
-
-# Auto-fix configuration
-# AUTO_FIX is treated as enabled when it equals "true" or "1".
-AUTO_FIX="${AUTO_FIX:-false}"
-AUTO_FIX_COMMAND="${AUTO_FIX_COMMAND:-pir}"
-# Word-split when the agent is invoked, so individual arguments cannot
-# contain spaces.
-AUTO_FIX_EXTRA_ARGS="${AUTO_FIX_EXTRA_ARGS:---model claude-opus-4-6 --no-session --no-extensions --no-themes}"
-AUTO_FIX_MAX_ATTEMPTS="${AUTO_FIX_MAX_ATTEMPTS:-3}"
-AUTO_FIX_ENV_FILE="${AUTO_FIX_ENV_FILE:-}"
-
-# One timestamped log file per run; the directory is created if missing.
-LOG_FILE="/var/log/nix-flake-updater/$(date +%Y%m%d-%H%M%S).log"
-mkdir -p "$(dirname "$LOG_FILE")"
-
-# Worktree directory for isolated work (use ~/tmp to avoid tmpfs/RAM)
-WORKTREE_DIR="$HOME/tmp/nix-flake-updater-$(date +%Y%m%d-%H%M%S)"
-mkdir -p "$HOME/tmp"
-
-# Track auto-fix state
-# FIXES_APPLIED counts successful agent fixes; the two arrays collect the
-# hosts that ended up fixed and the hosts that are still failing.
-FIXES_APPLIED=0
-FIXED_HOSTS=()
-UNFIXED_HOSTS=()
-
-log() {
- echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"
-}
-
-notify() {
- local priority="$1"
- local title="$2"
- local message="$3"
- local tags="$4"
-
- if [ -n "$NTFY_TOKEN_FILE" ] && [ -f "$NTFY_TOKEN_FILE" ]; then
- curl -s \
- -H "Authorization: Bearer $(tr -d '\n' < "$NTFY_TOKEN_FILE")" \
- -H "Title: $title" \
- -H "Priority: $priority" \
- -H "Tags: $tags" \
- -d "$message" \
- "$NTFY_SERVER/$NTFY_TOPIC" || true
- else
- curl -s \
- -H "Title: $title" \
- -H "Priority: $priority" \
- -H "Tags: $tags" \
- -d "$message" \
- "$NTFY_SERVER/$NTFY_TOPIC" || true
- fi
-}
-
-# Append an org-mode TODO entry, scheduled for today, to INBOX_ORG.
-# Arguments: $1 = headline, $2 = free-form details for the entry body.
-# The entry also carries the log file path and the last 30 log lines.
-# Only logs a warning when the inbox file does not exist.
-add_todo_to_inbox() {
-  local title="$1"
-  local details="$2"
-  local recent_log=""
-
-  if [ ! -f "$INBOX_ORG" ]; then
-    log "WARNING: Inbox file not found: $INBOX_ORG"
-    return
-  fi
-
-  log "Adding TODO to $INBOX_ORG"
-
-  if [ -f "$LOG_FILE" ]; then
-    recent_log=$(tail -30 "$LOG_FILE")
-  fi
-
-  cat >> "$INBOX_ORG" <<EOF
-* TODO $title
- SCHEDULED: <$(date '+%Y-%m-%d %a')>
- :PROPERTIES:
- :CREATED: [$(date '+%Y-%m-%d %a %H:%M')]
- :END:
-
-$details
-
-Log file: $LOG_FILE
-
-#+begin_src text
-$recent_log
-#+end_src
-EOF
-}
-
-# EXIT trap handler: removes the temporary worktree and the local update
-# branch, and on a non-zero exit records the failure (org TODO + ntfy).
-# The exit status of the script is captured first, before any other command
-# can overwrite $?.
-cleanup() {
- local exit_code=$?
-
- # Clean up worktree if it exists
- if [ -d "$WORKTREE_DIR" ]; then
- log "Cleaning up worktree: $WORKTREE_DIR"
- cd "$REPO_PATH"
- git worktree remove --force "$WORKTREE_DIR" 2>&1 | tee -a "$LOG_FILE" || true
- # Fallback in case git left the directory behind; guarded against an empty path.
- [[ -n "$WORKTREE_DIR" ]] && rm -rf "$WORKTREE_DIR" || true
- fi
-
- # Clean up the update branch (must happen after worktree removal)
- # BRANCH_NAME may still be unset if the script failed early, hence ${...:-}.
- if [ -n "${BRANCH_NAME:-}" ] && git show-ref --verify --quiet "refs/heads/$BRANCH_NAME" 2>/dev/null; then
- log "Cleaning up branch: $BRANCH_NAME"
- cd "$REPO_PATH"
- git branch -D "$BRANCH_NAME" 2>&1 | tee -a "$LOG_FILE" || true
- fi
-
- # Failure reporting: runs for any non-zero exit, not only build failures.
- if [ $exit_code -ne 0 ]; then
- log "ERROR: Update process failed with exit code $exit_code"
-
- local input_desc="all inputs"
- if [ -n "$FLAKE_INPUTS" ]; then
- input_desc="inputs: $FLAKE_INPUTS"
- fi
-
- add_todo_to_inbox "Fix flake update failure" \
- "Flake update failed for $input_desc.
-Build systems: $BUILD_SYSTEMS
-Auto-merge: $AUTO_MERGE"
-
- notify "high" "โ Flake Update Failed" \
- "Build failed for $input_desc. TODO added to inbox. See logs: $LOG_FILE" \
- "warning,flake"
- fi
-}
-
-trap cleanup EXIT
-
-# Build a single host, capturing stderr.
-#
-# Arguments: $1 = name of the nixosConfigurations entry to build.
-# Returns:   0 on success, 1 on failure.
-# Sets:      BUILD_ERROR to the last 200 lines of stderr on failure, and to
-#            the empty string on success so an error from an earlier build
-#            cannot leak into a later auto-fix prompt.
-build_host() {
-  local system="$1"
-  local error_file
-  local rc=0
-  error_file=$(mktemp)
-
-  log "Building system: $system"
-  # stderr is shown live and duplicated into both the capture file and the log.
-  nix build ".#nixosConfigurations.$system.config.system.build.toplevel" \
-    --no-link \
-    --print-build-logs 2> >(tee "$error_file" | tee -a "$LOG_FILE" >&2) || rc=$?
-
-  # The process substitution runs asynchronously and may still be flushing
-  # when nix exits. Wait for it so the final (most relevant) error lines are
-  # in the capture file before it is read.
-  wait
-
-  if [ "$rc" -eq 0 ]; then
-    log "โ $system built successfully"
-    BUILD_ERROR=""
-    rm -f "$error_file"
-    return 0
-  fi
-
-  log "โ $system build failed"
-  BUILD_ERROR=$(tail -200 "$error_file")
-  rm -f "$error_file"
-  return 1
-}
-
-# Build all systems, returning list of failures.
-# Sets FAILED_SYSTEMS (space-separated) and PASSED_SYSTEMS.
-build_all_systems() {
- FAILED_SYSTEMS=""
- PASSED_SYSTEMS=""
-
- for system in $BUILD_SYSTEMS; do
- if build_host "$system"; then
- PASSED_SYSTEMS="$PASSED_SYSTEMS $system"
- else
- FAILED_SYSTEMS="$FAILED_SYSTEMS $system"
- fi
- done
-
- # Trim leading spaces
- FAILED_SYSTEMS="${FAILED_SYSTEMS# }"
- PASSED_SYSTEMS="${PASSED_SYSTEMS# }"
-}
-
-# Try to auto-fix a build failure using a coding agent.
-# Arguments: $1 = system name, $2 = build error text
-# Returns 0 if the agent ran (check rebuild to see if it actually fixed it).
-try_auto_fix() {
- local system="$1"
- local error_text="$2"
-
- if [ "$AUTO_FIX" != "true" ] && [ "$AUTO_FIX" != "1" ]; then
- return 1
- fi
-
- # Source env file if provided (for API keys, credentials)
- if [ -n "$AUTO_FIX_ENV_FILE" ] && [ -f "$AUTO_FIX_ENV_FILE" ]; then
- log "Sourcing auto-fix environment: $AUTO_FIX_ENV_FILE"
- # shellcheck disable=SC1090
- source "$AUTO_FIX_ENV_FILE"
- fi
-
- log "Running auto-fix agent for $system (command: $AUTO_FIX_COMMAND)"
-
- # Build the prompt
- local prompt
- prompt="The NixOS build for host '${system}' failed after a flake.lock update.
-
-Working directory: $(pwd)
-
-Build command that failed:
- nix build .#nixosConfigurations.${system}.config.system.build.toplevel
-
-Build error output (last 200 lines):
-\`\`\`
-${error_text}
-\`\`\`
-
-Fix the Nix configuration files to resolve this build error.
-
-Rules:
-- Do NOT modify flake.lock or flake.nix
-- Only edit .nix configuration files
-- Read the AGENTS.md files in the relevant directories for channel-awareness rules
-- If the fix is in a shared module (systems/common/ or home/common/), ensure it works across both nixpkgs channels
-- Prefer host-specific overrides (systems/${system}/extra.nix) over modifying shared code when possible
-- After making changes, verify with: nix build .#nixosConfigurations.${system}.config.system.build.toplevel --no-link"
-
- # Run the agent
- # shellcheck disable=SC2086
- if $AUTO_FIX_COMMAND -p $AUTO_FIX_EXTRA_ARGS "$prompt" 2>&1 | tee -a "$LOG_FILE"; then
- log "Auto-fix agent completed for $system"
- return 0
- else
- log "Auto-fix agent failed/crashed for $system"
- return 1
- fi
-}
-
-# Run the auto-fix loop for all failed systems.
-# Attempts to fix each failing host, with retries and regression checking.
-#
-# Arguments: $1 = space-separated list of hosts whose build failed.
-# Reads:     BUILD_SYSTEMS, AUTO_FIX_MAX_ATTEMPTS, and BUILD_ERROR (as set by
-#            the most recent failing build_host call).
-# Updates:   FIXES_APPLIED, FIXED_HOSTS, UNFIXED_HOSTS.
-#
-# Phases:
-#   1. Up to AUTO_FIX_MAX_ATTEMPTS rounds; each round walks the hosts that are
-#      still failing, rebuilds, and invokes the agent when the build fails.
-#   2. Hosts still failing after the last round are recorded as unfixed.
-#   3. If at least one agent fix was applied, every host in BUILD_SYSTEMS is
-#      rebuilt; hosts that fail and are not already known as unfixed count as
-#      regressions and get one more agent attempt each.
-auto_fix_loop() {
- local remaining_failures="$1"
- local all_systems_list="$BUILD_SYSTEMS"
- local round=0
-
- while [ -n "$remaining_failures" ] && [ $round -lt "$AUTO_FIX_MAX_ATTEMPTS" ]; do
- round=$((round + 1))
- log "=== Auto-fix round $round/$AUTO_FIX_MAX_ATTEMPTS ==="
-
- local still_failing=""
-
- for system in $remaining_failures; do
- # Try to build first (a previous fix might have resolved this too)
- if build_host "$system"; then
- log "โ $system now builds (fixed by previous change)"
- FIXED_HOSTS+=("$system")
- continue
- fi
-
- # Run the agent
- if try_auto_fix "$system" "$BUILD_ERROR"; then
- # Check if the fix worked
- if build_host "$system"; then
- log "โ $system fixed by auto-fix agent"
- FIXES_APPLIED=$((FIXES_APPLIED + 1))
- FIXED_HOSTS+=("$system")
- else
- log "โ $system still failing after auto-fix attempt"
- still_failing="$still_failing $system"
- fi
- else
- log "โ Auto-fix agent failed to run for $system"
- still_failing="$still_failing $system"
- fi
- done
-
- # Drop the leading separator left by the string concatenation above.
- remaining_failures="${still_failing# }"
-
- if [ -z "$remaining_failures" ]; then
- log "All failures resolved after $round round(s)"
- break
- fi
- done
-
- # Record unfixed hosts
- for system in $remaining_failures; do
- UNFIXED_HOSTS+=("$system")
- done
-
- # Regression check: rebuild all previously-passing systems
- # Skipped entirely when the agent never produced a working fix.
- if [ $FIXES_APPLIED -gt 0 ]; then
- log "=== Regression check: rebuilding all systems ==="
- local regression_failures=""
-
- for system in $all_systems_list; do
- if ! build_host "$system"; then
- # Check if this was already a known failure
- # ("${ARRAY[@]:-}" keeps the expansion safe under `set -u` when empty.)
- local was_fixed=false
- for fixed in "${FIXED_HOSTS[@]:-}"; do
- if [ "$system" = "$fixed" ]; then
- was_fixed=true
- break
- fi
- done
-
- if [ "$was_fixed" = true ]; then
- log "โ Regression: $system was fixed but now fails again"
- regression_failures="$regression_failures $system"
- else
- # Check if this is a new regression (was passing before)
- local is_new_regression=true
- for unfixed in "${UNFIXED_HOSTS[@]:-}"; do
- if [ "$system" = "$unfixed" ]; then
- is_new_regression=false
- break
- fi
- done
-
- if [ "$is_new_regression" = true ]; then
- log "โ Regression: $system was passing but now fails after auto-fix changes"
- regression_failures="$regression_failures $system"
- fi
- fi
- fi
- done
-
- regression_failures="${regression_failures# }"
-
- if [ -n "$regression_failures" ]; then
- log "Regressions detected: $regression_failures"
- log "Attempting to fix regressions..."
-
- # One more attempt with regression context
- for system in $regression_failures; do
- # Rebuild to refresh BUILD_ERROR for this specific host.
- if ! build_host "$system"; then
- local regression_prompt_extra="IMPORTANT: This is a REGRESSION. Host '$system' was building successfully before auto-fix changes were made to fix other hosts. Your previous fixes broke this host. Fix it WITHOUT breaking the other hosts."
-
- if try_auto_fix "$system" "$BUILD_ERROR
-$regression_prompt_extra"; then
- if build_host "$system"; then
- log "โ Regression fixed for $system"
- FIXES_APPLIED=$((FIXES_APPLIED + 1))
- else
- log "โ Could not fix regression for $system"
- UNFIXED_HOSTS+=("$system")
- fi
- else
- UNFIXED_HOSTS+=("$system")
- fi
- fi
- done
- fi
- fi
-}
-
-# =============================================================================
-# Main script
-# =============================================================================
-
-log "Starting flake update process"
-cd "$REPO_PATH"
-
-# Fetch latest changes
-log "Fetching latest changes from $GIT_REMOTE"
-git fetch "$GIT_REMOTE"
-
-# Create update branch name: date-stamped, with a full timestamp as fallback
-# when a branch for today already exists locally.
-BRANCH_NAME="$BRANCH_PREFIX$(date +%Y%m%d)"
-if git show-ref --verify --quiet "refs/heads/$BRANCH_NAME"; then
-  log "Branch $BRANCH_NAME already exists, using unique name"
-  BRANCH_NAME="$BRANCH_PREFIX$(date +%Y%m%d-%H%M%S)"
-fi
-
-# Create worktree from the configured main branch (skip LFS to avoid hook
-# failures). MAIN_BRANCH is honoured here so repositories whose main branch
-# is not literally called "main" start from the right commit.
-log "Creating worktree at $WORKTREE_DIR from $GIT_REMOTE/$MAIN_BRANCH"
-GIT_LFS_SKIP_SMUDGE=1 git worktree add "$WORKTREE_DIR" "$GIT_REMOTE/$MAIN_BRANCH"
-
-# Switch to worktree
-cd "$WORKTREE_DIR"
-log "Working in isolated worktree: $WORKTREE_DIR"
-
-# Create update branch in the worktree
-log "Creating update branch: $BRANCH_NAME"
-git checkout -b "$BRANCH_NAME"
-
-# Save old flake.lock for before/after comparison
-OLD_FLAKE_LOCK=$(cat flake.lock)
-
-# Update flake.lock: either the listed inputs one by one, or everything.
-log "Updating flake.lock"
-if [ -n "$FLAKE_INPUTS" ]; then
-  log "Updating specific inputs: $FLAKE_INPUTS"
-  for input in $FLAKE_INPUTS; do
-    log "Updating input: $input"
-    nix flake lock --update-input "$input" 2>&1 | tee -a "$LOG_FILE"
-  done
-else
-  log "Updating all inputs"
-  nix flake update 2>&1 | tee -a "$LOG_FILE"
-fi
-
-# Check if there are changes
-if ! git diff --quiet flake.lock; then
- log "Changes detected in flake.lock"
-
- # Show what changed
- log "Flake input changes:"
- git diff flake.lock | grep -E '^\+.*"(narHash|rev)"' | head -20 | tee -a "$LOG_FILE"
-
- # Build all systems
- build_all_systems
-
- if [ -n "$FAILED_SYSTEMS" ]; then
- log "Build failures detected: $FAILED_SYSTEMS"
-
- if [ "$AUTO_FIX" = "true" ] || [ "$AUTO_FIX" = "1" ]; then
- # Commit flake.lock first so the agent works on a clean tree
- git add flake.lock
-
- input_desc="all inputs"
- if [ -n "$FLAKE_INPUTS" ]; then
- input_desc="$FLAKE_INPUTS"
- fi
-
- CHANGES=$(jq -n --argjson old "$OLD_FLAKE_LOCK" --argjson new "$(cat flake.lock)" -r '
- def rev_map:
- .nodes | to_entries
- | map(select(.key != "root" and .value.locked != null))
- | map({(.key): (.value.locked.rev // .value.locked.narHash // "unknown")})
- | add // {};
- ($old | rev_map) as $o |
- ($new | rev_map) as $n |
- [$n | to_entries[] | select($o[.key] != null and $o[.key] != .value)] |
- group_by({old: $o[.key], new: .value}) |
- map({
- names: (map(.key) | join(", ")),
- old: $o[.[0].key][0:12],
- new: .[0].value[0:12]
- }) |
- map("- \(.names): \(.old) → \(.new)") |
- join("\n")
- ' 2>/dev/null || echo "Updated flake inputs")
-
- git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "chore(flake): update $input_desc
-
-$CHANGES"
-
- # Run auto-fix loop
- auto_fix_loop "$FAILED_SYSTEMS"
-
- # Commit any fixes the agent made
- if [ $FIXES_APPLIED -gt 0 ]; then
- git add -A
- if ! git diff --cached --quiet; then
- local_fixed_list="${FIXED_HOSTS[*]:-}"
- git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "fix(nix): auto-fix build errors
-
-Fixed hosts: ${local_fixed_list}
-Agent: ${AUTO_FIX_COMMAND}
-Attempts used: ${FIXES_APPLIED}"
- fi
- fi
-
- # Determine overall result
- if [ ${#UNFIXED_HOSTS[@]} -gt 0 ]; then
- log "Auto-fix partially succeeded. Unfixed: ${UNFIXED_HOSTS[*]}"
-
- add_todo_to_inbox "Flake update: ${#UNFIXED_HOSTS[@]} hosts still failing" \
- "Auto-fix resolved ${#FIXED_HOSTS[@]} host(s) but could not fix: ${UNFIXED_HOSTS[*]}
-Build systems: $BUILD_SYSTEMS
-Agent: $AUTO_FIX_COMMAND"
-
- # Still push the branch with partial fixes
- if [ "$DRY_RUN" != "false" ] && [ "$DRY_RUN" != "" ] && [ "$DRY_RUN" != "0" ]; then
- log "DRY RUN: Would push partial-fix branch"
- else
- git push "$GIT_REMOTE" "$BRANCH_NAME"
- fi
-
- notify "high" "⚠️ Flake Updated (${#UNFIXED_HOSTS[@]} hosts still failing)" \
- "Auto-fixed: ${FIXED_HOSTS[*]:-none}. Still failing: ${UNFIXED_HOSTS[*]}. Branch: $BRANCH_NAME" \
- "warning,flake,robot"
-
- exit 1
- else
- log "All failures resolved by auto-fix"
- # Fall through to normal push/merge logic below
- fi
- else
- # No auto-fix — original behavior
- log "Build failed, not committing changes"
-
- input_desc="all inputs"
- if [ -n "$FLAKE_INPUTS" ]; then
- input_desc="$FLAKE_INPUTS"
- fi
-
- add_todo_to_inbox "Flake update build failure" \
- "Build failed after updating $input_desc.
-Build systems tested: $BUILD_SYSTEMS
-Auto-merge: $AUTO_MERGE"
-
- notify "high" "❌ Flake Update Build Failed" \
- "Builds failed for updated $input_desc. TODO added to inbox. Check logs: $LOG_FILE" \
- "x,flake,warning"
-
- exit 1
- fi
- fi
-
- # If we get here, all builds passed (either initially or after auto-fix)
-
- # Commit flake.lock if not already committed (no auto-fix path)
- if ! git diff --quiet flake.lock || ! git diff --cached --quiet flake.lock; then
- git add flake.lock
-
- input_desc="all inputs"
- if [ -n "$FLAKE_INPUTS" ]; then
- input_desc="$FLAKE_INPUTS"
- fi
-
- CHANGES=$(jq -n --argjson old "$OLD_FLAKE_LOCK" --argjson new "$(cat flake.lock)" -r '
- def rev_map:
- .nodes | to_entries
- | map(select(.key != "root" and .value.locked != null))
- | map({(.key): (.value.locked.rev // .value.locked.narHash // "unknown")})
- | add // {};
- ($old | rev_map) as $o |
- ($new | rev_map) as $n |
- [$n | to_entries[] | select($o[.key] != null and $o[.key] != .value)] |
- group_by({old: $o[.key], new: .value}) |
- map({
- names: (map(.key) | join(", ")),
- old: $o[.[0].key][0:12],
- new: .[0].value[0:12]
- }) |
- map("- \(.names): \(.old) → \(.new)") |
- join("\n")
- ' 2>/dev/null || echo "Updated flake inputs")
-
- COMMIT_MSG="chore(flake): update $input_desc
-
-$CHANGES
-
-Built systems: $BUILD_SYSTEMS"
-
- git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "$COMMIT_MSG"
- fi
-
- # Determine notification details
- fix_note=""
- if [ $FIXES_APPLIED -gt 0 ]; then
- fix_note=" ($FIXES_APPLIED auto-fix(es) applied: ${FIXED_HOSTS[*]})"
- fi
-
- if [ "$DRY_RUN" != "false" ] && [ "$DRY_RUN" != "" ] && [ "$DRY_RUN" != "0" ]; then
- log "DRY RUN: Would push to $GIT_REMOTE/$BRANCH_NAME"
- notify "low" "🧪 Flake Update (Dry Run)" \
- "Branch $BRANCH_NAME created locally. All builds passed: $BUILD_SYSTEMS${fix_note}" \
- "test_tube,flake"
- elif [ "$AUTO_MERGE" = "true" ] || [ "$AUTO_MERGE" = "1" ]; then
- # Auto-merge: rebase onto main and push directly
- log "Auto-merge enabled: rebasing onto $GIT_REMOTE/$MAIN_BRANCH"
-
- git fetch "$GIT_REMOTE" "$MAIN_BRANCH"
-
- if git rebase "$GIT_REMOTE/$MAIN_BRANCH"; then
- log "Rebase successful, pushing to $GIT_REMOTE/$MAIN_BRANCH"
- git push "$GIT_REMOTE" "HEAD:$MAIN_BRANCH"
-
- notify "default" "✅ Flake Auto-Updated & Merged" \
- "Updates merged to $MAIN_BRANCH. All builds passed: $BUILD_SYSTEMS${fix_note}" \
- "white_check_mark,flake,merged"
-
- log "SUCCESS: Flake updated and merged to $MAIN_BRANCH"
- else
- log "ERROR: Rebase failed, main branch may have moved"
- git rebase --abort || true
-
- add_todo_to_inbox "Flake update rebase conflict" \
- "Auto-merge failed due to rebase conflict.
-Inputs: $input_desc
-Branch: $BRANCH_NAME (in worktree, needs manual rebase)"
-
- notify "high" "⚠️ Flake Update Rebase Failed" \
- "Could not rebase onto $MAIN_BRANCH. TODO added to inbox." \
- "warning,flake,conflict"
- exit 1
- fi
- else
- # Branch mode: push to feature branch
- log "Pushing to $GIT_REMOTE/$BRANCH_NAME"
- git push "$GIT_REMOTE" "$BRANCH_NAME"
-
- notify "default" "✅ Flake Updated Successfully" \
- "Branch $BRANCH_NAME pushed. All builds passed: $BUILD_SYSTEMS${fix_note}" \
- "white_check_mark,flake"
-
- log "SUCCESS: Flake updated and pushed to $BRANCH_NAME"
- fi
-
-else
- log "No changes in flake.lock, nothing to do"
- notify "low" "ℹ️ No Flake Updates" \
- "flake.lock is already up to date" \
- "information_source,flake"
-fi
-
-log "Flake update process complete"
tools/ollama-exporter/Dockerfile
@@ -1,20 +0,0 @@
-# Use a lightweight Python base image
-FROM python:3.11-slim
-
-# Set working directory
-WORKDIR /app
-
-# Copy required files
-COPY ollama_exporter.py .
-
-# Install dependencies
-RUN pip install fastapi uvicorn prometheus_client httpx
-
-# Expose the metrics port
-EXPOSE 8000
-
-# Define runtime environment variable for Ollama host (can be overridden)
-ENV OLLAMA_HOST="http://localhost:11434"
-
-# Start the FastAPI app
-CMD ["uvicorn", "ollama_exporter:app", "--host", "0.0.0.0", "--port", "8000"]
tools/ollama-exporter/grafana-dashboard-custom.json
@@ -1,882 +0,0 @@
-{
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": {
- "type": "grafana",
- "uid": "-- Grafana --"
- },
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "fiscalYearStartMonth": 0,
- "graphTooltip": 0,
- "id": 4,
- "links": [],
- "liveNow": false,
- "panels": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 10,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "vis": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "reqps"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 0
- },
- "id": 1,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "rate(ollama_requests_total[$__rate_interval])",
- "instant": false,
- "legendFormat": "{{model}}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "Request Rate by Model",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "thresholds"
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "short"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 0
- },
- "id": 2,
- "options": {
- "colorMode": "value",
- "graphMode": "area",
- "justifyMode": "auto",
- "orientation": "auto",
- "reduceOptions": {
- "calcs": [
- "lastNotNull"
- ],
- "fields": "",
- "values": false
- },
- "textMode": "auto",
- "wideLayout": true
- },
- "pluginVersion": "10.2.6",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "ollama_requests_total",
- "instant": false,
- "legendFormat": "{{model}}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "Total Requests by Model",
- "type": "stat"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 10,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "vis": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "s"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 8
- },
- "id": 3,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "rate(ollama_response_seconds_sum[$__rate_interval]) / rate(ollama_response_seconds_count[$__rate_interval])",
- "instant": false,
- "legendFormat": "Average - {{model}}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "Average Response Time",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 10,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "vis": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "s"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 8
- },
- "id": 4,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "rate(ollama_load_duration_seconds_sum[$__rate_interval]) / rate(ollama_load_duration_seconds_count[$__rate_interval])",
- "instant": false,
- "legendFormat": "Average Load Time - {{model}}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "Average Model Load Time",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 10,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "vis": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "tps"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 16
- },
- "id": 5,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "rate(ollama_tokens_processed_total[$__rate_interval])",
- "instant": false,
- "legendFormat": "Input Tokens/s - {{model}}",
- "range": true,
- "refId": "A"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "rate(ollama_tokens_generated_total[$__rate_interval])",
- "instant": false,
- "legendFormat": "Output Tokens/s - {{model}}",
- "range": true,
- "refId": "B"
- }
- ],
- "title": "Token Processing Rate",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 10,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "vis": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "tps"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 16
- },
- "id": 6,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "rate(ollama_tokens_per_second_sum[$__rate_interval]) / rate(ollama_tokens_per_second_count[$__rate_interval])",
- "instant": false,
- "legendFormat": "Average - {{model}}",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "Average Generation Speed (Tokens/Second)",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 10,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "vis": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "s"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 24
- },
- "id": 7,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "mode": "single",
- "sort": "none"
- }
- },
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "rate(ollama_prompt_eval_duration_seconds_sum[$__rate_interval]) / rate(ollama_prompt_eval_duration_seconds_count[$__rate_interval])",
- "instant": false,
- "legendFormat": "Avg Prompt Eval - {{model}}",
- "range": true,
- "refId": "A"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "rate(ollama_eval_duration_seconds_sum[$__rate_interval]) / rate(ollama_eval_duration_seconds_count[$__rate_interval])",
- "instant": false,
- "legendFormat": "Avg Response Gen - {{model}}",
- "range": true,
- "refId": "B"
- }
- ],
- "title": "Average Processing Time Breakdown",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "thresholds"
- },
- "custom": {
- "align": "auto",
- "cellOptions": {
- "type": "auto"
- },
- "inspect": false
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- }
- },
- "overrides": [
- {
- "matcher": {
- "id": "byName",
- "options": "Avg Response Time"
- },
- "properties": [
- {
- "id": "unit",
- "value": "s"
- }
- ]
- },
- {
- "matcher": {
- "id": "byName",
- "options": "Avg Tokens/Second"
- },
- "properties": [
- {
- "id": "unit",
- "value": "tps"
- }
- ]
- }
- ]
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 24
- },
- "id": 8,
- "options": {
- "cellHeight": "sm",
- "footer": {
- "countRows": false,
- "fields": "",
- "reducer": [
- "sum"
- ],
- "show": false
- },
- "showHeader": true
- },
- "pluginVersion": "10.2.6",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "sum by (model) (ollama_requests_total)",
- "format": "table",
- "instant": true,
- "legendFormat": "__auto",
- "range": false,
- "refId": "A"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "sum by (model) (ollama_response_seconds_sum) / sum by (model) (ollama_response_seconds_count)",
- "format": "table",
- "instant": true,
- "legendFormat": "__auto",
- "range": false,
- "refId": "B"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "editorMode": "code",
- "expr": "sum by (model) (ollama_tokens_per_second_sum) / sum by (model) (ollama_tokens_per_second_count)",
- "format": "table",
- "instant": true,
- "legendFormat": "__auto",
- "range": false,
- "refId": "C"
- }
- ],
- "title": "Model Performance Summary",
- "transformations": [
- {
- "id": "merge",
- "options": {}
- },
- {
- "id": "organize",
- "options": {
- "excludeByName": {
- "Time": true
- },
- "indexByName": {},
- "renameByName": {
- "Value #A": "Total Requests",
- "Value #B": "Avg Response Time",
- "Value #C": "Avg Tokens/Second",
- "model": "Model"
- }
- }
- }
- ],
- "type": "table"
- }
- ],
- "refresh": "5s",
- "schemaVersion": 39,
- "tags": [
- "ollama",
- "ai",
- "llm"
- ],
- "templating": {
- "list": [
- {
- "current": {
- "selected": false,
- "text": "prometheus",
- "value": "c09d1a89-4ff3-4c52-b8d5-dba793da5d90"
- },
- "hide": 0,
- "includeAll": false,
- "multi": false,
- "name": "DS_PROMETHEUS",
- "options": [],
- "query": "prometheus",
- "refresh": 1,
- "regex": "",
- "skipUrlSync": false,
- "type": "datasource"
- }
- ]
- },
- "time": {
- "from": "now-1h",
- "to": "now"
- },
- "timepicker": {},
- "timezone": "",
- "title": "Ollama Performance Monitoring",
- "uid": "ollama-dashboard",
- "version": 1,
- "weekStart": ""
-}
\ No newline at end of file
tools/ollama-exporter/grafana-dashboard.json
@@ -1,517 +0,0 @@
-{
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": {
- "type": "grafana",
- "uid": "-- Grafana --"
- },
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "editable": true,
- "fiscalYearStartMonth": 0,
- "graphTooltip": 0,
- "id": 15,
- "links": [],
- "panels": [
- {
- "datasource": {
- "type": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- }
- },
- "overrides": []
- },
- "gridPos": {
- "h": 3,
- "w": 6,
- "x": 0,
- "y": 0
- },
- "id": 1,
- "options": {
- "colorMode": "value",
- "graphMode": "area",
- "justifyMode": "auto",
- "orientation": "auto",
- "percentChangeColorMode": "standard",
- "reduceOptions": {
- "calcs": [
- "lastNotNull"
- ],
- "fields": "",
- "values": false
- },
- "showPercentChange": false,
- "textMode": "auto",
- "wideLayout": true
- },
- "pluginVersion": "12.0.1",
- "targets": [
- {
- "datasource": {
- "type": "prometheus"
- },
- "expr": "sum by (model) (ollama_requests_total)",
- "format": "time_series",
- "refId": "A"
- }
- ],
- "title": "Total Requests Per Model",
- "type": "stat"
- },
- {
- "datasource": {
- "type": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "custom": {
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "scaleDistribution": {
- "type": "linear"
- }
- }
- },
- "overrides": []
- },
- "gridPos": {
- "h": 15,
- "w": 18,
- "x": 6,
- "y": 0
- },
- "id": 6,
- "options": {
- "calculate": false,
- "cellGap": 1,
- "color": {
- "exponent": 0.5,
- "fill": "dark-orange",
- "mode": "scheme",
- "reverse": true,
- "scale": "exponential",
- "scheme": "Oranges",
- "steps": 64
- },
- "exemplars": {
- "color": "rgba(255,0,255,0.7)"
- },
- "filterValues": {
- "le": 1e-9
- },
- "legend": {
- "show": true
- },
- "rowsFrame": {
- "layout": "auto"
- },
- "tooltip": {
- "mode": "single",
- "showColorScale": false,
- "yHistogram": false
- },
- "yAxis": {
- "axisPlacement": "left",
- "reverse": false,
- "unit": "Tokens/s"
- }
- },
- "pluginVersion": "12.0.1",
- "targets": [
- {
- "datasource": {
- "type": "prometheus"
- },
- "editorMode": "code",
- "expr": "ollama_tokens_per_second_bucket",
- "format": "heatmap",
- "hide": false,
- "instant": false,
- "legendFormat": "__auto",
- "range": true,
- "refId": "A"
- }
- ],
- "title": "Count of responses by Token/s",
- "type": "heatmap"
- },
- {
- "datasource": {
- "type": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "barWidthFactor": 0.6,
- "drawStyle": "points",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- }
- },
- "overrides": []
- },
- "gridPos": {
- "h": 3,
- "w": 6,
- "x": 0,
- "y": 3
- },
- "id": 2,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "hideZeros": false,
- "mode": "single",
- "sort": "none"
- }
- },
- "pluginVersion": "12.0.1",
- "targets": [
- {
- "datasource": {
- "type": "prometheus"
- },
- "expr": "histogram_quantile(0.95, sum(rate(ollama_response_seconds_bucket[5m])) by (le, model))",
- "format": "time_series",
- "refId": "A"
- }
- ],
- "title": "Response Time (Seconds)",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- }
- },
- "overrides": []
- },
- "gridPos": {
- "h": 3,
- "w": 6,
- "x": 0,
- "y": 6
- },
- "id": 3,
- "options": {
- "colorMode": "value",
- "graphMode": "area",
- "justifyMode": "auto",
- "orientation": "auto",
- "percentChangeColorMode": "standard",
- "reduceOptions": {
- "calcs": [
- "lastNotNull"
- ],
- "fields": "",
- "values": false
- },
- "showPercentChange": false,
- "textMode": "auto",
- "wideLayout": true
- },
- "pluginVersion": "12.0.1",
- "targets": [
- {
- "datasource": {
- "type": "prometheus"
- },
- "expr": "sum by (model) (rate(ollama_tokens_generated_total[5m]))",
- "format": "time_series",
- "refId": "A"
- }
- ],
- "title": "Tokens Generated Per Model",
- "type": "stat"
- },
- {
- "datasource": {
- "type": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "barWidthFactor": 0.6,
- "drawStyle": "points",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- }
- },
- "overrides": []
- },
- "gridPos": {
- "h": 3,
- "w": 6,
- "x": 0,
- "y": 9
- },
- "id": 4,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "hideZeros": false,
- "mode": "single",
- "sort": "none"
- }
- },
- "pluginVersion": "12.0.1",
- "targets": [
- {
- "datasource": {
- "type": "prometheus"
- },
- "expr": "sum by (model) (rate(ollama_eval_total[5m]))",
- "format": "time_series",
- "refId": "A"
- }
- ],
- "title": "Evaluation Steps",
- "type": "timeseries"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "fenqsyb8rfke8c"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisBorderShow": false,
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "barWidthFactor": 0.6,
- "drawStyle": "points",
- "fillOpacity": 0,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "insertNulls": false,
- "lineInterpolation": "linear",
- "lineWidth": 1,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "auto",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green"
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- }
- },
- "overrides": []
- },
- "gridPos": {
- "h": 3,
- "w": 6,
- "x": 0,
- "y": 12
- },
- "id": 5,
- "options": {
- "legend": {
- "calcs": [],
- "displayMode": "list",
- "placement": "bottom",
- "showLegend": true
- },
- "tooltip": {
- "hideZeros": false,
- "mode": "single",
- "sort": "none"
- }
- },
- "pluginVersion": "12.0.1",
- "targets": [
- {
- "datasource": {
- "type": "prometheus"
- },
- "expr": "histogram_quantile(0.95, sum(rate(ollama_load_time_seconds_bucket[5m])) by (le, model))",
- "format": "time_series",
- "refId": "A"
- }
- ],
- "title": "Model Load Time (Seconds)",
- "type": "timeseries"
- }
- ],
- "preload": false,
- "refresh": "5s",
- "schemaVersion": 41,
- "tags": [],
- "templating": {
- "list": []
- },
- "time": {
- "from": "now-30m",
- "to": "now"
- },
- "timepicker": {},
- "timezone": "",
- "title": "Ollama Metrics Dashboard",
- "version": 2
-}
\ No newline at end of file
tools/ollama-exporter/ollama_exporter.py
@@ -1,201 +0,0 @@
-import os
-import asyncio
-import httpx
-import json
-import logging
-from fastapi import FastAPI, Request, Response
-from fastapi.responses import StreamingResponse
-
-import uvicorn
-from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
-
# Base URL of the upstream Ollama server (override with the OLLAMA_HOST env var).
# Request paths are appended verbatim (f"{OLLAMA_HOST}{endpoint}"), so a trailing
# slash is stripped here to avoid producing URLs such as "http://host:11434//api/chat".
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/")

logging.basicConfig()
logger = logging.getLogger(__name__)
# Requested verbosity for this module's logger (override with the LOG_LEVEL env var).
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
# getattr() can also hit non-level attributes of the logging module (for example
# the string constant BASIC_FORMAT); only accept integer levels and fall back to
# INFO for anything else so a typo in LOG_LEVEL cannot crash startup.
_resolved_log_level = getattr(logging, LOG_LEVEL, None)
if not isinstance(_resolved_log_level, int):
    _resolved_log_level = logging.INFO
logger.setLevel(_resolved_log_level)

app = FastAPI()

# --- Prometheus metrics, all labelled by the requested model name -------------

# One increment per request that reaches a chat/generate/completions route.
OLLAMA_CHAT_REQUEST_COUNT = Counter(
    "ollama_requests_total",
    "Total chat requests",
    ["model"],
)

# Durations are reported by Ollama in nanoseconds and observed here in seconds.
# NOTE(review): these histograms use prometheus_client's default buckets —
# confirm they cover the multi-second latencies expected from model inference.
OLLAMA_TOTAL_DURATION = Histogram(
    "ollama_response_seconds",
    "Total time spent for the response",
    ["model"],
)
OLLAMA_LOAD_DURATION = Histogram(
    "ollama_load_duration_seconds",
    "Time spent loading the model",
    ["model"],
)
OLLAMA_PROMPT_EVAL_DURATION = Histogram(
    "ollama_prompt_eval_duration_seconds",
    "Time spent evaluating prompt",
    ["model"],
)
OLLAMA_EVAL_DURATION = Histogram(
    "ollama_eval_duration_seconds",
    "Time spent generating the response",
    ["model"],
)

# Token counters: prompt tokens consumed and response tokens produced.
OLLAMA_PROMPT_EVAL_COUNT = Counter(
    "ollama_tokens_processed_total",
    "Number of tokens in the prompt",
    ["model"],
)
OLLAMA_EVAL_COUNT = Counter(
    "ollama_tokens_generated_total",
    "Number of tokens in the response",
    ["model"],
)

OLLAMA_TOKENS_PER_SECOND = Histogram(
    "ollama_tokens_per_second",
    "Tokens generated per second",
    ["model"],
    # Use buckets with suitable ranges for tokens/s measurements
    buckets=[5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
)
-
-
def extract_and_record_metrics(response_data, model):
    """Record Prometheus metrics from one Ollama response payload.

    Reads the timing fields of the native Ollama API (``total_duration``,
    ``load_duration``, ``prompt_eval_duration``, ``eval_duration`` — all in
    nanoseconds) and the token counts (``prompt_eval_count``, ``eval_count``).
    When a token count is absent, it falls back to the OpenAI-compatible
    ``usage`` object (``prompt_tokens`` / ``completion_tokens``).
    https://github.com/ollama/ollama/blob/main/docs/api.md#response

    Only strictly positive values are recorded. Missing, ``null`` or
    non-numeric fields are treated as 0 instead of raising, because this is
    called from the streaming path where an exception would surface after the
    client has already received its response.

    Args:
        response_data: Decoded JSON payload; anything that is not a dict is ignored.
        model: Value used for the ``model`` label of every metric.

    Returns:
        None.
    """
    if not isinstance(response_data, dict):
        return

    nanos_per_second = 1_000_000_000

    def numeric(container, key):
        """Return container[key] when it is a real number, otherwise 0."""
        value = container.get(key, 0)
        # bool is an int subclass; a JSON true/false is never a valid measurement.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return 0
        return value

    def observe_seconds(histogram, nanoseconds, label):
        """Convert a nanosecond duration to seconds, observe it and log it."""
        seconds = nanoseconds / nanos_per_second
        histogram.labels(model=model).observe(seconds)
        logger.debug("Model: %s, %s: %.2f seconds", model, label, seconds)

    def add_tokens(counter, count, label):
        """Add a token count to a counter and log it."""
        counter.labels(model=model).inc(count)
        logger.debug("Model: %s, %s: %s", model, label, count)

    # Native Ollama format: timing data and counts live at the top level.
    total_duration = numeric(response_data, "total_duration")
    load_duration = numeric(response_data, "load_duration")
    prompt_eval_duration = numeric(response_data, "prompt_eval_duration")
    prompt_eval_count = numeric(response_data, "prompt_eval_count")
    eval_duration = numeric(response_data, "eval_duration")
    eval_count = numeric(response_data, "eval_count")

    # v1 (OpenAI-compatible) format: token counts live under "usage".
    usage = response_data.get("usage")
    if isinstance(usage, dict):
        if not prompt_eval_count:
            prompt_eval_count = numeric(usage, "prompt_tokens")
        if not eval_count:
            eval_count = numeric(usage, "completion_tokens")

    if total_duration > 0:
        observe_seconds(OLLAMA_TOTAL_DURATION, total_duration, "Total Duration")
    if load_duration > 0:
        observe_seconds(OLLAMA_LOAD_DURATION, load_duration, "Load Duration")
    if prompt_eval_duration > 0:
        observe_seconds(OLLAMA_PROMPT_EVAL_DURATION, prompt_eval_duration, "Prompt Eval Duration")
    if prompt_eval_count > 0:
        add_tokens(OLLAMA_PROMPT_EVAL_COUNT, prompt_eval_count, "Prompt Eval Count")
    if eval_duration > 0:
        observe_seconds(OLLAMA_EVAL_DURATION, eval_duration, "Eval Duration")
    if eval_count > 0:
        add_tokens(OLLAMA_EVAL_COUNT, eval_count, "Eval Count")
    if eval_duration > 0 and eval_count > 0:
        # Throughput needs both a generation time and a token count.
        tps = eval_count / eval_duration * 1_000_000_000
        OLLAMA_TOKENS_PER_SECOND.labels(model=model).observe(tps)
        logger.debug("Model: %s, Tokens per Second: %.2f", model, tps)
-
@app.get("/metrics")
def metrics():
    """Return the output of ``generate_latest()`` using the Prometheus content type."""
    exposition = generate_latest()
    return Response(content=exposition, media_type=CONTENT_TYPE_LATEST)
-
# Upstream response headers that must not be copied onto the proxied reply:
# httpx returns an already-decoded body in ``response.content``, so the
# upstream framing/encoding headers no longer describe the bytes sent here.
_UNFORWARDABLE_RESPONSE_HEADERS = frozenset(
    {"content-length", "content-encoding", "transfer-encoding", "connection", "keep-alive"}
)


def _forwardable_response_headers(upstream_headers):
    """Return the upstream response headers minus framing/hop-by-hop entries."""
    return {
        name: value
        for name, value in upstream_headers.items()
        if name.lower() not in _UNFORWARDABLE_RESPONSE_HEADERS
    }


def _final_chunk_from_line(raw_line):
    """Parse one streamed line and return it if it carries final metrics.

    Accepts both NDJSON lines (native API) and server-sent-event lines with a
    ``data:`` prefix (v1 API). Returns the decoded JSON object when it has
    ``"done": true`` or a non-empty ``usage`` entry, otherwise ``None``.
    Undecodable, non-JSON (e.g. ``[DONE]``) and non-object lines yield ``None``.
    """
    try:
        text = raw_line.decode("utf-8").strip()
    except UnicodeDecodeError:
        return None
    if text.startswith("data:"):
        text = text[len("data:"):].strip()
    if not text:
        return None
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, dict):
        return None
    # NOTE(review): the "usage" test assumes v1 streams carry a populated usage
    # object only on their closing chunk — confirm against the Ollama version in use.
    if parsed.get("done", False) or parsed.get("usage"):
        return parsed
    return None


@app.post("/api/chat")
@app.post("/api/generate")
@app.post("/v1/chat/completions")
@app.post("/v1/completions")
async def chat_with_metrics(request: Request):
    """Proxy a chat/generate/completions request to Ollama and record metrics.

    The JSON body is forwarded unchanged to the same path on ``OLLAMA_HOST``.
    The request counter is incremented for the requested model, and timing and
    token metrics are extracted from the final streamed object (streaming) or
    from the whole response (non-streaming, HTTP 200 only).

    Args:
        request: Incoming FastAPI request whose body is JSON.

    Returns:
        A ``StreamingResponse`` relaying upstream bytes as they arrive, or a
        buffered ``Response`` mirroring the upstream status, body and headers.
    """
    body = await request.json()
    endpoint = request.url.path  # one of the four routes registered above
    settings = body if isinstance(body, dict) else {}
    model = settings.get("model", "unknown")
    # logger.debug(f"Chat request body: {json.dumps(body, indent=4)}")
    # The native /api/* endpoints stream unless "stream": false is sent, while
    # the OpenAI-compatible /v1/* endpoints stream only on request.
    is_streaming = settings.get("stream", endpoint.startswith("/api/"))

    headers = dict(request.headers)
    # httpx derives these from the target URL and the re-encoded JSON body.
    headers.pop("host", None)
    headers.pop("content-length", None)
    headers.pop("content-type", None)

    OLLAMA_CHAT_REQUEST_COUNT.labels(model=model).inc()

    upstream_url = f"{OLLAMA_HOST}{endpoint}"
    timeout = httpx.Timeout(900.0, read=900.0)

    if is_streaming:
        # NOTE(review): the upstream status code and content type are not known
        # when the StreamingResponse is created, so an upstream error is still
        # relayed with status 200 and media type application/json.
        async def generate_stream():
            final_chunk_data = None
            # Bytes of a line whose terminating newline has not arrived yet;
            # network chunks do not align with line boundaries.
            pending = b""
            async with httpx.AsyncClient(timeout=timeout) as client:
                async with client.stream(
                    "POST",
                    upstream_url,
                    headers=headers,
                    json=body,
                    params=request.query_params,
                ) as response:
                    async for chunk in response.aiter_bytes():
                        # Forward the chunk immediately to the client
                        yield chunk

                        pending += chunk
                        *complete_lines, pending = pending.split(b"\n")
                        for raw_line in complete_lines:
                            candidate = _final_chunk_from_line(raw_line)
                            if candidate is not None:
                                final_chunk_data = candidate

            # The last line may arrive without a trailing newline.
            candidate = _final_chunk_from_line(pending)
            if candidate is not None:
                final_chunk_data = candidate

            # Extract metrics from the final chunk if available
            if final_chunk_data:
                extract_and_record_metrics(final_chunk_data, model)

        return StreamingResponse(generate_stream(), media_type="application/json")

    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.post(
            upstream_url,
            headers=headers,
            json=body,
            params=request.query_params,
        )

    if response.status_code == 200:
        try:
            extract_and_record_metrics(response.json(), model)
        except (ValueError, TypeError) as exc:
            # Metrics are best-effort: never fail the proxied request over them.
            logger.debug("Skipping metrics for %s: %s", endpoint, exc)

    return Response(
        content=response.content,
        status_code=response.status_code,
        headers=_forwardable_response_headers(response.headers),
    )
-
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"])
async def simple_proxy(request: Request, path: str):
    """Forward any other request to the same path on Ollama, without metrics.

    The method, query string, body and headers (minus ``host`` and
    ``content-length``) are relayed to ``OLLAMA_HOST``. The upstream reply is
    buffered and returned with the same status code and body.

    Args:
        request: Incoming FastAPI request.
        path: Request path captured by the catch-all route, without leading slash.

    Returns:
        A ``Response`` mirroring the upstream status, body and headers.
    """
    logger.debug("Proxying %s request to /%s", request.method, path)
    headers = dict(request.headers)
    # httpx recomputes these for the upstream connection.
    headers.pop("host", None)
    headers.pop("content-length", None)

    payload = await request.body()
    async with httpx.AsyncClient(timeout=httpx.Timeout(900.0, read=900.0)) as client:
        response = await client.request(
            method=request.method,
            url=f"{OLLAMA_HOST}/{path}",
            headers=headers,
            content=payload,
            params=request.query_params,
        )

    logger.debug("Proxy response: %s for %s /%s", response.status_code, request.method, path)

    # httpx returns an already-decoded body, so upstream framing/encoding
    # headers would misdescribe the bytes sent from here. A HEAD reply has no
    # body to re-measure, so its advertised content-length is kept.
    stale_headers = {"content-encoding", "transfer-encoding", "connection", "keep-alive"}
    if request.method != "HEAD":
        stale_headers.add("content-length")
    response_headers = {
        name: value
        for name, value in response.headers.items()
        if name.lower() not in stale_headers
    }
    return Response(content=response.content, status_code=response.status_code, headers=response_headers)
-
async def verify_ollama_connection():
    """Probe ``{OLLAMA_HOST}/api/version`` once and log whether Ollama answered.

    Failures are only logged; nothing is raised and nothing is returned, so
    the caller continues regardless of the outcome.
    """
    logger.debug(f"Verifying connection to Ollama server at {OLLAMA_HOST}")

    probe_timeout = httpx.Timeout(10.0)
    try:
        async with httpx.AsyncClient(timeout=probe_timeout) as client:
            response = await client.get(f"{OLLAMA_HOST}/api/version")
            if response.status_code != 200:
                logger.error(f"Failed to connect to Ollama server. Status code: {response.status_code}")
            else:
                logger.info("Connected to Ollama")
    except Exception as e:
        # Best-effort startup check: report the problem and carry on.
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_HOST}: {e}")
        logger.error("Please ensure Ollama is running and accessible at the configured host")
-
async def main():
    """Run the startup connectivity check, then serve ``app`` with uvicorn.

    The server listens on 0.0.0.0:8000 with uvicorn's log level set to "info".
    """
    await verify_ollama_connection()
    server = uvicorn.Server(
        uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="info")
    )
    await server.serve()


# Script entry point: drive the async main() on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())