Commit f375c16395d8

Vincent Demeester <vincent@sbr.pm>
2026-03-24 09:58:36
feat: update SearXNG and add Playwright search fallback
Pinned SearXNG to 2c1ce3bd37a2 (2026-03-23) for Google useragent bypass (#5892) and Bing engine rework (#5793). Added Playwright-based browser search backends (Bing, Mojeek, Ecosia, Brave) to the pi search extension as fallback when SearXNG or ddgr fail. Uses headless system Chrome via a Python script with auto-bootstrapped venv. Bing URL deobfuscation extracts real URLs from redirects.
1 parent c642b64
Changed files (4)
dots
overlays
dots/pi/agent/extensions/search/backends.ts
@@ -150,6 +150,118 @@ export class DdgrBackend implements SearchBackend {
   }
 }
 
+/**
+ * Playwright browser search backend - uses headless Chrome via a Python script.
+ * Fallback when API-based backends fail due to rate limiting or bot detection.
+ * Supports Bing (reliable, no CAPTCHA), Brave Search (rate-limited after a few
+ * queries), Mojeek and Ecosia; each instance is bound to a single engine.
+ *
+ * Requirements:
+ * - Python 3 with playwright package (auto-bootstrapped in ~/.local/share/pi/playwright-env;
+ *   the bootstrap needs `uv` on PATH)
+ * - System Chrome (google-chrome-stable) or Playwright's bundled Chromium
+ */
+export class PlaywrightBackend implements SearchBackend {
+  name: string;
+  private readonly engine: string;
+  private readonly execFn: (cmd: string, args: string[], opts?: any) => Promise<{ stdout: string; stderr: string; code: number }>;
+  private readonly scriptPath: string;
+  private readonly venvPath: string;
+
+  /**
+   * @param engine - Engine name handed to browser-search.py as its first argument.
+   * @param execFn - Process runner resolving with captured stdout/stderr and the exit code.
+   * @param scriptDir - Directory containing browser-search.py.
+   */
+  constructor(
+    engine: "bing" | "brave" | "mojeek" | "ecosia",
+    execFn: (cmd: string, args: string[], opts?: any) => Promise<{ stdout: string; stderr: string; code: number }>,
+    scriptDir: string,
+  ) {
+    this.engine = engine;
+    this.name = `Playwright/${engine}`;
+    this.execFn = execFn;
+    this.scriptPath = `${scriptDir}/browser-search.py`;
+    // NOTE(review): HOME is assumed to be set; when it is not, the path starts with "undefined/".
+    this.venvPath = `${process.env.HOME}/.local/share/pi/playwright-env`;
+  }
+
+  /** Path of the Python interpreter inside the venv. */
+  private get pythonPath(): string {
+    return `${this.venvPath}/bin/python3`;
+  }
+
+  /**
+   * Check whether the venv interpreter exists and can import playwright.
+   * Never throws: a rejected or failing exec is reported as `false`.
+   */
+  private async hasPlaywright(): Promise<boolean> {
+    try {
+      const check = await this.execFn(this.pythonPath, ["-c", "import playwright"], { timeout: 5000 });
+      return check.code === 0;
+    } catch {
+      // venv doesn't exist or is broken
+      return false;
+    }
+  }
+
+  /**
+   * Ensure the Python venv with playwright exists.
+   * Creates it on first use, reuses on subsequent calls.
+   *
+   * @param signal - Optional abort signal forwarded to the bootstrap commands.
+   * @returns Path of the venv's Python interpreter.
+   * @throws Error when uv is missing or any bootstrap step exits non-zero.
+   */
+  private async ensureVenv(signal?: AbortSignal): Promise<string> {
+    const pythonPath = this.pythonPath;
+
+    // Fast path: venv already exists and has playwright
+    if (await this.hasPlaywright()) return pythonPath;
+
+    // Create venv and install playwright
+    const uv = await this.execFn("which", ["uv"], { signal });
+    if (uv.code !== 0) {
+      throw new Error("uv not found — required to bootstrap playwright venv");
+    }
+
+    const create = await this.execFn("uv", ["venv", this.venvPath], { signal, timeout: 30000 });
+    if (create.code !== 0) {
+      throw new Error(`Failed to create venv: ${create.stderr}`);
+    }
+
+    const install = await this.execFn(
+      "uv", ["pip", "install", "--python", pythonPath, "playwright"],
+      { signal, timeout: 60000 },
+    );
+    if (install.code !== 0) {
+      throw new Error(`Failed to install playwright: ${install.stderr}`);
+    }
+
+    // Install browser (only if no system Chrome available)
+    const chromeCheck = await this.execFn("test", ["-x", "/run/current-system/sw/bin/google-chrome-stable"]);
+    if (chromeCheck.code !== 0) {
+      const browserInstall = await this.execFn(
+        `${this.venvPath}/bin/playwright`, ["install", "chromium"],
+        { signal, timeout: 120000 },
+      );
+      if (browserInstall.code !== 0) {
+        throw new Error(`Failed to install Playwright browser: ${browserInstall.stderr}`);
+      }
+    }
+
+    return pythonPath;
+  }
+
+  /**
+   * Report whether a search can be attempted.
+   *
+   * System Chrome alone is not enough: search() always runs the script with
+   * the venv interpreter, so the backend is usable only when the venv already
+   * has playwright or when uv is present to bootstrap it.
+   */
+  async isAvailable(): Promise<boolean> {
+    try {
+      if (await this.hasPlaywright()) return true;
+
+      const uv = await this.execFn("which", ["uv"]);
+      return uv.code === 0;
+    } catch {
+      return false;
+    }
+  }
+
+  /**
+   * Run browser-search.py for this engine and map its JSON output to results.
+   *
+   * @param query - Search terms, passed to the script as one argument.
+   * @param maxResults - Upper bound on results, passed to the script.
+   * @param signal - Optional abort signal forwarded to the spawned processes.
+   * @throws Error when bootstrapping fails, the script exits non-zero, or its
+   *   stdout is not a JSON array.
+   */
+  async search(query: string, maxResults: number, signal?: AbortSignal): Promise<SearchResult[]> {
+    const pythonPath = await this.ensureVenv(signal);
+
+    const result = await this.execFn(
+      pythonPath,
+      [this.scriptPath, this.engine, String(maxResults), query],
+      { signal, timeout: 30000 },
+    );
+
+    if (result.code !== 0) {
+      throw new Error(`Playwright/${this.engine} search failed: ${result.stderr}`);
+    }
+
+    let data: unknown;
+    try {
+      data = JSON.parse(result.stdout);
+    } catch (e: unknown) {
+      const reason = e instanceof Error ? e.message : String(e);
+      throw new Error(`Playwright/${this.engine} returned invalid JSON: ${reason}`);
+    }
+
+    // A JSON `null` is treated as "no results", anything else must be an array.
+    if (data == null) return [];
+    if (!Array.isArray(data)) {
+      throw new Error(`Playwright/${this.engine} returned unexpected output (expected a JSON array)`);
+    }
+
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped script output
+    return data.map((r: any) => ({
+      title: r?.title || "(no title)",
+      url: r?.url || "",
+      snippet: r?.snippet || "",
+    }));
+  }
+}
+
+
 /**
  * Multi-backend search with automatic fallback.
  *
dots/pi/agent/extensions/search/browser-search.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+Playwright-based web search using system Chrome.
+
+Headless browser search fallback for when API-based backends (SearXNG, ddgr) fail.
+Supports multiple engines: Bing, Brave, Mojeek, Ecosia.
+
+Usage:
+  browser-search.py <engine> <max_results> <query...>
+
+Output: JSON array of {title, url, snippet} objects on stdout.
+Errors go to stderr.
+"""
+import json
+import sys
+import urllib.parse
+
+
+def extract_bing_url(href: str) -> str:
+    """Extract the real URL from a Bing click-tracking redirect."""
+    if "/ck/a?" not in href:
+        return href
+    # The real URL is base64-encoded in the 'u' parameter as 'a1<base64>'
+    parsed = urllib.parse.urlparse(href)
+    params = urllib.parse.parse_qs(parsed.query)
+    if "u" in params:
+        u_val = params["u"][0]
+        if u_val.startswith("a1"):
+            import base64
+            try:
+                return base64.b64decode(u_val[2:] + "==").decode("utf-8", errors="replace")
+            except Exception:
+                pass
+    return href
+
+
+def search_bing(page, query: str, max_results: int) -> list[dict]:
+    """Search Bing and extract results from li.b_algo elements."""
+    url = f"https://www.bing.com/search?q={urllib.parse.quote_plus(query)}&count={max_results}"
+    page.goto(url, wait_until="domcontentloaded", timeout=15000)
+    page.wait_for_timeout(2000)
+
+    results = []
+    items = page.query_selector_all("li.b_algo")
+
+    for item in items[:max_results]:
+        try:
+            link_el = item.query_selector("h2 a")
+            snippet_el = item.query_selector("p, .b_caption p, .b_lineclamp2")
+
+            if not link_el:
+                continue
+
+            href = link_el.get_attribute("href") or ""
+            if not href.startswith("http"):
+                continue
+
+            title = link_el.inner_text().strip()
+            snippet = snippet_el.inner_text().strip() if snippet_el else ""
+
+            results.append({
+                "title": title,
+                "url": extract_bing_url(href),
+                "snippet": snippet,
+            })
+        except Exception:
+            continue
+
+    return results
+
+
+def search_brave(page, query: str, max_results: int) -> list[dict]:
+    """Search Brave and extract results."""
+    url = f"https://search.brave.com/search?q={urllib.parse.quote_plus(query)}"
+    page.goto(url, wait_until="domcontentloaded", timeout=15000)
+    page.wait_for_timeout(3000)
+
+    results = []
+
+    # Try multiple selector patterns (Brave changes layout)
+    items = page.query_selector_all("div#results div.snippet[data-type='web']")
+    if not items:
+        items = page.query_selector_all("div.snippet")
+
+    for item in items[:max_results]:
+        try:
+            link_el = item.query_selector("a")
+            title_el = item.query_selector("div.title, span.snippet-title")
+            snippet_el = item.query_selector(
+                "div.snippet-description, p.snippet-description"
+            )
+
+            if not link_el:
+                continue
+            href = link_el.get_attribute("href") or ""
+            if not href.startswith("http"):
+                continue
+
+            title = title_el.inner_text().strip() if title_el else ""
+            if not title:
+                title = link_el.inner_text().strip()
+            snippet = snippet_el.inner_text().strip() if snippet_el else ""
+
+            results.append({"title": title, "url": href, "snippet": snippet})
+        except Exception:
+            continue
+
+    return results
+
+
+def search_mojeek(page, query: str, max_results: int) -> list[dict]:
+    """Search Mojeek — independent search engine, no CAPTCHA in headless."""
+    url = f"https://www.mojeek.com/search?q={urllib.parse.quote_plus(query)}"
+    page.goto(url, wait_until="domcontentloaded", timeout=15000)
+    page.wait_for_timeout(2000)
+
+    results = []
+    items = page.query_selector_all("ul.results-standard li")
+
+    for item in items[:max_results]:
+        try:
+            # a.title inside h2 has the real page title
+            title_el = item.query_selector("a.title")
+            # a.ob has the href
+            link_el = item.query_selector("a.ob")
+            snippet_el = item.query_selector("p.s")
+
+            if not link_el:
+                continue
+
+            href = link_el.get_attribute("href") or ""
+            if not href.startswith("http"):
+                continue
+
+            title = title_el.inner_text().strip() if title_el else ""
+            if not title:
+                title = href
+            snippet = snippet_el.inner_text().strip() if snippet_el else ""
+
+            results.append({"title": title, "url": href, "snippet": snippet})
+        except Exception:
+            continue
+
+    return results
+
+
+def search_ecosia(page, query: str, max_results: int) -> list[dict]:
+    """Search Ecosia — Bing-powered, eco-friendly search engine."""
+    url = f"https://www.ecosia.org/search?method=index&q={urllib.parse.quote_plus(query)}"
+    page.goto(url, wait_until="domcontentloaded", timeout=15000)
+    page.wait_for_timeout(3000)
+
+    results = []
+    articles = page.query_selector_all("article.result")
+
+    skip_domains = {"ecosia.org", "google.com"}
+
+    for article in articles[:max_results]:
+        try:
+            # Each article has multiple links; filter out ecosia/google support links
+            links = article.query_selector_all("a[href^='http']")
+            href = ""
+            title = ""
+            for a in links:
+                h = a.get_attribute("href") or ""
+                # Skip Ecosia and Google support/tracking links
+                if any(d in h for d in skip_domains):
+                    continue
+                href = h
+                # The last matching link with text is typically the title link
+                text = a.inner_text().strip()
+                if text and not text.startswith("http"):
+                    title = text
+                break
+
+            if not href:
+                continue
+
+            # If title wasn't found from link text, try h2/h3
+            if not title:
+                h_el = article.query_selector("h2, h3")
+                title = h_el.inner_text().strip() if h_el else href
+
+            snippet_el = article.query_selector("p, .result__snippet")
+            snippet = snippet_el.inner_text().strip() if snippet_el else ""
+
+            results.append({"title": title, "url": href, "snippet": snippet})
+        except Exception:
+            continue
+
+    return results
+
+
+# Maps the engine name given on the command line to its scraper function.
+# Every function takes (page, query, max_results) and returns a list of dicts.
+ENGINES = {
+    "bing": search_bing,
+    "brave": search_brave,
+    "mojeek": search_mojeek,
+    "ecosia": search_ecosia,
+}
+
+
+def main():
+    if len(sys.argv) < 4:
+        print(
+            "Usage: browser-search.py <engine> <max_results> <query...>",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    engine = sys.argv[1].lower()
+    max_results = int(sys.argv[2])
+    query = " ".join(sys.argv[3:])
+
+    if engine not in ENGINES:
+        print(f"Unknown engine: {engine}. Available: {', '.join(ENGINES)}", file=sys.stderr)
+        sys.exit(1)
+
+    # Determine Chrome path — prefer system Chrome on NixOS
+    chrome_paths = [
+        "/run/current-system/sw/bin/google-chrome-stable",
+        "/usr/bin/google-chrome-stable",
+        "/usr/bin/chromium-browser",
+        "/usr/bin/chromium",
+    ]
+    chrome_path = None
+    import os
+    for p in chrome_paths:
+        if os.path.isfile(p):
+            chrome_path = p
+            break
+
+    from playwright.sync_api import sync_playwright
+
+    with sync_playwright() as p:
+        launch_args = {
+            "headless": True,
+            "args": ["--no-sandbox", "--disable-dev-shm-usage"],
+        }
+        if chrome_path:
+            launch_args["executable_path"] = chrome_path
+
+        browser = p.chromium.launch(**launch_args)
+        try:
+            page = browser.new_page(
+                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+                "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
+            )
+
+            search_fn = ENGINES[engine]
+            results = search_fn(page, query, max_results)
+
+            json.dump(results, sys.stdout)
+        finally:
+            browser.close()
+
+
+if __name__ == "__main__":
+    main()
dots/pi/agent/extensions/search/index.ts
@@ -1,11 +1,17 @@
 /**
  * Multi-backend Search Extension for Pi
  *
- * Provides web search, GitHub code search, and Stack Overflow search tools.
+ * Provides web search and GitHub code search tools.
  * Web search uses multiple backends with automatic fallback:
  *   1. SearXNG (self-hosted, primary)
- *   2. ddgr CLI (DuckDuckGo via CLI, if installed)
- *   3. DuckDuckGo Instant Answer API (always available fallback)
+ *   2. Playwright/Bing (headless Chrome browser search fallback)
+ *   3. Playwright/Mojeek (independent engine, no CAPTCHA)
+ *   4. Playwright/Ecosia (Bing-powered, good results)
+ *   5. ddgr CLI (DuckDuckGo via CLI)
+ *   6. DuckDuckGo Instant Answer API (always available, limited results)
+ *
+ * Additional backend, not active by default, available via /search-backend:
+ *   - brave: Playwright/Brave Search (rate-limited after a few queries)
  *
  * Configuration via environment variables:
  *   SEARXNG_URL - SearXNG instance URL (default: https://search.sbr.pm)
@@ -18,6 +24,7 @@ import {
   SearXNGBackend,
   DuckDuckGoBackend,
   DdgrBackend,
+  PlaywrightBackend,
   searchWithFallback,
   formatResults,
   type SearchBackend,
@@ -31,15 +38,24 @@ function truncate(text: string, maxLength: number): string {
 export default function (pi: ExtensionAPI) {
   // Configure backends
   const searxngUrl = process.env.SEARXNG_URL || "https://search.sbr.pm";
+  // __dirname works in jiti (pi's TypeScript loader)
+  const extensionDir = __dirname;
+  const execFn = (cmd: string, args: string[], opts?: any) => pi.exec(cmd, args, opts);
 
   const allBackends: Record<string, SearchBackend> = {
     searxng: new SearXNGBackend(searxngUrl),
-    ddgr: new DdgrBackend((cmd, args, opts) => pi.exec(cmd, args, opts)),
+    ddgr: new DdgrBackend(execFn),
+    bing: new PlaywrightBackend("bing", execFn, extensionDir),
+    brave: new PlaywrightBackend("brave", execFn, extensionDir),
+    mojeek: new PlaywrightBackend("mojeek", execFn, extensionDir),
+    ecosia: new PlaywrightBackend("ecosia", execFn, extensionDir),
     duckduckgo: new DuckDuckGoBackend(),
   };
 
-  // Active backends in priority order (all enabled by default)
-  let activeBackendNames = ["searxng", "ddgr", "duckduckgo"];
+  // Active backends in priority order
+  // SearXNG first, then Playwright browsers (Bing most reliable, then Mojeek/Ecosia),
+  // then ddgr CLI, then DDG API as last resort. Brave available via /search-backend.
+  let activeBackendNames = ["searxng", "bing", "mojeek", "ecosia", "ddgr", "duckduckgo"];
 
   // Helper to get current active backends
   const getActiveBackends = (): SearchBackend[] =>
overlays/default.nix
@@ -11,10 +11,18 @@ in
       inherit globals;
     };
   modifications = final: prev: {
-    # example = prev.example.overrideAttrs (oldAttrs: rec {
-    # ...
-    # });
-    # custom-caddy = import ./custom-caddy.nix { pkgs = prev; };
+    # Pin SearXNG to latest commit for critical engine fixes:
+    # - Google useragent bypass (#5892, 2026-03-23)
+    # - Bing engine rework (#5793, 2026-03-18)
+    searxng = prev.searxng.overrideAttrs (_oldAttrs: {
+      version = "0-unstable-2026-03-23";
+      src = final.fetchFromGitHub {
+        owner = "searxng";
+        repo = "searxng";
+        rev = "2c1ce3bd37a2";
+        hash = "sha256-Ml7aJXr4IudH/ZnlXDU8OdnUELBb6/vcpAAhAXwhW8A=";
+      };
+    });
     go_1_25_8 = prev.go_1_25.overrideAttrs (_finalAttrs: {
       version = "1.25.8";
       src = final.fetchurl {