Commit f375c16395d8
Changed files (4)
dots
pi
agent
extensions
search
overlays
dots/pi/agent/extensions/search/backends.ts
@@ -150,6 +150,118 @@ export class DdgrBackend implements SearchBackend {
}
}
/**
 * Playwright browser search backend — drives headless Chrome through a Python script.
 * Fallback for when API-based backends fail due to rate limiting or bot detection.
 * Supported engines: Bing, Brave Search, Mojeek and Ecosia.
 *
 * Requirements:
 * - `uv`, used to bootstrap a Python venv with the playwright package in
 *   ~/.local/share/pi/playwright-env on first use
 * - System Chrome (google-chrome-stable) or Playwright's bundled Chromium
 */
export class PlaywrightBackend implements SearchBackend {
  /** System Chrome location; when it is executable the Chromium download is skipped. */
  private static readonly SYSTEM_CHROME = "/run/current-system/sw/bin/google-chrome-stable";

  name: string;
  private engine: string;
  private execFn: (cmd: string, args: string[], opts?: any) => Promise<{ stdout: string; stderr: string; code: number }>;
  private scriptPath: string;
  private venvPath: string;
  /** Memoized bootstrap so the venv probe/creation runs once per instance, not per search. */
  private venvReady?: Promise<string>;

  /**
   * @param engine Search engine the Python script should scrape.
   * @param execFn Process runner; resolves with stdout, stderr and the exit code.
   * @param scriptDir Directory that contains `browser-search.py`.
   */
  constructor(
    engine: "bing" | "brave" | "mojeek" | "ecosia",
    execFn: (cmd: string, args: string[], opts?: any) => Promise<{ stdout: string; stderr: string; code: number }>,
    scriptDir: string,
  ) {
    this.engine = engine;
    this.name = `Playwright/${engine}`;
    this.execFn = execFn;
    this.scriptPath = `${scriptDir}/browser-search.py`;
    this.venvPath = `${process.env.HOME}/.local/share/pi/playwright-env`;
  }

  /** Interpreter inside the managed venv. */
  private get pythonPath(): string {
    return `${this.venvPath}/bin/python3`;
  }

  /** True when the venv interpreter exists and can import playwright. */
  private async hasPlaywright(): Promise<boolean> {
    try {
      const check = await this.execFn(this.pythonPath, ["-c", "import playwright"], { timeout: 5000 });
      return check.code === 0;
    } catch {
      // Interpreter missing or broken: treat as "not installed".
      return false;
    }
  }

  /**
   * Ensure the Python venv with playwright exists.
   * The first call probes/creates it; later calls reuse the same promise.
   * A failed bootstrap is forgotten so the next search can retry.
   *
   * @returns Path of the venv's python3 interpreter.
   * @throws Error when uv is missing or any bootstrap step fails.
   */
  private ensureVenv(): Promise<string> {
    if (this.venvReady === undefined) {
      this.venvReady = this.bootstrapVenv().catch((err: unknown) => {
        this.venvReady = undefined;
        throw err;
      });
    }
    return this.venvReady;
  }

  /** Create the venv, install playwright and (if needed) a browser. */
  private async bootstrapVenv(): Promise<string> {
    const pythonPath = this.pythonPath;

    // Fast path: venv already exists and has playwright.
    if (await this.hasPlaywright()) return pythonPath;

    const uv = await this.execFn("which", ["uv"]);
    if (uv.code !== 0) {
      throw new Error("uv not found — required to bootstrap playwright venv");
    }

    const create = await this.execFn("uv", ["venv", this.venvPath], { timeout: 30000 });
    if (create.code !== 0) {
      throw new Error(`Failed to create venv: ${create.stderr}`);
    }

    const install = await this.execFn(
      "uv", ["pip", "install", "--python", pythonPath, "playwright"],
      { timeout: 60000 },
    );
    if (install.code !== 0) {
      throw new Error(`Failed to install playwright: ${install.stderr}`);
    }

    // Download Playwright's Chromium only if no system Chrome is available.
    const chromeCheck = await this.execFn("test", ["-x", PlaywrightBackend.SYSTEM_CHROME]);
    if (chromeCheck.code !== 0) {
      const browserInstall = await this.execFn(
        `${this.venvPath}/bin/playwright`, ["install", "chromium"],
        { timeout: 120000 },
      );
      if (browserInstall.code !== 0) {
        throw new Error(`Failed to install Playwright browser: ${browserInstall.stderr}`);
      }
    }

    return pythonPath;
  }

  /**
   * The backend is usable when the playwright venv already works or when `uv`
   * is present to bootstrap it. A system Chrome alone is not enough, because
   * `search()` always runs through the venv's Python interpreter.
   */
  async isAvailable(): Promise<boolean> {
    try {
      if (await this.hasPlaywright()) return true;

      const uv = await this.execFn("which", ["uv"]);
      return uv.code === 0;
    } catch {
      return false;
    }
  }

  /**
   * Run the Python scraper and convert its JSON output into search results.
   *
   * @param query Search terms, passed to the script as a single argument.
   * @param maxResults Upper bound forwarded to the script.
   * @param signal Optional abort signal forwarded to the process runner.
   * @throws Error when the venv cannot be prepared, the script exits non-zero,
   *         or its stdout is not a JSON array.
   */
  async search(query: string, maxResults: number, signal?: AbortSignal): Promise<SearchResult[]> {
    const pythonPath = await this.ensureVenv();

    const result = await this.execFn(
      pythonPath,
      [this.scriptPath, this.engine, String(maxResults), query],
      { signal, timeout: 30000 },
    );

    if (result.code !== 0) {
      throw new Error(`Playwright/${this.engine} search failed: ${result.stderr}`);
    }

    let data: unknown;
    try {
      data = JSON.parse(result.stdout);
    } catch {
      // Surface what the script printed instead of a bare SyntaxError.
      throw new Error(
        `Playwright/${this.engine} returned invalid JSON: ${result.stdout.slice(0, 200)}`,
      );
    }

    if (data == null) return [];
    if (!Array.isArray(data)) {
      throw new Error(`Playwright/${this.engine} returned unexpected output (expected a JSON array)`);
    }

    return data.map((r: { title?: string; url?: string; snippet?: string } | null) => ({
      title: r?.title || "(no title)",
      url: r?.url || "",
      snippet: r?.snippet || "",
    }));
  }
}
+
/**
* Multi-backend search with automatic fallback.
*
dots/pi/agent/extensions/search/browser-search.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+Playwright-based web search using system Chrome.
+
+Headless browser search fallback for when API-based backends (SearXNG, ddgr) fail.
+Supports multiple engines: Bing, Brave, Mojeek, Ecosia.
+
+Usage:
+ browser-search.py <engine> <max_results> <query...>
+
+Output: JSON array of {title, url, snippet} objects on stdout.
+Errors go to stderr.
+"""
+import json
+import sys
+import urllib.parse
+
+
def extract_bing_url(href: str) -> str:
    """Extract the real URL from a Bing click-tracking redirect.

    Bing result links may point at ``/ck/a?...&u=a1<payload>`` where
    ``<payload>`` is the target URL encoded as unpadded URL-safe base64.

    Args:
        href: Link taken from a Bing result.

    Returns:
        The decoded target URL, or ``href`` unchanged when it is not a
        tracking link or the payload cannot be decoded into an http(s) URL.
    """
    import base64

    if "/ck/a?" not in href:
        return href

    params = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
    token = params.get("u", [""])[0]
    if not token.startswith("a1"):
        return href

    payload = token[2:]
    # Restore exactly the padding that was stripped from the payload.
    payload += "=" * (-len(payload) % 4)
    try:
        # URL-safe alphabet: '-' and '_' stand in for '+' and '/'.
        target = base64.urlsafe_b64decode(payload).decode("utf-8")
    except ValueError:
        # Covers binascii.Error and UnicodeDecodeError (both ValueError subclasses).
        return href

    # Only trust the result if it looks like a URL; otherwise keep the redirect.
    if target.startswith(("http://", "https://")):
        return target
    return href
+
+
def search_bing(page, query: str, max_results: int) -> list[dict]:
    """Search Bing and extract results from ``li.b_algo`` elements.

    Args:
        page: Playwright page used to load the results.
        query: Search terms; URL-encoded before being sent.
        max_results: Maximum number of results to return.

    Returns:
        Up to ``max_results`` dicts with ``title``, ``url`` and ``snippet``.
        Items without a usable http(s) link are skipped and do not count
        towards the limit.
    """
    url = f"https://www.bing.com/search?q={urllib.parse.quote_plus(query)}&count={max_results}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    # Give client-side rendering a moment to populate the result list.
    page.wait_for_timeout(2000)

    results: list[dict] = []
    for item in page.query_selector_all("li.b_algo"):
        # Stop on collected results, not on inspected items, so that skipped
        # entries do not shrink the output.
        if len(results) >= max_results:
            break
        try:
            link_el = item.query_selector("h2 a")
            if not link_el:
                continue

            href = link_el.get_attribute("href") or ""
            if not href.startswith("http"):
                continue

            snippet_el = item.query_selector("p, .b_caption p, .b_lineclamp2")
            results.append({
                "title": link_el.inner_text().strip(),
                # Unwrap Bing's click-tracking redirect when present.
                "url": extract_bing_url(href),
                "snippet": snippet_el.inner_text().strip() if snippet_el else "",
            })
        except Exception:
            # Best effort: one malformed result must not abort the whole search.
            continue

    return results
+
+
def search_brave(page, query: str, max_results: int) -> list[dict]:
    """Search Brave and extract results.

    Args:
        page: Playwright page used to load the results.
        query: Search terms; URL-encoded before being sent.
        max_results: Maximum number of results to return.

    Returns:
        Up to ``max_results`` dicts with ``title``, ``url`` and ``snippet``.
        Items without a usable http(s) link are skipped and do not count
        towards the limit.
    """
    url = f"https://search.brave.com/search?q={urllib.parse.quote_plus(query)}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    page.wait_for_timeout(3000)

    # Try multiple selector patterns (Brave changes layout).
    items = page.query_selector_all("div#results div.snippet[data-type='web']")
    if not items:
        items = page.query_selector_all("div.snippet")

    results: list[dict] = []
    for item in items:
        # Count collected results rather than inspected items.
        if len(results) >= max_results:
            break
        try:
            link_el = item.query_selector("a")
            if not link_el:
                continue
            href = link_el.get_attribute("href") or ""
            if not href.startswith("http"):
                continue

            title_el = item.query_selector("div.title, span.snippet-title")
            snippet_el = item.query_selector(
                "div.snippet-description, p.snippet-description"
            )

            title = title_el.inner_text().strip() if title_el else ""
            if not title:
                # Fall back to the anchor text when no title element matched.
                title = link_el.inner_text().strip()
            snippet = snippet_el.inner_text().strip() if snippet_el else ""

            results.append({"title": title, "url": href, "snippet": snippet})
        except Exception:
            # Best effort: skip results whose markup cannot be read.
            continue

    return results
+
+
def search_mojeek(page, query: str, max_results: int) -> list[dict]:
    """Search Mojeek — independent search engine, no CAPTCHA in headless.

    Args:
        page: Playwright page used to load the results.
        query: Search terms; URL-encoded before being sent.
        max_results: Maximum number of results to return.

    Returns:
        Up to ``max_results`` dicts with ``title``, ``url`` and ``snippet``.
        Items without a usable http(s) link are skipped and do not count
        towards the limit.
    """
    url = f"https://www.mojeek.com/search?q={urllib.parse.quote_plus(query)}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    page.wait_for_timeout(2000)

    results: list[dict] = []
    for item in page.query_selector_all("ul.results-standard li"):
        # Count collected results rather than inspected items.
        if len(results) >= max_results:
            break
        try:
            # a.ob carries the result href.
            link_el = item.query_selector("a.ob")
            if not link_el:
                continue

            href = link_el.get_attribute("href") or ""
            if not href.startswith("http"):
                continue

            # a.title carries the page title.
            title_el = item.query_selector("a.title")
            snippet_el = item.query_selector("p.s")

            title = title_el.inner_text().strip() if title_el else ""
            snippet = snippet_el.inner_text().strip() if snippet_el else ""

            # Fall back to the URL when no title text is available.
            results.append({"title": title or href, "url": href, "snippet": snippet})
        except Exception:
            # Best effort: skip results whose markup cannot be read.
            continue

    return results
+
+
def search_ecosia(page, query: str, max_results: int) -> list[dict]:
    """Search Ecosia — Bing-powered, eco-friendly search engine.

    Args:
        page: Playwright page used to load the results.
        query: Search terms; URL-encoded before being sent.
        max_results: Maximum number of results to return.

    Returns:
        Up to ``max_results`` dicts with ``title``, ``url`` and ``snippet``.
        Articles without a usable external link are skipped and do not count
        towards the limit.
    """
    url = f"https://www.ecosia.org/search?method=index&q={urllib.parse.quote_plus(query)}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    page.wait_for_timeout(3000)

    skip_domains = ("ecosia.org", "google.com")

    def is_skipped(link: str) -> bool:
        # Compare against the host only, so result URLs that merely mention a
        # skipped domain in their path or query are kept.
        host = (urllib.parse.urlparse(link).hostname or "").lower()
        return any(host == d or host.endswith("." + d) for d in skip_domains)

    results: list[dict] = []
    for article in page.query_selector_all("article.result"):
        # Count collected results rather than inspected articles.
        if len(results) >= max_results:
            break
        try:
            # Each article has multiple links; ignore Ecosia/Google
            # support and tracking links.
            href = ""
            title = ""
            for a in article.query_selector_all("a[href^='http']"):
                h = a.get_attribute("href") or ""
                if is_skipped(h):
                    continue
                href = h
                # The first external link with real text is used as the title link.
                text = a.inner_text().strip()
                if text and not text.startswith("http"):
                    title = text
                    break

            if not href:
                continue

            # If no link text qualified as a title, try the heading, then the URL.
            if not title:
                h_el = article.query_selector("h2, h3")
                title = h_el.inner_text().strip() if h_el else href

            snippet_el = article.query_selector("p, .result__snippet")
            snippet = snippet_el.inner_text().strip() if snippet_el else ""

            results.append({"title": title, "url": href, "snippet": snippet})
        except Exception:
            # Best effort: skip articles whose markup cannot be read.
            continue

    return results
+
+
+# Dispatch table: engine name accepted on the command line -> scraper function.
+# main() lowercases the engine argument before looking it up here.
+ENGINES = {
+ "bing": search_bing,
+ "brave": search_brave,
+ "mojeek": search_mojeek,
+ "ecosia": search_ecosia,
+}
+
+
def main():
    """Command-line entry point.

    Usage: ``browser-search.py <engine> <max_results> <query...>``

    Writes a JSON array of ``{title, url, snippet}`` objects to stdout.
    Usage and argument errors are reported on stderr with exit status 1.
    """
    import os

    if len(sys.argv) < 4:
        print(
            "Usage: browser-search.py <engine> <max_results> <query...>",
            file=sys.stderr,
        )
        sys.exit(1)

    engine = sys.argv[1].lower()

    # Reject a malformed limit with a clear message instead of a traceback.
    try:
        max_results = int(sys.argv[2])
    except ValueError:
        print(f"max_results must be an integer, got: {sys.argv[2]!r}", file=sys.stderr)
        sys.exit(1)
    if max_results < 0:
        print(f"max_results must not be negative, got: {max_results}", file=sys.stderr)
        sys.exit(1)

    # Remaining arguments are joined so an unquoted multi-word query still works.
    query = " ".join(sys.argv[3:])

    if engine not in ENGINES:
        print(f"Unknown engine: {engine}. Available: {', '.join(ENGINES)}", file=sys.stderr)
        sys.exit(1)

    # Determine Chrome path — prefer system Chrome on NixOS.
    chrome_paths = [
        "/run/current-system/sw/bin/google-chrome-stable",
        "/usr/bin/google-chrome-stable",
        "/usr/bin/chromium-browser",
        "/usr/bin/chromium",
    ]
    # Require an executable file; a non-executable entry cannot be launched.
    chrome_path = next(
        (c for c in chrome_paths if os.path.isfile(c) and os.access(c, os.X_OK)),
        None,
    )

    # Imported late so argument errors are reported even without playwright installed.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as pw:
        launch_args = {
            "headless": True,
            # NOTE(review): --no-sandbox disables Chrome's sandbox while loading
            # third-party pages; confirm this is required in the target environment.
            "args": ["--no-sandbox", "--disable-dev-shm-usage"],
        }
        if chrome_path:
            # Otherwise Playwright falls back to its own bundled Chromium.
            launch_args["executable_path"] = chrome_path

        browser = pw.chromium.launch(**launch_args)
        try:
            page = browser.new_page(
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
            )

            search_fn = ENGINES[engine]
            results = search_fn(page, query, max_results)

            json.dump(results, sys.stdout)
        finally:
            # Always release the browser process, even when the search raises.
            browser.close()
+
+# Run the CLI only when this file is executed as a script.
+if __name__ == "__main__":
+ main()
dots/pi/agent/extensions/search/index.ts
@@ -1,11 +1,17 @@
/**
* Multi-backend Search Extension for Pi
*
- * Provides web search, GitHub code search, and Stack Overflow search tools.
+ * Provides web search and GitHub code search tools.
* Web search uses multiple backends with automatic fallback:
* 1. SearXNG (self-hosted, primary)
- * 2. ddgr CLI (DuckDuckGo via CLI, if installed)
- * 3. DuckDuckGo Instant Answer API (always available fallback)
+ * 2. Playwright/Bing (headless Chrome browser search)
+ * 3. Playwright/Mojeek (independent engine, no CAPTCHA)
+ * 4. Playwright/Ecosia (Bing-powered, good results)
+ * 5. ddgr CLI (DuckDuckGo via CLI)
+ * 6. DuckDuckGo Instant Answer API (always available, limited results)
+ *
+ * Additional backend available via /search-backend:
+ * - brave: Playwright/Brave Search (rate-limited after a few queries)
*
* Configuration via environment variables:
* SEARXNG_URL - SearXNG instance URL (default: https://search.sbr.pm)
@@ -18,6 +24,7 @@ import {
SearXNGBackend,
DuckDuckGoBackend,
DdgrBackend,
+ PlaywrightBackend,
searchWithFallback,
formatResults,
type SearchBackend,
@@ -31,15 +38,24 @@ function truncate(text: string, maxLength: number): string {
export default function (pi: ExtensionAPI) {
// Configure backends
const searxngUrl = process.env.SEARXNG_URL || "https://search.sbr.pm";
+ // __dirname works in jiti (pi's TypeScript loader)
+ const extensionDir = __dirname;
+ const execFn = (cmd: string, args: string[], opts?: any) => pi.exec(cmd, args, opts);
const allBackends: Record<string, SearchBackend> = {
searxng: new SearXNGBackend(searxngUrl),
- ddgr: new DdgrBackend((cmd, args, opts) => pi.exec(cmd, args, opts)),
+ ddgr: new DdgrBackend(execFn),
+ bing: new PlaywrightBackend("bing", execFn, extensionDir),
+ brave: new PlaywrightBackend("brave", execFn, extensionDir),
+ mojeek: new PlaywrightBackend("mojeek", execFn, extensionDir),
+ ecosia: new PlaywrightBackend("ecosia", execFn, extensionDir),
duckduckgo: new DuckDuckGoBackend(),
};
- // Active backends in priority order (all enabled by default)
- let activeBackendNames = ["searxng", "ddgr", "duckduckgo"];
+ // Active backends in priority order
+ // SearXNG first, then Playwright browsers (Bing most reliable, then Mojeek/Ecosia),
+ // then ddgr CLI, then DDG API as last resort. Brave available via /search-backend.
+ let activeBackendNames = ["searxng", "bing", "mojeek", "ecosia", "ddgr", "duckduckgo"];
// Helper to get current active backends
const getActiveBackends = (): SearchBackend[] =>
overlays/default.nix
@@ -11,10 +11,18 @@ in
inherit globals;
};
modifications = final: prev: {
- # example = prev.example.overrideAttrs (oldAttrs: rec {
- # ...
- # });
- # custom-caddy = import ./custom-caddy.nix { pkgs = prev; };
+ # Pin SearXNG to latest commit for critical engine fixes:
+ # - Google useragent bypass (#5892, 2026-03-23)
+ # - Bing engine rework (#5793, 2026-03-18)
+ searxng = prev.searxng.overrideAttrs (_oldAttrs: {
+ version = "0-unstable-2026-03-23";
+ src = final.fetchFromGitHub {
+ owner = "searxng";
+ repo = "searxng";
+ rev = "2c1ce3bd37a2";
+ hash = "sha256-Ml7aJXr4IudH/ZnlXDU8OdnUELBb6/vcpAAhAXwhW8A=";
+ };
+ });
go_1_25_8 = prev.go_1_25.overrideAttrs (_finalAttrs: {
version = "1.25.8";
src = final.fetchurl {