browser-search.py

  1#!/usr/bin/env python3
  2"""
  3Playwright-based web search using system Chrome.
  4
  5Headless browser search fallback for when API-based backends (SearXNG, ddgr) fail.
  6Supports multiple engines: Bing, Brave, Mojeek, Ecosia.
  7
  8Usage:
  9  browser-search.py <engine> <max_results> <query...>
 10
 11Output: JSON array of {title, url, snippet} objects on stdout.
 12Errors go to stderr.
 13"""
 14import json
 15import sys
 16import urllib.parse
 17
 18
 19def extract_bing_url(href: str) -> str:
 20    """Extract the real URL from a Bing click-tracking redirect."""
 21    if "/ck/a?" not in href:
 22        return href
 23    # The real URL is base64-encoded in the 'u' parameter as 'a1<base64>'
 24    parsed = urllib.parse.urlparse(href)
 25    params = urllib.parse.parse_qs(parsed.query)
 26    if "u" in params:
 27        u_val = params["u"][0]
 28        if u_val.startswith("a1"):
 29            import base64
 30            try:
 31                return base64.b64decode(u_val[2:] + "==").decode("utf-8", errors="replace")
 32            except Exception:
 33                pass
 34    return href
 35
 36
 37def search_bing(page, query: str, max_results: int) -> list[dict]:
 38    """Search Bing and extract results from li.b_algo elements."""
 39    url = f"https://www.bing.com/search?q={urllib.parse.quote_plus(query)}&count={max_results}"
 40    page.goto(url, wait_until="domcontentloaded", timeout=15000)
 41    page.wait_for_timeout(2000)
 42
 43    results = []
 44    items = page.query_selector_all("li.b_algo")
 45
 46    for item in items[:max_results]:
 47        try:
 48            link_el = item.query_selector("h2 a")
 49            snippet_el = item.query_selector("p, .b_caption p, .b_lineclamp2")
 50
 51            if not link_el:
 52                continue
 53
 54            href = link_el.get_attribute("href") or ""
 55            if not href.startswith("http"):
 56                continue
 57
 58            title = link_el.inner_text().strip()
 59            snippet = snippet_el.inner_text().strip() if snippet_el else ""
 60
 61            results.append({
 62                "title": title,
 63                "url": extract_bing_url(href),
 64                "snippet": snippet,
 65            })
 66        except Exception:
 67            continue
 68
 69    return results
 70
 71
 72def search_brave(page, query: str, max_results: int) -> list[dict]:
 73    """Search Brave and extract results."""
 74    url = f"https://search.brave.com/search?q={urllib.parse.quote_plus(query)}"
 75    page.goto(url, wait_until="domcontentloaded", timeout=15000)
 76    page.wait_for_timeout(3000)
 77
 78    results = []
 79
 80    # Try multiple selector patterns (Brave changes layout)
 81    items = page.query_selector_all("div#results div.snippet[data-type='web']")
 82    if not items:
 83        items = page.query_selector_all("div.snippet")
 84
 85    for item in items[:max_results]:
 86        try:
 87            link_el = item.query_selector("a")
 88            title_el = item.query_selector("div.title, span.snippet-title")
 89            snippet_el = item.query_selector(
 90                "div.snippet-description, p.snippet-description"
 91            )
 92
 93            if not link_el:
 94                continue
 95            href = link_el.get_attribute("href") or ""
 96            if not href.startswith("http"):
 97                continue
 98
 99            title = title_el.inner_text().strip() if title_el else ""
100            if not title:
101                title = link_el.inner_text().strip()
102            snippet = snippet_el.inner_text().strip() if snippet_el else ""
103
104            results.append({"title": title, "url": href, "snippet": snippet})
105        except Exception:
106            continue
107
108    return results
109
110
def search_mojeek(page, query: str, max_results: int) -> list[dict]:
    """Search Mojeek — independent search engine, no CAPTCHA in headless.

    Args:
        page: Playwright page (sync API) used to load the results page.
        query: Search terms; URL-encoded before being sent.
        max_results: Upper bound on returned results; values <= 0 yield ``[]``.

    Returns:
        Up to ``max_results`` dicts with ``title``, ``url`` and ``snippet``
        keys, in page order. The URL doubles as the title when the result
        has no title text.
    """
    if max_results <= 0:
        # A negative bound would otherwise slice "all but the last N" items.
        return []

    url = f"https://www.mojeek.com/search?q={urllib.parse.quote_plus(query)}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    # Fixed pause after DOMContentLoaded, presumably so late-rendered results
    # are present before scraping.
    page.wait_for_timeout(2000)

    results = []
    for item in page.query_selector_all("ul.results-standard li"):
        # Cap on *accepted* results, so skipped items do not shrink the output.
        if len(results) >= max_results:
            break
        try:
            # a.ob carries the destination href.
            link_el = item.query_selector("a.ob")
            if not link_el:
                continue

            href = link_el.get_attribute("href") or ""
            if not href.startswith("http"):
                continue

            # a.title (inside the h2) carries the real page title.
            title_el = item.query_selector("a.title")
            snippet_el = item.query_selector("p.s")

            title = title_el.inner_text().strip() if title_el else ""
            snippet = snippet_el.inner_text().strip() if snippet_el else ""

            results.append({"title": title or href, "url": href, "snippet": snippet})
        except Exception as exc:
            # Best effort: one unreadable result must not abort the search,
            # but report it instead of hiding it.
            print(f"mojeek: skipping unreadable result: {exc}", file=sys.stderr)
            continue

    return results
145
146
def _should_skip_link(url: str, skip_domains) -> bool:
    """Return True if ``url`` has no usable host or its host is in ``skip_domains``."""
    try:
        host = urllib.parse.urlsplit(url).hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): not usable.
        return True
    if not host:
        return True
    # Match the domain itself or any subdomain, never arbitrary substrings of
    # the path or query.
    return any(host == d or host.endswith("." + d) for d in skip_domains)


def search_ecosia(page, query: str, max_results: int) -> list[dict]:
    """Search Ecosia — Bing-powered, eco-friendly search engine.

    Args:
        page: Playwright page (sync API) used to load the results page.
        query: Search terms; URL-encoded before being sent.
        max_results: Upper bound on returned results; values <= 0 yield ``[]``.

    Returns:
        Up to ``max_results`` dicts with ``title``, ``url`` and ``snippet``
        keys, in page order. Each result uses the first link in its article
        whose host is not an Ecosia or Google domain.
    """
    if max_results <= 0:
        # A negative bound would otherwise slice "all but the last N" items.
        return []

    url = f"https://www.ecosia.org/search?method=index&q={urllib.parse.quote_plus(query)}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    # Fixed pause after DOMContentLoaded, presumably so late-rendered results
    # are present before scraping.
    page.wait_for_timeout(3000)

    skip_domains = {"ecosia.org", "google.com"}

    results = []
    for article in page.query_selector_all("article.result"):
        # Cap on *accepted* results, so skipped items do not shrink the output.
        if len(results) >= max_results:
            break
        try:
            # Each article has multiple links; take the first one that is not
            # an Ecosia/Google support or tracking link.
            href = ""
            title = ""
            for anchor in article.query_selector_all("a[href^='http']"):
                candidate = anchor.get_attribute("href") or ""
                if _should_skip_link(candidate, skip_domains):
                    continue
                href = candidate
                # Use the link text as title unless it is just the URL again.
                text = anchor.inner_text().strip()
                if text and not text.startswith("http"):
                    title = text
                break

            if not href:
                continue

            # If the link text gave no title, fall back to the heading, then
            # to the URL itself.
            if not title:
                heading_el = article.query_selector("h2, h3")
                title = heading_el.inner_text().strip() if heading_el else href

            snippet_el = article.query_selector("p, .result__snippet")
            snippet = snippet_el.inner_text().strip() if snippet_el else ""

            results.append({"title": title, "url": href, "snippet": snippet})
        except Exception as exc:
            # Best effort: one unreadable result must not abort the search,
            # but report it instead of hiding it.
            print(f"ecosia: skipping unreadable result: {exc}", file=sys.stderr)
            continue

    return results
192
193
# Dispatch table: CLI engine name -> scraper function.
# Every scraper is called as fn(page, query, max_results) and returns a list
# of {title, url, snippet} dicts.
ENGINES = dict(
    bing=search_bing,
    brave=search_brave,
    mojeek=search_mojeek,
    ecosia=search_ecosia,
)
200
201
def main():
    """Command-line entry point: run one search and print the results as JSON.

    Expects ``sys.argv`` to be ``<engine> <max_results> <query...>``; all
    arguments after ``max_results`` are joined with spaces to form the query.
    Launches headless Chromium through Playwright (using a system Chrome or
    Chromium binary when one exists at a known path), runs the selected
    engine's scraper and writes the resulting JSON array to stdout.

    Exits with status 1 and a message on stderr when too few arguments are
    given, when ``max_results`` is not a non-negative integer, or when the
    engine name is unknown.
    """
    import os

    if len(sys.argv) < 4:
        print(
            "Usage: browser-search.py <engine> <max_results> <query...>",
            file=sys.stderr,
        )
        sys.exit(1)

    engine = sys.argv[1].lower()
    try:
        max_results = int(sys.argv[2])
        if max_results < 0:
            raise ValueError(sys.argv[2])
    except ValueError:
        # Report bad input as a usage error instead of a raw traceback.
        print(
            f"Invalid max_results {sys.argv[2]!r}: expected a non-negative integer",
            file=sys.stderr,
        )
        sys.exit(1)
    query = " ".join(sys.argv[3:])

    if engine not in ENGINES:
        print(f"Unknown engine: {engine}. Available: {', '.join(ENGINES)}", file=sys.stderr)
        sys.exit(1)

    # Determine Chrome path — prefer system Chrome on NixOS. When none of
    # these exist, Playwright's own default Chromium is used.
    chrome_paths = [
        "/run/current-system/sw/bin/google-chrome-stable",
        "/usr/bin/google-chrome-stable",
        "/usr/bin/chromium-browser",
        "/usr/bin/chromium",
    ]
    chrome_path = next(filter(os.path.isfile, chrome_paths), None)

    # Imported lazily so usage errors above do not require Playwright.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as playwright:
        launch_args = {
            "headless": True,
            # NOTE(review): --no-sandbox disables Chromium's sandbox;
            # presumably required in the target environment — confirm.
            "args": ["--no-sandbox", "--disable-dev-shm-usage"],
        }
        if chrome_path:
            launch_args["executable_path"] = chrome_path

        browser = playwright.chromium.launch(**launch_args)
        try:
            page = browser.new_page(
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
            )

            search_fn = ENGINES[engine]
            results = search_fn(page, query, max_results)

            json.dump(results, sys.stdout)
        finally:
            # Always release the browser process, even if scraping failed.
            browser.close()
255
256
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()