main
1#!/usr/bin/env python3
"""
Playwright-based web search using system Chrome.

Headless-browser search fallback for when API-based backends (SearXNG, ddgr) fail.
Supports multiple engines: Bing, Brave, Mojeek, Ecosia.

Usage:
    browser-search.py <engine> <max_results> <query...>

Output: JSON array of {title, url, snippet} objects on stdout.
Errors go to stderr.
"""
14import json
15import sys
16import urllib.parse
17
18
def extract_bing_url(href: str) -> str:
    """Return the destination URL hidden inside a Bing click-tracking link.

    Bing result links may point at a ``/ck/a?...&u=a1<token>`` redirect whose
    ``u`` parameter carries the real URL as ``a1`` followed by base64 text.
    The token is decoded with the URL-safe alphabet and with padding restored,
    so targets whose encoding contains ``-`` or ``_`` survive intact.

    Args:
        href: The raw ``href`` attribute taken from a result link.

    Returns:
        The decoded target URL, or ``href`` unchanged when it is not a
        tracking link or the token does not decode to an http(s) URL.
    """
    if "/ck/a?" not in href:
        return href

    import base64  # local import: only needed for tracking links

    query = urllib.parse.urlparse(href).query
    token = urllib.parse.parse_qs(query).get("u", [""])[0]
    if not token.startswith("a1"):
        return href

    # parse_qs turns "+" into a space; undo that in case the standard
    # base64 alphabet was used instead of the URL-safe one.
    payload = token[2:].replace(" ", "+")
    # Restore exactly the padding that was stripped from the token.
    payload += "=" * (-len(payload) % 4)
    try:
        decoded = base64.urlsafe_b64decode(payload).decode("utf-8")
    except ValueError:
        # Covers both binascii.Error (bad base64) and UnicodeDecodeError.
        return href

    # Only trust the decoded text when it actually looks like a web URL;
    # otherwise the tracking link is still the best thing we have.
    if decoded.startswith(("http://", "https://")):
        return decoded
    return href
35
36
def search_bing(page, query: str, max_results: int) -> list[dict]:
    """Search Bing and scrape organic results from ``li.b_algo`` elements.

    Args:
        page: Playwright page (sync API) used to load and query the results.
        query: Free-text search query.
        max_results: Maximum number of results to return.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts, at most
        ``max_results`` long. Entries without an absolute http(s) link are
        skipped and do not count towards the limit.
    """
    if max_results <= 0:
        return []

    url = f"https://www.bing.com/search?q={urllib.parse.quote_plus(query)}&count={max_results}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    # Fixed pause after DOMContentLoaded (presumably to let the page settle).
    page.wait_for_timeout(2000)

    results = []
    for item in page.query_selector_all("li.b_algo"):
        # Filter first, then stop: skipped entries must not use up slots.
        if len(results) >= max_results:
            break
        try:
            link_el = item.query_selector("h2 a")
            if not link_el:
                continue

            href = link_el.get_attribute("href") or ""
            if not href.startswith("http"):
                continue

            snippet_el = item.query_selector("p, .b_caption p, .b_lineclamp2")
            results.append({
                "title": link_el.inner_text().strip(),
                # Bing may wrap the link in a click-tracking redirect.
                "url": extract_bing_url(href),
                "snippet": snippet_el.inner_text().strip() if snippet_el else "",
            })
        except Exception:
            # Deliberate best-effort: one unreadable entry must not abort
            # the whole scrape, so skip it and carry on.
            continue

    return results
70
71
def search_brave(page, query: str, max_results: int) -> list[dict]:
    """Search Brave and scrape results from its ``snippet`` containers.

    Args:
        page: Playwright page (sync API) used to load and query the results.
        query: Free-text search query.
        max_results: Maximum number of results to return.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts, at most
        ``max_results`` long. Entries without an absolute http(s) link are
        skipped and do not count towards the limit.
    """
    if max_results <= 0:
        return []

    url = f"https://search.brave.com/search?q={urllib.parse.quote_plus(query)}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    # Fixed pause after DOMContentLoaded (presumably to let the page settle).
    page.wait_for_timeout(3000)

    # Try the specific selector first, then a looser one (Brave changes layout).
    items = page.query_selector_all("div#results div.snippet[data-type='web']")
    if not items:
        items = page.query_selector_all("div.snippet")

    results = []
    for item in items:
        # Filter first, then stop: skipped entries must not use up slots.
        if len(results) >= max_results:
            break
        try:
            link_el = item.query_selector("a")
            if not link_el:
                continue

            href = link_el.get_attribute("href") or ""
            if not href.startswith("http"):
                continue

            title_el = item.query_selector("div.title, span.snippet-title")
            snippet_el = item.query_selector(
                "div.snippet-description, p.snippet-description"
            )

            title = title_el.inner_text().strip() if title_el else ""
            if not title:
                # No dedicated title element: fall back to the link text.
                title = link_el.inner_text().strip()
            snippet = snippet_el.inner_text().strip() if snippet_el else ""

            results.append({"title": title, "url": href, "snippet": snippet})
        except Exception:
            # Deliberate best-effort: one unreadable entry must not abort
            # the whole scrape, so skip it and carry on.
            continue

    return results
109
110
def search_mojeek(page, query: str, max_results: int) -> list[dict]:
    """Search Mojeek — independent search engine, no CAPTCHA in headless.

    Args:
        page: Playwright page (sync API) used to load and query the results.
        query: Free-text search query.
        max_results: Maximum number of results to return.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts, at most
        ``max_results`` long. Entries without an absolute http(s) link are
        skipped and do not count towards the limit.
    """
    if max_results <= 0:
        return []

    url = f"https://www.mojeek.com/search?q={urllib.parse.quote_plus(query)}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    # Fixed pause after DOMContentLoaded (presumably to let the page settle).
    page.wait_for_timeout(2000)

    results = []
    for item in page.query_selector_all("ul.results-standard li"):
        # Filter first, then stop: skipped entries must not use up slots.
        if len(results) >= max_results:
            break
        try:
            # a.ob carries the href; a.title carries the real page title.
            link_el = item.query_selector("a.ob")
            if not link_el:
                continue

            href = link_el.get_attribute("href") or ""
            if not href.startswith("http"):
                continue

            title_el = item.query_selector("a.title")
            snippet_el = item.query_selector("p.s")

            title = title_el.inner_text().strip() if title_el else ""
            if not title:
                # No usable title text: show the URL itself instead.
                title = href
            snippet = snippet_el.inner_text().strip() if snippet_el else ""

            results.append({"title": title, "url": href, "snippet": snippet})
        except Exception:
            # Deliberate best-effort: one unreadable entry must not abort
            # the whole scrape, so skip it and carry on.
            continue

    return results
145
146
def search_ecosia(page, query: str, max_results: int) -> list[dict]:
    """Search Ecosia — Bing-powered, eco-friendly search engine.

    Each ``article.result`` holds several links. Links whose host is
    ecosia.org or google.com (or a subdomain of either) are ignored, and the
    first remaining link with non-URL text supplies both URL and title.

    Args:
        page: Playwright page (sync API) used to load and query the results.
        query: Free-text search query.
        max_results: Maximum number of results to return.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts, at most
        ``max_results`` long. Articles without a usable link are skipped and
        do not count towards the limit.
    """
    if max_results <= 0:
        return []

    url = f"https://www.ecosia.org/search?method=index&q={urllib.parse.quote_plus(query)}"
    page.goto(url, wait_until="domcontentloaded", timeout=15000)
    # Fixed pause after DOMContentLoaded (presumably to let the page settle).
    page.wait_for_timeout(3000)

    skip_domains = {"ecosia.org", "google.com"}

    results = []
    for article in page.query_selector_all("article.result"):
        # Filter first, then stop: skipped articles must not use up slots.
        if len(results) >= max_results:
            break
        try:
            href = ""
            title = ""
            for anchor in article.query_selector_all("a[href^='http']"):
                candidate = anchor.get_attribute("href") or ""
                # Skip Ecosia and Google support/tracking links.
                if _ecosia_should_skip(candidate, skip_domains):
                    continue
                href = candidate
                # The first link with real text (not a bare URL) is taken
                # as the title link; until one is found, keep scanning.
                text = anchor.inner_text().strip()
                if text and not text.startswith("http"):
                    title = text
                    break

            if not href:
                continue

            # If no link text qualified as a title, try the heading.
            if not title:
                heading_el = article.query_selector("h2, h3")
                title = heading_el.inner_text().strip() if heading_el else href

            snippet_el = article.query_selector("p, .result__snippet")
            snippet = snippet_el.inner_text().strip() if snippet_el else ""

            results.append({"title": title, "url": href, "snippet": snippet})
        except Exception:
            # Deliberate best-effort: one unreadable article must not abort
            # the whole scrape, so skip it and carry on.
            continue

    return results


def _ecosia_should_skip(link: str, skip_domains) -> bool:
    """Return True when *link*'s host is in *skip_domains* or a subdomain of one."""
    try:
        host = (urllib.parse.urlsplit(link).hostname or "").lower()
    except ValueError:
        # Unparseable URL: it cannot be a usable result link.
        return True
    # Compare hosts, not substrings, so "google.com" appearing in a path or
    # query string of an unrelated site does not discard a real result.
    return any(host == d or host.endswith("." + d) for d in skip_domains)
192
193
# Dispatch table: engine name accepted on the command line -> scraper function.
ENGINES = dict(
    bing=search_bing,
    brave=search_brave,
    mojeek=search_mojeek,
    ecosia=search_ecosia,
)
200
201
def main():
    """Command-line entry point: run one search and print the results as JSON.

    Reads ``<engine> <max_results> <query...>`` from ``sys.argv``, launches
    headless Chromium through Playwright (pointing it at a system
    Chrome/Chromium binary when one of the known paths exists), runs the
    selected engine's scraper and writes the JSON array to stdout.

    Exits with status 1, after printing a message to stderr, when arguments
    are missing, ``max_results`` is not a non-negative integer, or the
    engine is unknown.
    """
    if len(sys.argv) < 4:
        print(
            "Usage: browser-search.py <engine> <max_results> <query...>",
            file=sys.stderr,
        )
        sys.exit(1)

    engine = sys.argv[1].lower()
    try:
        max_results = int(sys.argv[2])
    except ValueError:
        # Report bad input like any other usage error instead of a traceback.
        print(
            f"Invalid max_results: {sys.argv[2]!r} (expected an integer)",
            file=sys.stderr,
        )
        sys.exit(1)
    if max_results < 0:
        print(
            f"Invalid max_results: {max_results} (must not be negative)",
            file=sys.stderr,
        )
        sys.exit(1)
    query = " ".join(sys.argv[3:])

    if engine not in ENGINES:
        print(f"Unknown engine: {engine}. Available: {', '.join(ENGINES)}", file=sys.stderr)
        sys.exit(1)

    import os

    # Determine Chrome path — prefer system Chrome on NixOS.
    chrome_candidates = (
        "/run/current-system/sw/bin/google-chrome-stable",
        "/usr/bin/google-chrome-stable",
        "/usr/bin/chromium-browser",
        "/usr/bin/chromium",
    )
    chrome_path = next(
        (path for path in chrome_candidates if os.path.isfile(path)), None
    )

    # Imported here rather than at module level, so the argument checks
    # above run even when Playwright cannot be imported.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as playwright:
        launch_args = {
            "headless": True,
            "args": ["--no-sandbox", "--disable-dev-shm-usage"],
        }
        if chrome_path:
            # Without this, Playwright falls back to its own default browser.
            launch_args["executable_path"] = chrome_path

        browser = playwright.chromium.launch(**launch_args)
        try:
            page = browser.new_page(
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
            )

            search_fn = ENGINES[engine]
            results = search_fn(page, query, max_results)

            json.dump(results, sys.stdout)
        finally:
            # Always shut the browser down, even if the scrape raised.
            browser.close()


if __name__ == "__main__":
    main()