From 60e646170705a0afe5a4a56f45a21141cc73d95b Mon Sep 17 00:00:00 2001 From: zlei9 Date: Sun, 29 Mar 2026 13:18:55 +0800 Subject: [PATCH] Initial commit with translated description --- CHANGELOG.md | 536 ++++++++ FAQ.md | 263 ++++ README.md | 800 ++++++++++++ SKILL.md | 258 ++++ TROUBLESHOOTING.md | 315 +++++ _meta.json | 6 + config.example.json | 265 ++++ package.json | 88 ++ scripts/search.py | 2940 ++++++++++++++++++++++++++++++++++++++++++ scripts/setup.py | 453 +++++++ test-auto-routing.sh | 20 + 11 files changed, 5944 insertions(+) create mode 100644 CHANGELOG.md create mode 100644 FAQ.md create mode 100644 README.md create mode 100644 SKILL.md create mode 100644 TROUBLESHOOTING.md create mode 100644 _meta.json create mode 100644 config.example.json create mode 100644 package.json create mode 100644 scripts/search.py create mode 100644 scripts/setup.py create mode 100644 test-auto-routing.sh diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..a75e361 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,536 @@ +# Changelog - Web Search Plus + +## [2.9.2] - 2026-03-27 + +### Fixed +- Replaced hardcoded temporary cache path examples with portable `$TMP_DIR` placeholders in `TROUBLESHOOTING.md` + +## [2.9.0] - 2026-03-12 + +### ✨ New Provider: Querit (Multilingual AI Search) + +[Querit.ai](https://querit.ai) is a Singapore-based multilingual AI search API purpose-built for LLMs and RAG pipelines. 300 billion page index, 20+ countries, 10+ languages. 
+ +- Added **Querit** as the 7th search provider via `https://api.querit.ai/v1/search` +- Configure via `QUERIT_API_KEY` — optional, gracefully skipped if not set +- Routing score: `research * 0.65 + rag * 0.35 + recency * 0.45` — favored for multilingual and real-time queries +- Handles Querit's quirky `error_code=200` responses as success (not an error) +- Handles `IncompleteRead` as transient/retryable failure +- Live-tested with 10 benchmark queries ✅ + +### 🔧 Fixed: Fallback chain dies on unconfigured provider + +- `sys.exit(1)` in `validate_api_key()` raised `SystemExit` (inherits from `BaseException`), which bypassed the `except Exception` fallback loop and killed the entire process instead of trying the next provider +- Replaced with catchable `ProviderConfigError` — fallback chain now continues correctly through all configured providers + +### 🔧 Fixed: Perplexity citations are generic placeholders + +- Previously extracted citation URLs via regex from the answer text, resulting in generic "Source 1" / "Source 2" labels +- Now uses the structured `data["citations"]` array from the Perplexity API response directly — results have readable titles +- Regex extraction kept as fallback when API doesn't return a `citations` field + +### ✨ Improved: German locale routing patterns + +- Added German-language signal patterns for local and news queries +- Improves auto-routing for queries like `"aktuelle Nachrichten"`, `"beste Restaurants Graz"`, `"KI Regulierung Europa"` + +### 📝 Documentation + +- Added Querit to README provider tables, routing examples, and API key setup section +- Added `querit_api_key` to `config.example.json` +- Updated `SKILL.md` provider mentions and env metadata +- Bumped package version to `2.9.0` + + +## [2.8.6] - 2026-03-03 + +### Changed +- Documented Perplexity Sonar Pro usage and refreshed release docs. 
+ + +## [2.8.5] - 2026-02-20 + +### ✨ Feature: Perplexity freshness filter + +- Added `freshness` parameter to Perplexity provider (`day`, `week`, `month`, `year`) +- Maps to Perplexity's native `search_recency_filter` parameter +- Example: `python3 scripts/search.py -p perplexity -q "latest AI news" --freshness day` +- Consistent with freshness support in Serper and Brave providers + +## [2.8.4] - 2026-02-20 + +### 🔒 Security Fix: SSRF protection in setup wizard + +- **Fixed:** `setup.py` SearXNG connection test had no SSRF protection (unlike `search.py`) +- **Before:** Operator could be tricked into probing internal networks during setup +- **After:** Same IP validation as `search.py` — blocks private IPs, cloud metadata, loopback +- **Credit:** ClawHub security scanner + +## [2.8.3] - 2026-02-20 + +### 🐛 Critical Fix: Perplexity results empty + +- **Fixed:** Perplexity provider returned 0 results because the AI-synthesized answer wasn't mapped into the results array +- **Before:** Only extracted URLs from the answer text were returned as results (often 0) +- **After:** The full answer is now the primary result (title, snippet with cleaned text), extracted source URLs follow as additional results +- **Impact:** Perplexity queries now always return at least 1 result with the synthesized answer + +## [2.8.0] - 2026-02-20 + +### 🆕 New Provider: Perplexity (AI-Synthesized Answers) + +Added Perplexity as the 6th search provider via Kilo Gateway — the first provider that returns **direct answers with citations** instead of just links: + +#### Features +- **AI-Synthesized Answers**: Get a complete answer, not a list of links +- **Inline Citations**: Every claim backed by `[1][2][3]` source references +- **Real-Time Web Search**: Perplexity searches the web live, reads pages, and summarizes +- **Zero Extra Config**: Works through Kilo Gateway with your existing `KILOCODE_API_KEY` +- **Model**: `perplexity/sonar-pro` (best quality, supports complex queries) + 
+#### Auto-Routing Signals +New direct-answer intent detection routes to Perplexity for: +- Status queries: "status of", "current state of", "what is the status" +- Local info: "events in [city]", "things to do in", "what's happening in" +- Direct questions: "what is", "who is", "when did", "how many" +- Current affairs: "this week", "this weekend", "right now", "today" + +#### Usage Examples +```bash +# Auto-routed +python3 scripts/search.py -q "events in Graz Austria this weekend" # β†’ Perplexity +python3 scripts/search.py -q "what is the current status of Ethereum" # β†’ Perplexity + +# Explicit +python3 scripts/search.py -p perplexity -q "latest AI regulation news" +``` + +#### Configuration +Requires `KILOCODE_API_KEY` environment variable (Kilo Gateway account). +No additional API key needed β€” Perplexity is accessed through Kilo's unified API. + +```bash +export KILOCODE_API_KEY="your-kilo-key" +``` + +### πŸ”§ Routing Rebalance + +Major overhaul of the auto-routing confidence scoring to fix Serper dominance: + +#### Problem +Serper (Google) was winning ~90% of queries due to: +- High recency multiplier boosting Serper on any query with dates/years +- Default provider priority placing Serper first in ties +- Research and discovery signals not strong enough to override + +#### Changes +- **Lowered Serper recency multiplier** β€” date mentions no longer auto-route to Google +- **Strengthened research signals** for Tavily: + - Added: "status of", "what happened with", "how does X compare" + - Boosted weights for comparison patterns (4.0 β†’ 5.0) +- **Strengthened discovery signals** for Exa: + - Added: "events in", "things to do in", "startups similar to" + - Boosted weights for local discovery patterns +- **Updated provider priority order**: `tavily β†’ exa β†’ perplexity β†’ serper β†’ you β†’ searxng` + - Serper moved from 1st to 4th in tie-breaking + - Research/discovery providers now win on ambiguous queries + +#### Routing Test Results + +| Query | 
Before | After | βœ“ | +|-------|--------|-------|---| +| "latest OpenClaw version Feb 2026" | Serper | Serper | βœ… | +| "Ethereum Pectra upgrade status" | Serper | **Tavily** | βœ… | +| "events in Graz this weekend" | Serper | **Perplexity** | βœ… | +| "compare SearXNG vs Brave for AI agents" | Serper | **Tavily** | βœ… | +| "Sam Altman OpenAI news this week" | Serper | Serper | βœ… | +| "find startups similar to Kilo Code" | Serper | **Exa** | βœ… | + +### πŸ“Š Updated Provider Comparison + +| Feature | Serper | Tavily | Exa | Perplexity | You.com | SearXNG | +|---------|:------:|:------:|:---:|:----------:|:-------:|:-------:| +| Speed | ⚑⚑⚑ | ⚑⚑ | ⚑⚑ | ⚑⚑ | ⚑⚑⚑ | ⚑ | +| Direct Answers | βœ— | βœ— | βœ— | βœ“βœ“ | βœ— | βœ— | +| Citations | βœ— | βœ— | βœ— | βœ“ | βœ— | βœ— | +| Local Events | βœ“ | βœ— | βœ“ | βœ“βœ“ | βœ— | βœ“ | +| Research | βœ— | βœ“βœ“ | βœ“ | βœ“ | βœ“ | βœ— | +| Discovery | βœ— | βœ— | βœ“βœ“ | βœ— | βœ— | βœ— | +| Self-Hosted | βœ— | βœ— | βœ— | βœ— | βœ— | βœ“ | + +## [2.7.0] - 2026-02-14 + +### ✨ Added +- Provider cooldown tracking in `.cache/provider_health.json` +- Exponential cooldown on provider failures: **1m β†’ 5m β†’ 25m β†’ 1h (cap)** +- Retry strategy for transient failures (timeout, 429, 503): up to 2 retries with backoff **1s β†’ 3s β†’ 9s** +- Smarter cache keys hashed from full request context (query/provider/max_results + locale, freshness, time_range, topic, search_engines, include_news, and related params) +- Cross-provider result deduplication by normalized URL during fallback merge + +### πŸ”§ Changed +- Cooldown providers are skipped in routing while their cooldown is active +- Provider health is reset automatically after successful requests +- Fallback output now includes dedup metadata: + - `deduplicated: true|false` + - `metadata.dedup_count` + + +## [2.6.5] - 2026-02-11 + +### πŸ†• File-Based Result Caching + +Added local caching to save API costs on repeated searches: + +#### Features +- **Automatic 
Caching**: Search results cached locally by default +- **1-Hour TTL**: Results expire after 3600 seconds (configurable) +- **Cache Indicators**: Response includes `cached: true/false` and `cache_age_seconds` +- **Zero-Cost Repeats**: Cached requests don't hit APIs + +#### New CLI Options +- `--cache-ttl SECONDS` β€” Custom cache TTL (default: 3600) +- `--no-cache` β€” Bypass cache, always fetch fresh +- `--clear-cache` β€” Delete all cached results +- `--cache-stats` β€” Show cache statistics (entries, size, age) + +#### Configuration +- **Cache directory**: `.cache/` in skill directory +- **Environment variable**: `WSP_CACHE_DIR` to override location +- **Cache key**: Based on query + provider + max_results (SHA256) + +#### Usage Examples +```bash +# First request costs API credits +python3 scripts/search.py -q "AI startups" + +# Second request is FREE (uses cache) +python3 scripts/search.py -q "AI startups" + +# Force fresh results +python3 scripts/search.py -q "AI startups" --no-cache + +# View stats +python3 scripts/search.py --cache-stats + +# Clear everything +python3 scripts/search.py --clear-cache +``` + +#### Technical Details +- Cache files: JSON with metadata (_cache_timestamp, _cache_key, etc.) +- Automatic cleanup of expired entries on access +- Graceful handling of corrupted cache files + +## [2.6.1] - 2026-02-04 + +- Privacy cleanup: removed hardcoded paths and personal info from docs + +## [2.5.0] - 2026-02-03 + +### πŸ†• New Provider: SearXNG (Privacy-First Meta-Search) + +Added SearXNG as the 5th search provider, focused on privacy and self-hosted search: + +#### Features +- **Privacy-Preserving**: No tracking, no profiling β€” your searches stay private +- **Multi-Source Aggregation**: Queries 70+ upstream engines (Google, Bing, DuckDuckGo, etc.) 
+- **$0 API Cost**: Self-hosted = unlimited queries with no API fees +- **Diverse Results**: Get perspectives from multiple search engines in one query +- **Customizable**: Choose which engines to use, set SafeSearch levels, language preferences + +#### Auto-Routing Signals +New privacy/multi-source intent detection routes to SearXNG for: +- Privacy queries: "private", "anonymous", "without tracking", "no tracking" +- Multi-source: "aggregate results", "multiple sources", "diverse perspectives" +- Budget/free: "free search", "no api cost", "self-hosted search" +- German: "privat", "anonym", "ohne tracking", "verschiedene quellen" + +#### Usage Examples +```bash +# Auto-routed +python3 scripts/search.py -q "search privately without tracking" # β†’ SearXNG + +# Explicit +python3 scripts/search.py -p searxng -q "linux distros" +python3 scripts/search.py -p searxng -q "AI news" --engines "google,bing,duckduckgo" +python3 scripts/search.py -p searxng -q "privacy tools" --searxng-safesearch 2 +``` + +#### Configuration +```json +{ + "searxng": { + "instance_url": "https://your-instance.example.com", + "safesearch": 0, + "engines": null, + "language": "en" + } +} +``` + +#### Setup +SearXNG requires a self-hosted instance with JSON format enabled: +```bash +# Docker setup (5 minutes) +docker run -d -p 8080:8080 searxng/searxng + +# Enable JSON in settings.yml: +# search: +# formats: [html, json] + +# Set instance URL +export SEARXNG_INSTANCE_URL="http://localhost:8080" +``` + +See: https://docs.searxng.org/admin/installation.html + +### πŸ“Š Updated Provider Comparison + +| Feature | Serper | Tavily | Exa | You.com | SearXNG | +|---------|:------:|:------:|:---:|:-------:|:-------:| +| Privacy-First | βœ— | βœ— | βœ— | βœ— | βœ“βœ“ | +| Self-Hosted | βœ— | βœ— | βœ— | βœ— | βœ“ | +| API Cost | $$ | $$ | $$ | $ | **FREE** | +| Multi-Engine | βœ— | βœ— | βœ— | βœ— | βœ“ (70+) | + +### πŸ”§ Technical Changes + +- Added `search_searxng()` function with full error handling +- 
Added `PRIVACY_SIGNALS` to QueryAnalyzer for auto-routing +- Updated setup wizard with SearXNG option (instance URL validation) +- Updated config.example.json with searxng section +- New CLI args: `--searxng-url`, `--searxng-safesearch`, `--engines`, `--categories` + +--- + +## [2.4.4] - 2026-02-03 + +### πŸ“ Documentation: Provider Count Fix + +- **Fixed:** "You can use 1, 2, or all 3" β†’ "1, 2, 3, or all 4" (we have 4 providers now!) +- **Impact:** Accurate documentation for setup wizard + +## [2.4.3] - 2026-02-03 + +### πŸ“ Documentation: Updated README + +- **Added:** "NEW in v2.4.2" badge for You.com in SKILL.md +- **Impact:** ClawHub README now properly highlights You.com as new feature + +## [2.4.2] - 2026-02-03 + +### πŸ› Critical Fix: You.com API Configuration + +- **Fixed:** Incorrect hostname (`api.ydc-index.io` β†’ `ydc-index.io`) +- **Fixed:** Incorrect header name (`X-API-Key` β†’ `X-API-KEY` uppercase) +- **Impact:** You.com now works correctly - was giving 403 Forbidden before +- **Status:** βœ… Fully tested and working + +## [2.4.1] - 2026-02-03 + +### πŸ› Bugfix: You.com URL Encoding + +- **Fixed:** URL encoding for You.com queries - spaces and special characters now properly encoded +- **Impact:** Queries with spaces (e.g., "OpenClaw AI framework") work correctly now +- **Technical:** Added `urllib.parse.quote` for parameter encoding + +## [2.4.0] - 2026-02-03 + +### πŸ†• New Provider: You.com + +Added You.com as the 4th search provider, optimized for RAG applications and real-time information: + +#### Features +- **LLM-Ready Snippets**: Pre-extracted, query-aware text excerpts perfect for feeding into AI models +- **Unified Web + News**: Get both web pages and news articles in a single API call +- **Live Crawling**: Fetch full page content on-demand in Markdown format (`--livecrawl`) +- **Automatic News Classification**: Intelligently includes news results based on query intent +- **Freshness Controls**: Filter by recency (day, week, month, 
year, or date range) +- **SafeSearch Support**: Content filtering (off, moderate, strict) + +#### Auto-Routing Signals +New RAG/Real-time intent detection routes to You.com for: +- RAG context queries: "summarize", "key points", "tldr", "context for" +- Real-time info: "latest news", "current status", "right now", "what's happening" +- Information synthesis: "updates on", "situation", "main takeaways" + +#### Usage Examples +```bash +# Auto-routed +python3 scripts/search.py -q "summarize key points about AI regulation" # β†’ You.com + +# Explicit +python3 scripts/search.py -p you -q "climate change" --livecrawl all +python3 scripts/search.py -p you -q "tech news" --freshness week +``` + +#### Configuration +```json +{ + "you": { + "country": "US", + "language": "en", + "safesearch": "moderate", + "include_news": true + } +} +``` + +#### API Key Setup +```bash +export YOU_API_KEY="your-key" # Get from https://api.you.com +``` + +### πŸ“Š Updated Provider Comparison + +| Feature | Serper | Tavily | Exa | You.com | +|---------|:------:|:------:|:---:|:-------:| +| Speed | ⚑⚑⚑ | ⚑⚑ | ⚑⚑ | ⚑⚑⚑ | +| News Integration | βœ“ | βœ— | βœ— | βœ“ | +| RAG-Optimized | βœ— | βœ“ | βœ— | βœ“βœ“ | +| Full Page Content | βœ— | βœ“ | βœ“ | βœ“ | + +--- + +## [2.1.5] - 2026-01-27 + +### πŸ“ Documentation + +- Added warning about NOT using Tavily/Serper/Exa in core OpenClaw config +- Core OpenClaw only supports `brave` as the built-in provider +- This skill's providers must be used via environment variables and scripts, not `openclaw.json` + +## [2.1.0] - 2026-01-23 + +### 🧠 Intelligent Multi-Signal Routing + +Completely overhauled auto-routing with sophisticated query analysis: + +#### Intent Classification +- **Shopping Intent**: Detects price patterns ("how much", "cost of"), purchase signals ("buy", "order"), deal keywords, and product+brand combinations +- **Research Intent**: Identifies explanation patterns ("how does", "why does"), analysis signals ("pros and cons", "compare"), 
learning keywords, and complex multi-clause queries +- **Discovery Intent**: Recognizes similarity patterns ("similar to", "alternatives"), company discovery signals, URL/domain detection, and academic patterns + +#### Linguistic Pattern Detection +- "How much" / "price of" β†’ Shopping (Serper) +- "How does" / "Why does" / "Explain" β†’ Research (Tavily) +- "Companies like" / "Similar to" / "Alternatives" β†’ Discovery (Exa) +- Product + Brand name combos β†’ Shopping (Serper) +- URLs and domains in query β†’ Similar search (Exa) + +#### Query Analysis Features +- **Complexity scoring**: Long, multi-clause queries get routed to research providers +- **URL detection**: Automatic detection of URLs/domains triggers Exa similar search +- **Brand recognition**: Tech brands (Apple, Samsung, Sony, etc.) with product terms β†’ shopping +- **Recency signals**: "latest", "2026", "breaking" boost news mode + +#### Confidence Scoring +- **HIGH (70-100%)**: Strong signal match, very reliable routing +- **MEDIUM (40-69%)**: Good match, should work well +- **LOW (0-39%)**: Ambiguous query, using fallback provider +- Confidence based on absolute signal strength + relative margin over alternatives + +#### Enhanced Debug Mode +```bash +python3 scripts/search.py --explain-routing -q "your query" +``` + +Now shows: +- Routing decision with confidence level +- All provider scores +- Top matched signals with weights +- Query analysis (complexity, URL detection, recency focus) +- All matched patterns per provider + +### πŸ”§ Technical Changes + +#### QueryAnalyzer Class +New `QueryAnalyzer` class with: +- `SHOPPING_SIGNALS`: 25+ weighted patterns for shopping intent +- `RESEARCH_SIGNALS`: 30+ weighted patterns for research intent +- `DISCOVERY_SIGNALS`: 20+ weighted patterns for discovery intent +- `LOCAL_NEWS_SIGNALS`: 25+ patterns for local/news queries +- `BRAND_PATTERNS`: Tech brand detection regex + +#### Signal Weighting +- Multi-word phrases get higher weights (e.g., "how much" = 
4.0 vs "price" = 3.0) +- Strong signals: price patterns (4.0), similarity patterns (5.0), URLs (5.0) +- Medium signals: product terms (2.5), learning keywords (2.5) +- Bonus scoring: Product+brand combo (+3.0), complex query (+2.5) + +#### Improved Output Format +```json +{ + "routing": { + "auto_routed": true, + "provider": "serper", + "confidence": 0.78, + "confidence_level": "high", + "reason": "high_confidence_match", + "top_signals": [{"matched": "price", "weight": 3.0}], + "scores": {"serper": 7.0, "tavily": 0.0, "exa": 0.0} + } +} +``` + +### πŸ“š Documentation Updates + +- **SKILL.md**: Complete rewrite with signal tables and confidence scoring guide +- **README.md**: Updated with intelligent routing examples and confidence levels +- **FAQ**: Updated to explain multi-signal analysis + +### πŸ§ͺ Test Results + +| Query | Provider | Confidence | Signals | +|-------|----------|------------|---------| +| "how much does iPhone 16 cost" | Serper | 68% | "how much", brand+product | +| "how does quantum entanglement work" | Tavily | 86% HIGH | "how does", "what are", "implications" | +| "startups similar to Notion" | Exa | 76% HIGH | "similar to", "Series A" | +| "companies like stripe.com" | Exa | 100% HIGH | URL detected, "companies like" | +| "MacBook Pro M3 specs review" | Serper | 70% HIGH | brand+product, "specs", "review" | +| "Tesla" | Serper | 0% LOW | No signals (fallback) | +| "arxiv papers on transformers" | Exa | 58% | "arxiv" | +| "latest AI news 2026" | Serper | 77% HIGH | "latest", "news", "2026" | + +--- + +## [2.0.0] - 2026-01-23 + +### πŸŽ‰ Major Features + +#### Smart Auto-Routing +- **Automatic provider selection** based on query analysis +- No need to manually choose provider - just search! 
+- Intelligent keyword matching for routing decisions +- Pattern detection for query types (shopping, research, discovery) +- Scoring system for provider selection + +#### User Configuration +- **config.json**: Full control over auto-routing behavior +- **Configurable keyword mappings**: Add your own routing keywords +- **Provider priority**: Set tie-breaker order +- **Disable providers**: Turn off providers you don't have API keys for +- **Enable/disable auto-routing**: Opt-in or opt-out as needed + +#### Debugging Tools +- **--explain-routing** flag: See exactly why a provider was selected +- Detailed routing metadata in JSON responses +- Shows matched keywords and routing scores + +### πŸ“š Documentation + +- **README.md**: Complete auto-routing guide with examples +- **SKILL.md**: Detailed routing logic and configuration reference +- **FAQ section**: Common questions about auto-routing +- **Configuration examples**: Pre-built configs for common use cases + +--- + +## [1.0.x] - Initial Release + +- Multi-provider search: Serper, Tavily, Exa +- Manual provider selection with `-p` flag +- Unified JSON output format +- Provider-specific options (--depth, --category, --similar-url, etc.) +- Domain filtering for Tavily/Exa +- Date filtering for Exa diff --git a/FAQ.md b/FAQ.md new file mode 100644 index 0000000..cac96ec --- /dev/null +++ b/FAQ.md @@ -0,0 +1,263 @@ +# Frequently Asked Questions + +## Caching (NEW in v2.7.0!) + +### How does caching work? +Search results are automatically cached locally for 1 hour (3600 seconds). When you make the same query again, you get instant results at $0 API cost. The cache key is based on: query text + provider + max_results. + +### Where are cached results stored? +In `.cache/` directory inside the skill folder by default. Override with `WSP_CACHE_DIR` environment variable: +```bash +export WSP_CACHE_DIR="/path/to/custom/cache" +``` + +### How do I see cache stats? 
+```bash +python3 scripts/search.py --cache-stats +``` +This shows total entries, size, oldest/newest entries, and breakdown by provider. + +### How do I clear the cache? +```bash +python3 scripts/search.py --clear-cache +``` + +### Can I change the cache TTL? +Yes! Default is 3600 seconds (1 hour). Set a custom TTL per request: +```bash +python3 scripts/search.py -q "query" --cache-ttl 7200 # 2 hours +``` + +### How do I skip the cache? +Use `--no-cache` to always fetch fresh results: +```bash +python3 scripts/search.py -q "query" --no-cache +``` + +### How do I know if a result was cached? +The response includes: +- `"cached": true/false` β€” whether result came from cache +- `"cache_age_seconds": 1234` β€” how old the cached result is (when cached) + +--- + +## General + +### How does auto-routing decide which provider to use? +Multi-signal analysis scores each provider based on: price patterns, explanation phrases, similarity keywords, URLs, product+brand combos, and query complexity. Highest score wins. Use `--explain-routing` to see the decision breakdown. + +### What if it picks the wrong provider? +Override with `-p serper/tavily/exa`. Check `--explain-routing` to understand why it chose differently. + +### What does "low confidence" mean? +Query is ambiguous (e.g., "Tesla" could be cars, stock, or company). Falls back to Serper. Results may vary. + +### Can I disable a provider? +Yes! In config.json: `"disabled_providers": ["exa"]` + +--- + +## API Keys + +### Which API keys do I need? +At minimum ONE key (or SearXNG instance). You can use just Serper, just Tavily, just Exa, just You.com, or just SearXNG. Missing keys = that provider is skipped. + +### Where do I get API keys? +- Serper: https://serper.dev (2,500 free queries, no credit card) +- Tavily: https://tavily.com (1,000 free searches/month) +- Exa: https://exa.ai (1,000 free searches/month) +- You.com: https://api.you.com (Limited free tier for testing) +- SearXNG: Self-hosted, no key needed! 
https://docs.searxng.org/admin/installation.html + +### How do I set API keys? +Two options (both auto-load): + +**Option A: .env file** +```bash +export SERPER_API_KEY="your-key" +``` + +**Option B: config.json** (v2.2.1+) +```json +{ "serper": { "api_key": "your-key" } } +``` + +--- + +## Routing Details + +### How do I know which provider handled my search? +Check `routing.provider` in JSON output, or `[πŸ” Searched with: Provider]` in chat responses. + +### Why does it sometimes choose Serper for research questions? +If the query has brand/product signals (e.g., "how does Tesla FSD work"), shopping intent may outweigh research intent. Override with `-p tavily`. + +### What's the confidence threshold? +Default: 0.3 (30%). Below this = low confidence, uses fallback. Adjustable in config.json. + +--- + +## You.com Specific + +### When should I use You.com over other providers? +You.com excels at: +- **RAG applications**: Pre-extracted snippets ready for LLM consumption +- **Real-time information**: Current events, breaking news, status updates +- **Combined sources**: Web + news results in a single API call +- **Summarization tasks**: "What's the latest on...", "Key points about..." + +### What's the livecrawl feature? +You.com can fetch full page content on-demand. Use `--livecrawl web` for web results, `--livecrawl news` for news articles, or `--livecrawl all` for both. Content is returned in Markdown format. + +### Does You.com include news automatically? +Yes! You.com's intelligent classification automatically includes relevant news results when your query has news intent. You can also use `--include-news` to explicitly enable it. + +--- + +## SearXNG Specific + +### Do I need my own SearXNG instance? +Yes! SearXNG is self-hosted. Most public instances disable the JSON API to prevent bot abuse. You need to run your own instance with JSON format enabled. See: https://docs.searxng.org/admin/installation.html + +### How do I set up SearXNG? 
+Docker is the easiest way: +```bash +docker run -d -p 8080:8080 searxng/searxng +``` +Then enable JSON in `settings.yml`: +```yaml +search: + formats: + - html + - json +``` + +### Why am I getting "403 Forbidden"? +The JSON API is disabled on your instance. Enable it in `settings.yml` under `search.formats`. + +### What's the API cost for SearXNG? +**$0!** SearXNG is free and open-source. You only pay for hosting (~$5/month VPS). Unlimited queries. + +### When should I use SearXNG? +- **Privacy-sensitive queries**: No tracking, no profiling +- **Budget-conscious**: $0 API cost +- **Diverse results**: Aggregates 70+ search engines +- **Self-hosted requirements**: Full control over your search infrastructure +- **Fallback provider**: When paid APIs are rate-limited + +### Can I limit which search engines SearXNG uses? +Yes! Use `--engines google,bing,duckduckgo` to specify engines, or configure defaults in `config.json`. + +--- + +## Provider Selection + +### Which provider should I use? + +| Query Type | Best Provider | Why | +|------------|---------------|-----| +| **Shopping** ("buy laptop", "cheap shoes") | **Serper** | Google Shopping, price comparisons, local stores | +| **Research** ("how does X work?", "explain Y") | **Tavily** | Deep research, academic quality, full-page content | +| **Startups/Papers** ("companies like X", "arxiv papers") | **Exa** | Semantic/neural search, startup discovery | +| **RAG/Real-time** ("summarize latest", "current events") | **You.com** | LLM-ready snippets, combined web+news | +| **Privacy** ("search without tracking") | **SearXNG** | No tracking, multi-source, self-hosted | + +**Tip:** Enable auto-routing and let the skill choose automatically! 🎯 + +### Do I need all 5 providers? +**No!** All providers are optional. 
You can use: +- **1 provider** (e.g., just Serper for everything) +- **2-3 providers** (e.g., Serper + You.com for most needs) +- **All 5** (maximum flexibility + fallback options) + +### How much do the APIs cost? + +| Provider | Free Tier | Paid Plan | +|----------|-----------|-----------| +| **Serper** | 2,500 queries/mo | $50/mo (5,000 queries) | +| **Tavily** | 1,000 queries/mo | $150/mo (10,000 queries) | +| **Exa** | 1,000 queries/mo | $1,000/mo (100,000 queries) | +| **You.com** | Limited free | ~$10/mo (varies by usage) | +| **SearXNG** | **FREE** βœ… | Only VPS cost (~$5/mo if self-hosting) | + +**Budget tip:** Use SearXNG as primary + others as fallback for specialized queries! + +### How private is SearXNG really? + +| Setup | Privacy Level | +|-------|---------------| +| **Self-hosted (your VPS)** | ⭐⭐⭐⭐⭐ You control everything | +| **Self-hosted (Docker local)** | ⭐⭐⭐⭐⭐ Fully private | +| **Public instance** | ⭐⭐⭐ Depends on operator's logging policy | + +**Best practice:** Self-host if privacy is critical. + +### Which provider has the best results? + +| Metric | Winner | +|--------|--------| +| **Most accurate for facts** | Serper (Google) | +| **Best for research depth** | Tavily | +| **Best for semantic queries** | Exa | +| **Best for RAG/AI context** | You.com | +| **Most diverse sources** | SearXNG (70+ engines) | +| **Most private** | SearXNG (self-hosted) | + +**Recommendation:** Enable multiple providers + auto-routing for best overall experience. + +### How does auto-routing work? +The skill analyzes your query for keywords and patterns: + +```python +"buy cheap laptop" β†’ Serper (shopping signals) +"how does AI work?" β†’ Tavily (research/explanation) +"companies like X" β†’ Exa (semantic/similar) +"summarize latest news" β†’ You.com (RAG/real-time) +"search privately" β†’ SearXNG (privacy signals) +``` + +**Confidence threshold:** Only routes if confidence > 30%. Otherwise uses default provider. 
+ +**Override:** Use `-p provider` to force a specific provider. + +--- + +## Production Use + +### Can I use this in production? +**Yes!** Web-search-plus is production-ready: +- βœ… Error handling with automatic fallback +- βœ… Rate limit protection +- βœ… Timeout handling (30s per provider) +- βœ… API key security (.env + config.json gitignored) +- βœ… 5 providers for redundancy + +**Tip:** Monitor API usage to avoid exceeding free tiers! + +### What if I run out of API credits? +1. **Fallback chain:** Other enabled providers automatically take over +2. **Use SearXNG:** Switch to self-hosted (unlimited queries) +3. **Upgrade plan:** Paid tiers have higher limits +4. **Rate limit:** Use `disabled_providers` to skip exhausted APIs temporarily + +--- + +## Updates + +### How do I update to the latest version? + +**Via ClawHub (recommended):** +```bash +clawhub update web-search-plus --registry "https://www.clawhub.ai" --no-input +``` + +**Manually:** +```bash +cd /path/to/workspace/skills/web-search-plus/ +git pull origin main +python3 scripts/setup.py # Re-run to configure new features +``` + +### Where can I report bugs or request features? +- **GitHub Issues:** https://github.com/robbyczgw-cla/web-search-plus/issues +- **ClawHub:** https://www.clawhub.ai/skills/web-search-plus diff --git a/README.md b/README.md new file mode 100644 index 0000000..a357870 --- /dev/null +++ b/README.md @@ -0,0 +1,800 @@ +# Web Search Plus + +> Unified multi-provider web search with **Intelligent Auto-Routing** β€” uses multi-signal analysis to automatically select between **Serper**, **Tavily**, **Querit**, **Exa**, **Perplexity (Sonar Pro)**, **You.com**, and **SearXNG** with confidence scoring. 
+ +[![ClawHub](https://img.shields.io/badge/ClawHub-web--search--plus-blue)](https://clawhub.ai) +[![Version](https://img.shields.io/badge/version-2.9.0-green)](https://clawhub.ai) +[![GitHub](https://img.shields.io/badge/GitHub-web--search--plus-blue)](https://github.com/robbyczgw-cla/web-search-plus) + +--- + +## 🧠 Features (v2.9.0) + +**Intelligent Multi-Signal Routing** β€” The skill uses sophisticated query analysis: + +- **Intent Classification**: Shopping vs Research vs Discovery vs RAG/Real-time vs Privacy +- **Linguistic Patterns**: "how much" (price) vs "how does" (research) vs "privately" (privacy) +- **Entity Detection**: Product+brand combos, URLs, domains +- **Complexity Analysis**: Long queries favor research providers +- **Confidence Scoring**: Know how reliable the routing decision is + +```bash +python3 scripts/search.py -q "how much does iPhone 16 cost" # β†’ Serper (68% confidence) +python3 scripts/search.py -q "how does quantum entanglement work" # β†’ Tavily (86% HIGH) +python3 scripts/search.py -q "startups similar to Notion" # β†’ Exa (76% HIGH) +python3 scripts/search.py -q "companies like stripe.com" # β†’ Exa (100% HIGH - URL detected) +python3 scripts/search.py -q "summarize key points on AI" # β†’ You.com (68% MEDIUM - RAG intent) +python3 scripts/search.py -q "search privately without tracking" # β†’ SearXNG (74% HIGH - privacy intent) +``` + +--- + +## πŸ” When to Use Which Provider + +### Built-in Brave Search (OpenClaw default) +- βœ… General web searches +- βœ… Privacy-focused +- βœ… Quick lookups +- βœ… Default fallback + +### Serper (Google Results) +- πŸ› **Product specs, prices, shopping** +- πŸ“ **Local businesses, places** +- 🎯 **"Google it" - explicit Google results** +- πŸ“° **Shopping/images needed** +- πŸ† **Knowledge Graph data** + +### Tavily (AI-Optimized Research) +- πŸ“š **Research questions, deep dives** +- πŸ”¬ **Complex multi-part queries** +- πŸ“„ **Need full page content** (not just snippets) +- πŸŽ“ 
**Academic/technical research** +- πŸ”’ **Domain filtering** (trusted sources) + +### Querit (Multilingual AI Search) +- 🌏 **Multilingual AI search** across 10+ languages +- ⚑ **Fast real-time answers** with ~400ms latency +- πŸ—ΊοΈ **International / cross-language queries** +- πŸ“° **Recency-aware results** for current information +- πŸ€– **Good fit for AI workflows** with clean metadata + +### Exa (Neural Semantic Search) +- πŸ”— **Find similar pages** +- 🏒 **Company/startup discovery** +- πŸ“ **Research papers** +- πŸ’» **GitHub projects** +- πŸ“… **Date-specific content** + +### Perplexity (Sonar Pro via Kilo Gateway) +- ⚑ **Direct answers** (great for β€œwho/what/define”) +- 🧾 **Cited, answer-first output** +- πŸ•’ **Current events / β€œas of” questions** +- πŸ”‘ Auth via `KILOCODE_API_KEY` (routes to `https://api.kilo.ai`) + +### You.com (RAG/Real-time) +- πŸ€– **RAG applications** (LLM-ready snippets) +- πŸ“° **Combined web + news** (single API call) +- ⚑ **Real-time information** (current events) +- πŸ“‹ **Summarization context** ("What's the latest...") +- πŸ”„ **Live crawling** (full page content on demand) + +### SearXNG (Privacy-First/Self-Hosted) +- πŸ”’ **Privacy-preserving search** (no tracking) +- 🌐 **Multi-source aggregation** (70+ engines) +- πŸ’° **$0 API cost** (self-hosted) +- 🎯 **Diverse perspectives** (results from multiple engines) +- 🏠 **Self-hosted environments** (full control) + +--- + +## Table of Contents + +- [Quick Start](#quick-start) +- [Smart Auto-Routing](#smart-auto-routing) +- [Configuration Guide](#configuration-guide) +- [Provider Deep Dives](#provider-deep-dives) +- [Usage Examples](#usage-examples) +- [Workflow Examples](#workflow-examples) +- [Optimization Tips](#optimization-tips) +- [FAQ & Troubleshooting](#faq--troubleshooting) +- [API Reference](#api-reference) + +--- + +## Quick Start + +### Option A: Interactive Setup (Recommended) + +```bash +# Run the setup wizard - it guides you through everything +python3 
scripts/setup.py +``` + +The wizard explains each provider, collects your API keys, and creates `config.json` automatically. + +### Option B: Manual Setup + +```bash +# 1. Set up at least one API key (or SearXNG instance) +export SERPER_API_KEY="your-key" # https://serper.dev +export TAVILY_API_KEY="your-key" # https://tavily.com +export QUERIT_API_KEY="your-key" # https://querit.ai +export EXA_API_KEY="your-key" # https://exa.ai +export KILOCODE_API_KEY="your-key" # enables Perplexity Sonar Pro via https://api.kilo.ai +export YOU_API_KEY="your-key" # https://api.you.com +export SEARXNG_INSTANCE_URL="https://your-instance.example.com" # Self-hosted + +# 2. Run a search (auto-routed!) +python3 scripts/search.py -q "best laptop 2024" +``` + +### Run a Search + +```bash +# Auto-routed to best provider +python3 scripts/search.py -q "best laptop 2024" + +# Or specify a provider explicitly +python3 scripts/search.py -p serper -q "iPhone 16 specs" +python3 scripts/search.py -p tavily -q "quantum computing explained" --depth advanced +python3 scripts/search.py -p querit -q "latest AI policy updates in Germany" +python3 scripts/search.py -p exa -q "AI startups 2024" --category company +python3 scripts/search.py -p perplexity -q "Who is the president of Austria?" 
+``` + +--- + +## Smart Auto-Routing + +### How It Works + +When you don't specify a provider, the skill analyzes your query and routes it to the best provider: + +| Query Contains | Routes To | Example | +|---------------|-----------|---------| +| "price", "buy", "shop", "cost" | **Serper** | "iPhone 16 price" | +| "near me", "restaurant", "hotel" | **Serper** | "pizza near me" | +| "weather", "news", "latest" | **Serper** | "weather Berlin" | +| "how does", "explain", "what is" | **Tavily** | "how does TCP work" | +| "research", "study", "analyze" | **Tavily** | "climate research" | +| "tutorial", "guide", "learn" | **Tavily** | "python tutorial" | +| multilingual, current status, latest updates | **Querit** | "latest AI policy updates in Germany" | +| "similar to", "companies like" | **Exa** | "companies like Stripe" | +| "startup", "Series A" | **Exa** | "AI startups Series A" | +| "github", "research paper" | **Exa** | "LLM papers arxiv" | +| "private", "anonymous", "no tracking" | **SearXNG** | "search privately" | +| "multiple sources", "aggregate" | **SearXNG** | "results from all engines" | + +### Examples + +```bash +# These are all auto-routed to the optimal provider: +python3 scripts/search.py -q "MacBook Pro M3 price" # β†’ Serper +python3 scripts/search.py -q "how does HTTPS work" # β†’ Tavily +python3 scripts/search.py -q "latest AI policy updates in Germany" # β†’ Querit +python3 scripts/search.py -q "startups like Notion" # β†’ Exa +python3 scripts/search.py -q "best sushi restaurant near me" # β†’ Serper +python3 scripts/search.py -q "explain attention mechanism" # β†’ Tavily +python3 scripts/search.py -q "alternatives to Figma" # β†’ Exa +python3 scripts/search.py -q "search privately without tracking" # β†’ SearXNG +``` + +### Result Caching (introduced in v2.7.x) + +Search results are **automatically cached** for 1 hour to save API costs: + +```bash +# First request: fetches from API ($) +python3 scripts/search.py -q "AI startups 2024" + +# 
Second request: uses cache (FREE!) +python3 scripts/search.py -q "AI startups 2024" +# Output includes: "cached": true + +# Bypass cache (force fresh results) +python3 scripts/search.py -q "AI startups 2024" --no-cache + +# View cache stats +python3 scripts/search.py --cache-stats + +# Clear all cached results +python3 scripts/search.py --clear-cache + +# Custom TTL (in seconds, default: 3600 = 1 hour) +python3 scripts/search.py -q "query" --cache-ttl 7200 +``` + +**Cache location:** `.cache/` in skill directory (override with `WSP_CACHE_DIR` environment variable) + +### Debug Auto-Routing + +See exactly why a provider was selected: + +```bash +python3 scripts/search.py --explain-routing -q "best laptop to buy" +``` + +Output: +```json +{ + "query": "best laptop to buy", + "selected_provider": "serper", + "reason": "matched_keywords (score=2)", + "matched_keywords": ["buy", "best"], + "available_providers": ["serper", "tavily", "exa"] +} +``` + +### Routing Info in Results + +Every search result includes routing information: + +```json +{ + "provider": "serper", + "query": "iPhone 16 price", + "results": [...], + "routing": { + "auto_routed": true, + "selected_provider": "serper", + "reason": "matched_keywords (score=1)", + "matched_keywords": ["price"] + } +} +``` + +--- + +## Configuration Guide + +### Environment Variables + +Create a `.env` file or set these in your shell: + +```bash +# Required: Set at least one +export SERPER_API_KEY="your-serper-key" +export TAVILY_API_KEY="your-tavily-key" +export EXA_API_KEY="your-exa-key" +``` + +### Config File (config.json) + +The `config.json` file lets you customize auto-routing and provider defaults: + +```json +{ + "defaults": { + "provider": "serper", + "max_results": 5 + }, + + "auto_routing": { + "enabled": true, + "fallback_provider": "serper", + "provider_priority": ["serper", "tavily", "exa"], + "disabled_providers": [], + "keyword_mappings": { + "serper": ["price", "buy", "shop", "cost", "deal", "near me", 
"weather"], + "tavily": ["how does", "explain", "research", "what is", "tutorial"], + "exa": ["similar to", "companies like", "alternatives", "startup", "github"] + } + }, + + "serper": { + "country": "us", + "language": "en" + }, + + "tavily": { + "depth": "basic", + "topic": "general" + }, + + "exa": { + "type": "neural" + } +} +``` + +### Configuration Examples + +#### Example 1: Disable Exa (Only Use Serper + Tavily) + +```json +{ + "auto_routing": { + "disabled_providers": ["exa"] + } +} +``` + +#### Example 2: Make Tavily the Default + +```json +{ + "auto_routing": { + "fallback_provider": "tavily" + } +} +``` + +#### Example 3: Add Custom Keywords + +```json +{ + "auto_routing": { + "keyword_mappings": { + "serper": [ + "price", "buy", "shop", "amazon", "ebay", "walmart", + "deal", "discount", "coupon", "sale", "cheap" + ], + "tavily": [ + "how does", "explain", "research", "what is", + "coursera", "udemy", "learn", "course", "certification" + ], + "exa": [ + "similar to", "companies like", "competitors", + "YC company", "funded startup", "Series A", "Series B" + ] + } + } +} +``` + +#### Example 4: German Locale for Serper + +```json +{ + "serper": { + "country": "de", + "language": "de" + } +} +``` + +#### Example 5: Disable Auto-Routing + +```json +{ + "auto_routing": { + "enabled": false + }, + "defaults": { + "provider": "serper" + } +} +``` + +#### Example 6: Research-Heavy Config + +```json +{ + "auto_routing": { + "fallback_provider": "tavily", + "provider_priority": ["tavily", "serper", "exa"] + }, + "tavily": { + "depth": "advanced", + "include_raw_content": true + } +} +``` + +--- + +## Provider Deep Dives + +### Serper (Google Search API) + +**What it is:** Direct access to Google Search results via API β€” the same results you'd see on google.com. 
+ +#### Strengths +| Strength | Description | +|----------|-------------| +| 🎯 **Accuracy** | Google's search quality, knowledge graph, featured snippets | +| πŸ›’ **Shopping** | Product prices, reviews, shopping results | +| πŸ“ **Local** | Business listings, maps, places | +| πŸ“° **News** | Real-time news with Google News integration | +| πŸ–Ό **Images** | Google Images search | +| ⚑ **Speed** | Fastest response times (~200-400ms) | + +#### Best Use Cases +- βœ… Product specifications and comparisons +- βœ… Shopping and price lookups +- βœ… Local business searches ("restaurants near me") +- βœ… Quick factual queries (weather, conversions, definitions) +- βœ… News headlines and current events +- βœ… Image searches +- βœ… When you need "what Google shows" + +#### Getting Your API Key +1. Go to [serper.dev](https://serper.dev) +2. Sign up with email or Google +3. Copy your API key from the dashboard +4. Set `SERPER_API_KEY` environment variable + +--- + +### Tavily (Research Search) + +**What it is:** AI-optimized search engine built for research and RAG applications β€” returns synthesized answers plus full content. 
+ +#### Strengths +| Strength | Description | +|----------|-------------| +| πŸ“š **Research Quality** | Optimized for comprehensive, accurate research | +| πŸ’¬ **AI Answers** | Returns synthesized answers, not just links | +| πŸ“„ **Full Content** | Can return complete page content (raw_content) | +| 🎯 **Domain Filtering** | Include/exclude specific domains | +| πŸ”¬ **Deep Mode** | Advanced search for thorough research | +| πŸ“° **Topic Modes** | Specialized for general vs news content | + +#### Best Use Cases +- βœ… Research questions requiring synthesized answers +- βœ… Academic or technical deep dives +- βœ… When you need actual page content (not just snippets) +- βœ… Multi-source information comparison +- βœ… Domain-specific research (filter to authoritative sources) +- βœ… News research with context +- βœ… RAG/LLM applications + +#### Getting Your API Key +1. Go to [tavily.com](https://tavily.com) +2. Sign up and verify email +3. Navigate to API Keys section +4. Generate and copy your key +5. Set `TAVILY_API_KEY` environment variable + +--- + +### Exa (Neural Search) + +**What it is:** Neural/semantic search engine that understands meaning, not just keywords β€” finds conceptually similar content. + +#### Strengths +| Strength | Description | +|----------|-------------| +| 🧠 **Semantic Understanding** | Finds results by meaning, not keywords | +| πŸ”— **Similar Pages** | Find pages similar to a reference URL | +| 🏒 **Company Discovery** | Excellent for finding startups, companies | +| πŸ“‘ **Category Filters** | Filter by type (company, paper, tweet, etc.) 
| +| πŸ“… **Date Filtering** | Precise date range searches | +| πŸŽ“ **Academic** | Great for research papers and technical content | + +#### Best Use Cases +- βœ… Conceptual queries ("companies building X") +- βœ… Finding similar companies or pages +- βœ… Startup and company discovery +- βœ… Research paper discovery +- βœ… Finding GitHub projects +- βœ… Date-filtered searches for recent content +- βœ… When keyword matching fails + +#### Getting Your API Key +1. Go to [exa.ai](https://exa.ai) +2. Sign up with email or Google +3. Navigate to API section in dashboard +4. Copy your API key +5. Set `EXA_API_KEY` environment variable + +--- + +### SearXNG (Privacy-First Meta-Search) + +**What it is:** Open-source, self-hosted meta-search engine that aggregates results from 70+ search engines without tracking. + +#### Strengths +| Strength | Description | +|----------|-------------| +| πŸ”’ **Privacy-First** | No tracking, no profiling, no data collection | +| 🌐 **Multi-Engine** | Aggregates Google, Bing, DuckDuckGo, and 70+ more | +| πŸ’° **Free** | $0 API cost (self-hosted, unlimited queries) | +| 🎯 **Diverse Results** | Get perspectives from multiple search engines | +| βš™ **Customizable** | Choose which engines to use, SafeSearch, language | +| 🏠 **Self-Hosted** | Full control over your search infrastructure | + +#### Best Use Cases +- βœ… Privacy-sensitive searches (no tracking) +- βœ… When you want diverse results from multiple engines +- βœ… Budget-conscious (no API fees) +- βœ… Self-hosted/air-gapped environments +- βœ… Fallback when paid APIs are rate-limited +- βœ… When "aggregate everything" is the goal + +#### Setting Up Your Instance +```bash +# Docker (recommended, 5 minutes) +docker run -d -p 8080:8080 searxng/searxng + +# Enable JSON API in settings.yml: +# search: +# formats: [html, json] +``` + +1. See [docs.searxng.org](https://docs.searxng.org/admin/installation.html) +2. Deploy via Docker, pip, or your preferred method +3. 
Enable JSON format in `settings.yml` +4. Set `SEARXNG_INSTANCE_URL` environment variable + +--- + +## Usage Examples + +### Auto-Routed Searches (Recommended) + +```bash +# Just search β€” the skill picks the best provider +python3 scripts/search.py -q "Tesla Model 3 price" +python3 scripts/search.py -q "how do neural networks learn" +python3 scripts/search.py -q "YC startups like Stripe" +python3 scripts/search.py -q "search privately without tracking" +``` + +### Serper Options + +```bash +# Different search types +python3 scripts/search.py -p serper -q "gaming monitor" --type shopping +python3 scripts/search.py -p serper -q "coffee shop" --type places +python3 scripts/search.py -p serper -q "AI news" --type news + +# With time filter +python3 scripts/search.py -p serper -q "OpenAI news" --time-range day + +# Include images +python3 scripts/search.py -p serper -q "iPhone 16 Pro" --images + +# Different locale +python3 scripts/search.py -p serper -q "Wetter Wien" --country at --language de +``` + +### Tavily Options + +```bash +# Deep research mode +python3 scripts/search.py -p tavily -q "quantum computing applications" --depth advanced + +# With full page content +python3 scripts/search.py -p tavily -q "transformer architecture" --raw-content + +# Domain filtering +python3 scripts/search.py -p tavily -q "AI research" --include-domains arxiv.org nature.com +``` + +### Exa Options + +```bash +# Category filtering +python3 scripts/search.py -p exa -q "AI startups Series A" --category company +python3 scripts/search.py -p exa -q "attention mechanism" --category "research paper" + +# Date filtering +python3 scripts/search.py -p exa -q "YC companies" --start-date 2024-01-01 + +# Find similar pages +python3 scripts/search.py -p exa --similar-url "https://stripe.com" --category company +``` + +### SearXNG Options + +```bash +# Basic search +python3 scripts/search.py -p searxng -q "linux distros" + +# Specific engines only +python3 scripts/search.py -p searxng -q "AI 
news" --engines "google,bing,duckduckgo" + +# SafeSearch (0=off, 1=moderate, 2=strict) +python3 scripts/search.py -p searxng -q "privacy tools" --searxng-safesearch 2 + +# With time filter +python3 scripts/search.py -p searxng -q "open source projects" --time-range week + +# Custom instance URL +python3 scripts/search.py -p searxng -q "test" --searxng-url "http://localhost:8080" +``` + +--- + +## Workflow Examples + +### πŸ›’ Product Research Workflow + +```bash +# Step 1: Get product specs (auto-routed to Serper) +python3 scripts/search.py -q "MacBook Pro M3 Max specs" + +# Step 2: Check prices (auto-routed to Serper) +python3 scripts/search.py -q "MacBook Pro M3 Max price comparison" + +# Step 3: In-depth reviews (auto-routed to Tavily) +python3 scripts/search.py -q "detailed MacBook Pro M3 Max review" +``` + +### πŸ“š Academic Research Workflow + +```bash +# Step 1: Understand the topic (auto-routed to Tavily) +python3 scripts/search.py -q "explain transformer architecture in deep learning" + +# Step 2: Find recent papers (Exa) +python3 scripts/search.py -p exa -q "transformer improvements" --category "research paper" --start-date 2024-01-01 + +# Step 3: Find implementations (Exa) +python3 scripts/search.py -p exa -q "transformer implementation" --category github +``` + +### 🏒 Competitive Analysis Workflow + +```bash +# Step 1: Find competitors (auto-routed to Exa) +python3 scripts/search.py -q "companies like Notion" + +# Step 2: Find similar products (Exa) +python3 scripts/search.py -p exa --similar-url "https://notion.so" --category company + +# Step 3: Deep dive comparison (Tavily) +python3 scripts/search.py -p tavily -q "Notion vs Coda comparison" --depth advanced +``` + +--- + +## Optimization Tips + +### Cost Optimization + +| Tip | Savings | +|-----|---------| +| Use SearXNG for routine queries | **$0 API cost** | +| Use auto-routing (defaults to Serper, cheapest paid) | Best value | +| Use Tavily `basic` before `advanced` | ~50% cost reduction | +| Set 
appropriate `max_results` | Linear cost savings |
+| Use Exa only for semantic queries | Avoid waste |
+
+### Performance Optimization
+
+| Tip | Impact |
+|-----|--------|
+| Serper is fastest (~200ms) | Use for time-sensitive queries |
+| Tavily `basic` faster than `advanced` | ~2x faster |
+| Lower `max_results` = faster response | Linear improvement |
+
+---
+
+## FAQ & Troubleshooting
+
+### General Questions
+
+**Q: Do I need API keys for all of the providers?**
+> No. You only need keys for providers you want to use. Auto-routing skips providers without keys.
+
+**Q: Which provider should I start with?**
+> Serper — it's the fastest, cheapest, and has the largest free tier (2,500 queries).
+
+**Q: Can I use multiple providers in one workflow?**
+> Yes! That's the recommended approach. See [Workflow Examples](#workflow-examples).
+
+**Q: How do I reduce API costs?**
+> Use auto-routing (defaults to cheapest), start with lower `max_results`, use Tavily `basic` before `advanced`.
+
+### Auto-Routing Questions
+
+**Q: Why did my query go to the wrong provider?**
+> Use `--explain-routing` to debug. Add custom keywords to config.json if needed.
+
+**Q: Can I add my own keywords?**
+> Yes! Edit `config.json` → `auto_routing.keyword_mappings`.
+
+**Q: How does keyword scoring work?**
+> Multi-word phrases get higher weights. "companies like" (2 words) scores higher than "like" (1 word).
+
+**Q: What if no keywords match?**
+> Uses the fallback provider (default: Serper).
+
+**Q: Can I force a specific provider?**
+> Yes, pass `-p` with any provider name: `-p serper`, `-p tavily`, `-p querit`, `-p exa`, `-p perplexity`, `-p you`, or `-p searxng`.
+
+### Troubleshooting
+
+**Error: "Missing API key"**
+```bash
+# Check if key is set
+echo $SERPER_API_KEY
+
+# Set it
+export SERPER_API_KEY="your-key"
+```
+
+**Error: "API Error (401)"**
+> Your API key is invalid or expired. Generate a new one.
+
+**Error: "API Error (429)"**
+> Rate limited. Wait and retry, or upgrade your plan.
+ +**Empty results?** +> Try a different provider, broaden your query, or remove restrictive filters. + +**Slow responses?** +> Reduce `max_results`, use Tavily `basic`, or use Serper (fastest). + +--- + +## API Reference + +### Output Format + +All providers return unified JSON: + +```json +{ + "provider": "serper|tavily|exa", + "query": "original search query", + "results": [ + { + "title": "Page Title", + "url": "https://example.com/page", + "snippet": "Content excerpt...", + "score": 0.95, + "date": "2024-01-15", + "raw_content": "Full page content (Tavily only)" + } + ], + "images": ["url1", "url2"], + "answer": "Synthesized answer", + "knowledge_graph": { }, + "routing": { + "auto_routed": true, + "selected_provider": "serper", + "reason": "matched_keywords (score=1)", + "matched_keywords": ["price"] + } +} +``` + +### CLI Options Reference + +| Option | Providers | Description | +|--------|-----------|-------------| +| `-q, --query` | All | Search query | +| `-p, --provider` | All | Provider: auto, serper, tavily, querit, exa, perplexity, you, searxng | +| `-n, --max-results` | All | Max results (default: 5) | +| `--auto` | All | Force auto-routing | +| `--explain-routing` | All | Debug auto-routing | +| `--images` | Serper, Tavily | Include images | +| `--country` | Serper, You | Country code (default: us) | +| `--language` | Serper, SearXNG | Language code (default: en) | +| `--type` | Serper | search/news/images/videos/places/shopping | +| `--time-range` | Serper, SearXNG | hour/day/week/month/year | +| `--depth` | Tavily | basic/advanced | +| `--topic` | Tavily | general/news | +| `--raw-content` | Tavily | Include full page content | +| `--querit-base-url` | Querit | Override Querit API base URL | +| `--querit-base-path` | Querit | Override Querit API path | +| `--exa-type` | Exa | neural/keyword | +| `--category` | Exa | company/research paper/news/pdf/github/tweet | +| `--start-date` | Exa | Start date (YYYY-MM-DD) | +| `--end-date` | Exa | End date 
(YYYY-MM-DD) | +| `--similar-url` | Exa | Find similar pages | +| `--searxng-url` | SearXNG | Instance URL | +| `--searxng-safesearch` | SearXNG | 0=off, 1=moderate, 2=strict | +| `--engines` | SearXNG | Specific engines (google,bing,duckduckgo) | +| `--categories` | SearXNG | Search categories (general,images,news) | +| `--include-domains` | Tavily, Exa | Only these domains | +| `--exclude-domains` | Tavily, Exa | Exclude these domains | +| `--compact` | All | Compact JSON output | + +--- + +## License + +MIT + +--- + +## Links + +- [Serper](https://serper.dev) β€” Google Search API +- [Tavily](https://tavily.com) β€” AI Research Search +- [Exa](https://exa.ai) β€” Neural Search +- [ClawHub](https://clawhub.ai) β€” OpenClaw Skills diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..b5bea2f --- /dev/null +++ b/SKILL.md @@ -0,0 +1,258 @@ +--- +name: web-search-plus +version: 2.9.2 +description: "ε…·ζœ‰ζ™Ίθƒ½θ‡ͺεŠ¨θ·―η”±ηš„η»ŸδΈ€ζœη΄’ζŠ€θƒ½γ€‚" +tags: [search, web-search, serper, tavily, querit, exa, perplexity, you, searxng, google, multilingual-search, research, semantic-search, auto-routing, multi-provider, shopping, rag, free-tier, privacy, self-hosted, kilo] +metadata: {"openclaw":{"requires":{"bins":["python3","bash"],"env":{"SERPER_API_KEY":"optional","TAVILY_API_KEY":"optional","QUERIT_API_KEY":"optional","EXA_API_KEY":"optional","YOU_API_KEY":"optional","SEARXNG_INSTANCE_URL":"optional","KILOCODE_API_KEY":"optional β€” required for Perplexity provider (via Kilo Gateway)"},"note":"Only ONE provider key needed. All are optional."}}} +--- + +# Web Search Plus + +**Stop choosing search providers. Let the skill do it for you.** + +This skill connects you to 7 search providers (Serper, Tavily, Querit, Exa, Perplexity, You.com, SearXNG) and automatically picks the best one for each query. Shopping question? β†’ Google results. Research question? β†’ Deep research engine. Need a direct answer? β†’ AI-synthesized with citations. Want privacy? 
β†’ Self-hosted option. + +--- + +## ✨ What Makes This Different? + +- **Just search** β€” No need to think about which provider to use +- **Smart routing** β€” Analyzes your query and picks the best provider automatically +- **7 providers, 1 interface** β€” Google results, research engines, neural search, AI answers with citations, RAG-optimized, and privacy-first all in one +- **Works with just 1 key** β€” Start with any single provider, add more later +- **Free options available** β€” SearXNG is completely free (self-hosted) + +--- + +## πŸš€ Quick Start + +```bash +# Interactive setup (recommended for first run) +python3 scripts/setup.py + +# Or manual: copy config and add your keys +cp config.example.json config.json +``` + +The wizard explains each provider, collects API keys, and configures defaults. + +--- + +## πŸ”‘ API Keys + +You only need **ONE** key to get started. Add more providers later for better coverage. + +| Provider | Free Tier | Best For | Sign Up | +|----------|-----------|----------|---------| +| **Serper** | 2,500/mo | Shopping, prices, local, news | [serper.dev](https://serper.dev) | +| **Tavily** | 1,000/mo | Research, explanations, academic | [tavily.com](https://tavily.com) | +| **Querit** | Contact sales/free tier varies | Multilingual AI search, international updates | [querit.ai](https://querit.ai) | +| **Exa** | 1,000/mo | "Similar to X", startups, papers | [exa.ai](https://exa.ai) | +| **Perplexity** | Via Kilo | Direct answers with citations | [kilo.ai](https://kilo.ai) | +| **You.com** | Limited | Real-time info, AI/RAG context | [api.you.com](https://api.you.com) | +| **SearXNG** | **FREE** βœ… | Privacy, multi-source, $0 cost | Self-hosted | + +**Setting your keys:** + +```bash +# Option A: .env file (recommended) +export SERPER_API_KEY="your-key" +export TAVILY_API_KEY="your-key" +export QUERIT_API_KEY="your-key" + +# Option B: config.json +{ "serper": { "api_key": "your-key" } } +``` + +--- + +## 🎯 When to Use Which Provider 
+ +| I want to... | Provider | Example Query | +|--------------|----------|---------------| +| Find product prices | **Serper** | "iPhone 16 Pro Max price" | +| Find restaurants/stores nearby | **Serper** | "best pizza near me" | +| Understand how something works | **Tavily** | "how does HTTPS encryption work" | +| Do deep research | **Tavily** | "climate change research 2024" | +| Search across languages / international updates | **Querit** | "latest AI policy updates in Germany" | +| Find companies like X | **Exa** | "startups similar to Notion" | +| Find research papers | **Exa** | "transformer architecture papers" | +| Get a direct answer with sources | **Perplexity** | "events in Berlin this weekend" | +| Know the current status of something | **Perplexity** | "what is the status of Ethereum upgrades" | +| Get real-time info | **You.com** | "latest AI regulation news" | +| Search without being tracked | **SearXNG** | anything, privately | + +**Pro tip:** Just search normally! Auto-routing handles most queries correctly. Override with `-p provider` when needed. 
+ +--- + +## 🧠 How Auto-Routing Works + +The skill looks at your query and picks the best provider: + +```bash +"iPhone 16 price" β†’ Serper (shopping keywords) +"how does quantum computing work" β†’ Tavily (research question) +"latest AI policy updates in Germany" β†’ Querit (multilingual + recency) +"companies like stripe.com" β†’ Exa (URL detected, similarity) +"events in Graz this weekend" β†’ Perplexity (local + direct answer) +"latest news on AI" β†’ You.com (real-time intent) +"search privately" β†’ SearXNG (privacy keywords) +``` + +**What if it picks wrong?** Override it: `python3 scripts/search.py -p tavily -q "your query"` + +**Debug routing:** `python3 scripts/search.py --explain-routing -q "your query"` + +--- + +## πŸ“– Usage Examples + +### Let Auto-Routing Choose (Recommended) + +```bash +python3 scripts/search.py -q "Tesla Model 3 price" +python3 scripts/search.py -q "explain machine learning" +python3 scripts/search.py -q "latest AI policy updates in Germany" +python3 scripts/search.py -q "startups like Figma" +``` + +### Force a Specific Provider + +```bash +python3 scripts/search.py -p serper -q "weather Berlin" +python3 scripts/search.py -p tavily -q "quantum computing" --depth advanced +python3 scripts/search.py -p querit -q "latest AI policy updates in Germany" +python3 scripts/search.py -p exa --similar-url "https://stripe.com" --category company +python3 scripts/search.py -p you -q "breaking tech news" --include-news +python3 scripts/search.py -p searxng -q "linux distros" --engines "google,bing" +``` + +--- + +## βš™ Configuration + +```json +{ + "auto_routing": { + "enabled": true, + "fallback_provider": "serper", + "confidence_threshold": 0.3, + "disabled_providers": [] + }, + "serper": {"country": "us", "language": "en"}, + "tavily": {"depth": "advanced"}, + "exa": {"type": "neural"}, + "you": {"country": "US", "include_news": true}, + "searxng": {"instance_url": "https://your-instance.example.com"} +} +``` + +--- + +## πŸ“Š Provider 
Comparison + +| Feature | Serper | Tavily | Exa | Perplexity | You.com | SearXNG | +|---------|:------:|:------:|:---:|:----------:|:-------:|:-------:| +| Speed | ⚑⚑⚑ | ⚑⚑ | ⚑⚑ | ⚑⚑ | ⚑⚑⚑ | ⚑⚑ | +| Direct Answers | βœ— | βœ— | βœ— | βœ“βœ“ | βœ— | βœ— | +| Citations | βœ— | βœ— | βœ— | βœ“ | βœ— | βœ— | +| Factual Accuracy | ⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | +| Semantic Understanding | ⭐ | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐ | ⭐ | +| Full Page Content | βœ— | βœ“ | βœ“ | βœ“ | βœ“ | βœ— | +| Shopping/Local | βœ“ | βœ— | βœ— | βœ— | βœ— | βœ“ | +| Find Similar Pages | βœ— | βœ— | βœ“ | βœ— | βœ— | βœ— | +| RAG-Optimized | βœ— | βœ“ | βœ— | βœ— | βœ“βœ“ | βœ— | +| Privacy-First | βœ— | βœ— | βœ— | βœ— | βœ— | βœ“βœ“ | +| API Cost | $$ | $$ | $$ | Via Kilo | $ | **FREE** | + +--- + +## ❓ Common Questions + +### Do I need API keys for all providers? +**No.** You only need keys for providers you want to use. Start with one (Serper recommended), add more later. + +### Which provider should I start with? +**Serper** β€” fastest, cheapest, largest free tier (2,500 queries/month), and handles most queries well. + +### What if I run out of free queries? +The skill automatically falls back to your other configured providers. Or switch to SearXNG (unlimited, self-hosted). + +### How much does this cost? +- **Free tiers:** 2,500 (Serper) + 1,000 (Tavily) + 1,000 (Exa) = 4,500+ free searches/month +- **SearXNG:** Completely free (just ~$5/mo if you self-host on a VPS) +- **Paid plans:** Start around $10-50/month depending on provider + +### Is SearXNG really private? +**Yes, if self-hosted.** You control the server, no tracking, no profiling. Public instances depend on the operator's policy. + +### How do I set up SearXNG? +```bash +# Docker (5 minutes) +docker run -d -p 8080:8080 searxng/searxng +``` +Then enable JSON API in `settings.yml`. See [docs.searxng.org](https://docs.searxng.org/admin/installation.html). + +### Why did it route my query to the "wrong" provider? 
+Sometimes queries are ambiguous. Use `--explain-routing` to see why, then override with `-p provider` if needed. + +--- + +## πŸ”„ Automatic Fallback + +If one provider fails (rate limit, timeout, error), the skill automatically tries the next provider. You'll see `routing.fallback_used: true` in the response when this happens. + +--- + +## πŸ“€ Output Format + +```json +{ + "provider": "serper", + "query": "iPhone 16 price", + "results": [{"title": "...", "url": "...", "snippet": "...", "score": 0.95}], + "routing": { + "auto_routed": true, + "provider": "serper", + "confidence": 0.78, + "confidence_level": "high" + } +} +``` + +--- + +## ⚠ Important Note + +**Tavily, Serper, and Exa are NOT core OpenClaw providers.** + +❌ Don't modify `~/.openclaw/openclaw.json` for these +βœ… Use this skill's scripts β€” keys auto-load from `.env` + +--- + +## πŸ”’ Security + +**SearXNG SSRF Protection:** The SearXNG instance URL is validated with defense-in-depth: +- Enforces `http`/`https` schemes only +- Blocks cloud metadata endpoints (169.254.169.254, metadata.google.internal) +- Resolves hostnames and blocks private/internal IPs (loopback, RFC1918, link-local, reserved) +- Operators who intentionally self-host on private networks can set `SEARXNG_ALLOW_PRIVATE=1` + +## πŸ“š More Documentation + +- **[FAQ.md](FAQ.md)** β€” Detailed answers to more questions +- **[TROUBLESHOOTING.md](TROUBLESHOOTING.md)** β€” Fix common errors +- **[README.md](README.md)** β€” Full technical reference + +--- + +## πŸ”— Quick Links + +- [Serper](https://serper.dev) β€” Google Search API +- [Tavily](https://tavily.com) β€” AI Research Search +- [Exa](https://exa.ai) β€” Neural Search +- [Perplexity](https://www.perplexity.ai) β€” AI-Synthesized Answers (via [Kilo Gateway](https://kilo.ai)) +- [You.com](https://api.you.com) β€” RAG/Real-time Search +- [SearXNG](https://docs.searxng.org) β€” Privacy-First Meta-Search diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 
index 0000000..1d233e9 --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,315 @@ +# Troubleshooting Guide + +## Caching Issues (v2.7.0+) + +### Cache not working / always fetching fresh + +**Symptoms:** +- Every request hits the API +- `"cached": false` even for repeated queries + +**Solutions:** +1. Check cache directory exists and is writable: + ```bash + ls -la .cache/ # Should exist in skill directory + ``` +2. Verify `--no-cache` isn't being passed +3. Check disk space isn't full +4. Ensure query is EXACTLY the same (including provider and max_results) + +### Stale results from cache + +**Symptoms:** +- Getting outdated information +- Cache TTL seems too long + +**Solutions:** +1. Use `--no-cache` to force fresh results +2. Reduce TTL: `--cache-ttl 1800` (30 minutes) +3. Clear cache: `python3 scripts/search.py --clear-cache` + +### Cache growing too large + +**Symptoms:** +- Disk space filling up +- Many .json files in `.cache/` + +**Solutions:** +1. Clear cache periodically: + ```bash + python3 scripts/search.py --clear-cache + ``` +2. Set up a cron job to clear weekly +3. Use a smaller TTL so entries expire faster + +### "Permission denied" when caching + +**Symptoms:** +- Cache write errors in stderr +- Searches work but don't cache + +**Solutions:** +1. Check directory permissions: `chmod 755 .cache/` +2. Use custom cache dir: `export WSP_CACHE_DIR="$TMP_DIR/wsp-cache"` + +--- + +## Common Issues + +### "No API key found" error + +**Symptoms:** +``` +Error: No API key found for serper +``` + +**Solutions:** +1. Check `.env` exists in skill folder with `export VAR=value` format +2. Keys auto-load from skill's `.env` since v2.2.0 +3. Or set in system environment: `export SERPER_API_KEY="..."` +4. 
Verify key format in config.json: + ```json + { "serper": { "api_key": "your-key" } } + ``` + +**Priority order:** config.json > .env > environment variable + +--- + +### Getting empty results + +**Symptoms:** +- Search returns no results +- `"results": []` in JSON output + +**Solutions:** +1. Check API key is valid (try the provider's web dashboard) +2. Try a different provider with `-p` +3. Some queries have no results (very niche topics) +4. Check if provider is rate-limited +5. Verify internet connectivity + +**Debug:** +```bash +python3 scripts/search.py -q "test query" --verbose +``` + +--- + +### Rate limited + +**Symptoms:** +``` +Error: 429 Too Many Requests +Error: Rate limit exceeded +``` + +**Good news:** Since v2.2.5, automatic fallback kicks in! If one provider hits rate limits, the script automatically tries the next provider. + +**Solutions:** +1. Wait for rate limit to reset (usually 1 hour or end of day) +2. Use a different provider: `-p tavily` instead of `-p serper` +3. Check free tier limits: + - Serper: 2,500 free total + - Tavily: 1,000/month free + - Exa: 1,000/month free +4. Upgrade to paid tier for higher limits +5. Use SearXNG (self-hosted, unlimited) + +**Fallback info:** Response will include `routing.fallback_used: true` when fallback was used. + +--- + +### SearXNG: "403 Forbidden" + +**Symptoms:** +``` +Error: 403 Forbidden +Error: JSON format not allowed +``` + +**Cause:** Most public SearXNG instances disable JSON API to prevent bot abuse. + +**Solution:** Self-host your own instance: +```bash +docker run -d -p 8080:8080 searxng/searxng +``` + +Then enable JSON in `settings.yml`: +```yaml +search: + formats: + - html + - json # Add this! +``` + +Restart the container and update your config: +```json +{ + "searxng": { + "instance_url": "http://localhost:8080" + } +} +``` + +--- + +### SearXNG: Slow responses + +**Symptoms:** +- SearXNG takes 2-5 seconds +- Other providers are faster + +**Explanation:** This is expected behavior. 
SearXNG queries 70+ upstream engines in parallel, which takes longer than direct API calls. + +**Trade-off:** Slower but privacy-preserving + multi-source + $0 cost. + +**Solutions:** +1. Accept the trade-off for privacy benefits +2. Limit engines for faster results: + ```bash + python3 scripts/search.py -p searxng -q "query" --engines "google,bing" + ``` +3. Use SearXNG as fallback (put last in priority list) + +--- + +### Auto-routing picks wrong provider + +**Symptoms:** +- Query about research goes to Serper +- Query about shopping goes to Tavily + +**Debug:** +```bash +python3 scripts/search.py --explain-routing -q "your query" +``` + +This shows the full analysis: +```json +{ + "query": "how much does iPhone 16 Pro cost", + "routing_decision": { + "provider": "serper", + "confidence": 0.68, + "reason": "moderate_confidence_match" + }, + "scores": {"serper": 7.0, "tavily": 0.0, "exa": 0.0}, + "top_signals": [ + {"matched": "how much", "weight": 4.0}, + {"matched": "brand + product detected", "weight": 3.0} + ] +} +``` + +**Solutions:** +1. Override with explicit provider: `-p tavily` +2. Rephrase query to be more explicit about intent +3. Adjust `confidence_threshold` in config.json (default: 0.3) + +--- + +### Config not loading + +**Symptoms:** +- Changes to config.json not applied +- Using default values instead + +**Solutions:** +1. Check JSON syntax (use a validator) +2. Ensure file is in skill directory: `/path/to/skills/web-search-plus/config.json` +3. Check file permissions +4. 
Run setup wizard to regenerate: + ```bash + python3 scripts/setup.py --reset + ``` + +**Validate JSON:** +```bash +python3 -m json.tool config.json +``` + +--- + +### Python dependencies missing + +**Symptoms:** +``` +ModuleNotFoundError: No module named 'requests' +``` + +**Solution:** +```bash +pip3 install requests +``` + +Or install all dependencies: +```bash +pip3 install -r requirements.txt +``` + +--- + +### Timeout errors + +**Symptoms:** +``` +Error: Request timeout after 30s +``` + +**Causes:** +- Slow network connection +- Provider API issues +- SearXNG instance overloaded + +**Solutions:** +1. Try again (temporary issue) +2. Switch provider: `-p serper` +3. Check your internet connection +4. If using SearXNG, check instance health + +--- + +### Duplicate results + +**Symptoms:** +- Same result appears multiple times +- Results overlap between providers + +**Solution:** This is expected when using auto-fallback or multiple providers. The skill doesn't deduplicate across providers. + +For single-provider results: +```bash +python3 scripts/search.py -p serper -q "query" +``` + +--- + +## Debug Mode + +For detailed debugging: + +```bash +# Verbose output +python3 scripts/search.py -q "query" --verbose + +# Show routing decision +python3 scripts/search.py -q "query" --explain-routing + +# Dry run (no actual search) +python3 scripts/search.py -q "query" --dry-run + +# Test specific provider +python3 scripts/search.py -p tavily -q "query" --verbose +``` + +--- + +## Getting Help + +**Still stuck?** + +1. Check the full documentation in `README.md` +2. Run the setup wizard: `python3 scripts/setup.py` +3. Review `FAQ.md` for common questions +4. 
Open an issue: https://github.com/robbyczgw-cla/web-search-plus/issues diff --git a/_meta.json b/_meta.json new file mode 100644 index 0000000..84fbf45 --- /dev/null +++ b/_meta.json @@ -0,0 +1,6 @@ +{ + "ownerId": "kn73gpe8xz2630jrknkb3ya96h7zb84h", + "slug": "web-search-plus", + "version": "2.9.2", + "publishedAt": 1774629265049 +} \ No newline at end of file diff --git a/config.example.json b/config.example.json new file mode 100644 index 0000000..1c9f98c --- /dev/null +++ b/config.example.json @@ -0,0 +1,265 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$comment": "Web Search Plus configuration β€” intelligent routing and provider settings", + "defaults": { + "provider": "serper", + "max_results": 5 + }, + "auto_routing": { + "enabled": true, + "fallback_provider": "serper", + "provider_priority": [ + "tavily", + "querit", + "exa", + "perplexity", + "serper", + "you", + "searxng" + ], + "disabled_providers": [], + "confidence_threshold": 0.3, + "keyword_mappings": { + "serper": [ + "price", + "buy", + "shop", + "shopping", + "cost", + "deal", + "sale", + "purchase", + "cheap", + "expensive", + "store", + "product", + "review", + "specs", + "specification", + "where to buy", + "near me", + "local", + "restaurant", + "hotel", + "weather", + "news", + "latest", + "breaking", + "map", + "directions", + "phone number", + "preis", + "kaufen", + "bestellen", + "gΓΌnstig", + "billig", + "teuer", + "kosten", + "angebot", + "rabatt", + "shop", + "hΓ€ndler", + "geschΓ€ft", + "laden", + "test", + "bewertung", + "technische daten", + "spezifikationen", + "wo kaufen", + "in der nΓ€he", + "wetter", + "nachrichten", + "aktuell", + "neu" + ], + "tavily": [ + "how does", + "how to", + "explain", + "research", + "what is", + "why does", + "analyze", + "compare", + "study", + "academic", + "detailed", + "comprehensive", + "in-depth", + "understand", + "learn", + "tutorial", + "guide", + "overview", + "history of", + "background", + "context", + 
"implications", + "pros and cons", + "wie funktioniert", + "erklΓ€rung", + "erklΓ€ren", + "was ist", + "warum", + "analyse", + "vergleich", + "vergleichen", + "studie", + "verstehen", + "lernen", + "anleitung", + "tutorial", + "ΓΌberblick", + "hintergrund", + "vor- und nachteile" + ], + "exa": [ + "similar to", + "companies like", + "find sites like", + "alternatives to", + "competitors", + "startup", + "github", + "paper", + "research paper", + "arxiv", + "pdf", + "academic paper", + "similar pages", + "related sites", + "who else", + "other companies", + "comparable to", + "Γ€hnlich wie", + "firmen wie", + "alternativen zu", + "konkurrenten", + "vergleichbar mit", + "andere unternehmen" + ], + "you": [ + "rag", + "context for", + "summarize", + "brief", + "quick overview", + "tldr", + "key points", + "key facts", + "main points", + "main takeaways", + "latest news", + "latest updates", + "current events", + "current situation", + "current status", + "right now", + "as of today", + "up to date", + "real time", + "what's happening", + "what's the latest", + "updates on", + "status of", + "zusammenfassung", + "aktuelle nachrichten", + "neueste updates" + ], + "searxng": [ + "private", + "privately", + "anonymous", + "anonymously", + "without tracking", + "no tracking", + "privacy", + "privacy-focused", + "privacy-first", + "duckduckgo alternative", + "private search", + "aggregate results", + "multiple sources", + "diverse results", + "diverse perspectives", + "meta search", + "all engines", + "free search", + "no api cost", + "self-hosted search", + "zero cost", + "privat", + "anonym", + "ohne tracking", + "datenschutz", + "verschiedene quellen", + "aus mehreren quellen", + "alle suchmaschinen", + "kostenlose suche", + "keine api kosten" + ], + "querit": [ + "multilingual", + "current status", + "latest updates", + "status of", + "real-time", + "summarize", + "global search", + "cross-language", + "international", + "aktuell", + "zusammenfassung" + ], + 
"perplexity": [ + "what is", + "current status", + "status of", + "what happened with", + "events in", + "things to do in" + ] + } + }, + "serper": { + "country": "us", + "language": "en", + "type": "search", + "autocorrect": true, + "include_images": false + }, + "tavily": { + "depth": "advanced", + "topic": "general", + "max_results": 8 + }, + "exa": { + "type": "neural", + "category": null, + "include_domains": [], + "exclude_domains": [] + }, + "you": { + "country": "US", + "language": "en", + "safesearch": "moderate", + "include_news": true + }, + "searxng": { + "$comment": "SearXNG requires a self-hosted instance. No API key needed, just your instance URL.", + "instance_url": null, + "safesearch": 0, + "engines": null, + "language": "en" + }, + "querit_api_key": "", + "querit": { + "base_url": "https://api.querit.ai", + "base_path": "/v1/search", + "timeout": 10 + }, + "perplexity": { + "api_url": "https://api.kilo.ai/api/gateway/chat/completions", + "model": "perplexity/sonar-pro" + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..a2f5ce1 --- /dev/null +++ b/package.json @@ -0,0 +1,88 @@ +{ + "name": "@openclaw/web-search-plus", + "version": "2.9.0", + "description": "Unified search skill with Intelligent Auto-Routing. 
Uses multi-signal analysis (intent classification, linguistic patterns, URL/brand detection) to automatically select between Serper (Google), Tavily (Research), Querit (Multilingual AI Search), Exa (Neural), Perplexity (AI Answers), You.com (RAG/Real-time), and SearXNG (Privacy/Self-hosted) with confidence scoring.", + "keywords": [ + "openclaw", + "skill", + "search", + "web-search", + "serper", + "tavily", + "exa", + "you", + "you.com", + "google-search", + "research", + "semantic-search", + "ai-agent", + "auto-routing", + "smart-routing", + "multi-provider", + "shopping", + "product-search", + "similar-sites", + "company-discovery", + "rag", + "real-time", + "free-tier", + "api-aggregator", + "querit", + "multilingual-search" + ], + "author": "robbyczgw-cla", + "license": "MIT", + "repository": { + "type": "git", + "url": "https://github.com/robbyczgw-cla/web-search-plus.git" + }, + "homepage": "https://clawhub.ai/robbyczgw-cla/web-search-plus", + "bugs": { + "url": "https://github.com/robbyczgw-cla/web-search-plus/issues" + }, + "openclaw": { + "skill": true, + "triggers": [ + "search", + "find", + "look up", + "research" + ], + "capabilities": [ + "web-search", + "image-search", + "semantic-search", + "multi-provider" + ], + "providers": [ + "serper", + "tavily", + "querit", + "exa", + "perplexity", + "you", + "searxng" + ], + "requirements": { + "bins": [ + "python3", + "bash" + ], + "env": { + "SERPER_API_KEY": "optional", + "TAVILY_API_KEY": "optional", + "EXA_API_KEY": "optional", + "YOU_API_KEY": "optional", + "SEARXNG_INSTANCE_URL": "optional", + "QUERIT_API_KEY": "optional", + "KILOCODE_API_KEY": "optional" + } + } + }, + "files": [ + "SKILL.md", + "README.md", + "scripts/", + ".env.example" + ] +} diff --git a/scripts/search.py b/scripts/search.py new file mode 100644 index 0000000..afe2410 --- /dev/null +++ b/scripts/search.py @@ -0,0 +1,2940 @@ +#!/usr/bin/env python3 +""" +Web Search Plus β€” Unified Multi-Provider Search with Intelligent 
Auto-Routing +Supports: Serper (Google), Tavily (Research), Querit (Multilingual AI Search), +Exa (Neural), Perplexity (Direct Answers) + +Smart Routing uses multi-signal analysis: + - Query intent classification (shopping, research, discovery) + - Linguistic pattern detection (how much vs how does) + - Product/brand recognition + - URL detection + - Confidence scoring + +Usage: + python3 search.py --query "..." # Auto-route based on query + python3 search.py --provider [serper|tavily|querit|exa] --query "..." [options] + +Examples: + python3 search.py -q "iPhone 16 Pro price" # β†’ Serper (shopping intent) + python3 search.py -q "how does quantum entanglement work" # β†’ Tavily (research intent) + python3 search.py -q "startups similar to Notion" # β†’ Exa (discovery intent) +""" + +import argparse +from http.client import IncompleteRead +import hashlib +import json +import os +import re +import sys +import time +from pathlib import Path +from typing import Optional, List, Dict, Any, Tuple +from urllib.request import Request, urlopen +from urllib.error import HTTPError, URLError +from urllib.parse import quote, urlparse + + +# ============================================================================= +# Result Caching +# ============================================================================= + +CACHE_DIR = Path(os.environ.get("WSP_CACHE_DIR", os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".cache"))) +PROVIDER_HEALTH_FILE = CACHE_DIR / "provider_health.json" +DEFAULT_CACHE_TTL = 3600 # 1 hour in seconds + + +def _build_cache_payload(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Build normalized payload used for cache key hashing.""" + payload = { + "query": query, + "provider": provider, + "max_results": max_results, + } + if params: + payload.update(params) + return payload + + +def _get_cache_key(query: str, provider: str, max_results: int, params: 
Optional[Dict[str, Any]] = None) -> str: + """Generate a unique cache key from all relevant query parameters.""" + payload = _build_cache_payload(query, provider, max_results, params) + key_string = json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=False) + return hashlib.sha256(key_string.encode("utf-8")).hexdigest()[:32] + + +def _get_cache_path(cache_key: str) -> Path: + """Get the file path for a cache entry.""" + return CACHE_DIR / f"{cache_key}.json" + + +def _ensure_cache_dir() -> None: + """Create cache directory if it doesn't exist.""" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + +def cache_get(query: str, provider: str, max_results: int, ttl: int = DEFAULT_CACHE_TTL, params: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]: + """ + Retrieve cached search results if they exist and are not expired. + + Args: + query: The search query + provider: The search provider + max_results: Maximum results requested + ttl: Time-to-live in seconds (default: 1 hour) + + Returns: + Cached result dict or None if not found/expired + """ + cache_key = _get_cache_key(query, provider, max_results, params) + cache_path = _get_cache_path(cache_key) + + if not cache_path.exists(): + return None + + try: + with open(cache_path, "r", encoding="utf-8") as f: + cached = json.load(f) + + cached_time = cached.get("_cache_timestamp", 0) + if time.time() - cached_time > ttl: + # Cache expired, remove it + cache_path.unlink(missing_ok=True) + return None + + return cached + except (json.JSONDecodeError, IOError, KeyError): + # Corrupted cache file, remove it + cache_path.unlink(missing_ok=True) + return None + + +def cache_put(query: str, provider: str, max_results: int, result: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> None: + """ + Store search results in cache. 
+ + Args: + query: The search query + provider: The search provider + max_results: Maximum results requested + result: The search result to cache + """ + _ensure_cache_dir() + + cache_key = _get_cache_key(query, provider, max_results, params) + cache_path = _get_cache_path(cache_key) + + # Add cache metadata + cached_result = result.copy() + cached_result["_cache_timestamp"] = time.time() + cached_result["_cache_key"] = cache_key + cached_result["_cache_query"] = query + cached_result["_cache_provider"] = provider + cached_result["_cache_max_results"] = max_results + cached_result["_cache_params"] = params or {} + + try: + with open(cache_path, "w", encoding="utf-8") as f: + json.dump(cached_result, f, ensure_ascii=False, indent=2) + except IOError as e: + # Non-fatal: log to stderr but don't fail + print(json.dumps({"cache_write_error": str(e)}), file=sys.stderr) + + +def cache_clear() -> Dict[str, Any]: + """ + Clear all cached results. + + Returns: + Stats about what was cleared + """ + if not CACHE_DIR.exists(): + return {"cleared": 0, "message": "Cache directory does not exist"} + + count = 0 + size_freed = 0 + + for cache_file in CACHE_DIR.glob("*.json"): + if cache_file.name == PROVIDER_HEALTH_FILE.name: + continue + try: + size_freed += cache_file.stat().st_size + cache_file.unlink() + count += 1 + except IOError: + pass + + return { + "cleared": count, + "size_freed_bytes": size_freed, + "size_freed_kb": round(size_freed / 1024, 2), + "message": f"Cleared {count} cached entries" + } + + +def cache_stats() -> Dict[str, Any]: + """ + Get statistics about the cache. 
+ + Returns: + Dict with cache statistics + """ + if not CACHE_DIR.exists(): + return { + "total_entries": 0, + "total_size_bytes": 0, + "total_size_kb": 0, + "oldest": None, + "newest": None, + "cache_dir": str(CACHE_DIR), + "exists": False + } + + entries = [p for p in CACHE_DIR.glob("*.json") if p.name != PROVIDER_HEALTH_FILE.name] + total_size = 0 + oldest_time = None + newest_time = None + oldest_query = None + newest_query = None + provider_counts = {} + + for cache_file in entries: + try: + stat = cache_file.stat() + total_size += stat.st_size + + with open(cache_file, "r", encoding="utf-8") as f: + cached = json.load(f) + + ts = cached.get("_cache_timestamp", 0) + query = cached.get("_cache_query", "unknown") + provider = cached.get("_cache_provider", "unknown") + + provider_counts[provider] = provider_counts.get(provider, 0) + 1 + + if oldest_time is None or ts < oldest_time: + oldest_time = ts + oldest_query = query + if newest_time is None or ts > newest_time: + newest_time = ts + newest_query = query + except (json.JSONDecodeError, IOError): + pass + + return { + "total_entries": len(entries), + "total_size_bytes": total_size, + "total_size_kb": round(total_size / 1024, 2), + "providers": provider_counts, + "oldest": { + "timestamp": oldest_time, + "age_seconds": int(time.time() - oldest_time) if oldest_time else None, + "query": oldest_query + } if oldest_time else None, + "newest": { + "timestamp": newest_time, + "age_seconds": int(time.time() - newest_time) if newest_time else None, + "query": newest_query + } if newest_time else None, + "cache_dir": str(CACHE_DIR), + "exists": True + } + + +# ============================================================================= +# Auto-load .env from skill directory (if exists) +# ============================================================================= +def _load_env_file(): + """Load .env file from skill root directory if it exists.""" + env_path = Path(__file__).parent.parent / ".env" + if 
env_path.exists(): + with open(env_path) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + # Handle export VAR=value or VAR=value + if line.startswith("export "): + line = line[7:] + key, _, value = line.partition("=") + key = key.strip() + value = value.strip().strip('"').strip("'") + if key and key not in os.environ: + os.environ[key] = value + +_load_env_file() + + +# ============================================================================= +# Configuration +# ============================================================================= + +DEFAULT_CONFIG = { + "defaults": { + "provider": "serper", + "max_results": 5 + }, + "auto_routing": { + "enabled": True, + "fallback_provider": "serper", + "provider_priority": ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"], + "disabled_providers": [], + "confidence_threshold": 0.3, # Below this, note low confidence + }, + "serper": { + "country": "us", + "language": "en", + "type": "search" + }, + "tavily": { + "depth": "basic", + "topic": "general" + }, + "querit": { + "base_url": "https://api.querit.ai", + "base_path": "/v1/search", + "timeout": 10 + }, + "exa": { + "type": "neural", + "depth": "normal", + "verbosity": "standard" + }, + "perplexity": { + "api_url": "https://api.kilo.ai/api/gateway/chat/completions", + "model": "perplexity/sonar-pro" + }, + "you": { + "country": "us", + "safesearch": "moderate" + }, + "searxng": { + "instance_url": None, # Required - user must set their own instance + "safesearch": 0, # 0=off, 1=moderate, 2=strict + "engines": None, # Optional list of engines to use + "language": "en" + } +} + + +def load_config() -> Dict[str, Any]: + """Load configuration from config.json if it exists, with defaults.""" + config = DEFAULT_CONFIG.copy() + config_path = Path(__file__).parent.parent / "config.json" + + if config_path.exists(): + try: + with open(config_path) as f: + user_config = json.load(f) + for key, value in 
user_config.items(): + if isinstance(value, dict) and key in config: + config[key] = {**config.get(key, {}), **value} + else: + config[key] = value + except (json.JSONDecodeError, IOError) as e: + print(json.dumps({ + "warning": f"Could not load config.json: {e}", + "using": "default configuration" + }), file=sys.stderr) + + return config + + +def get_api_key(provider: str, config: Dict[str, Any] = None) -> Optional[str]: + """Get API key for provider from config.json or environment. + + Priority: config.json > .env > environment variable + + Note: SearXNG doesn't require an API key, but returns instance_url if configured. + """ + # Special case: SearXNG uses instance_url instead of API key + if provider == "searxng": + return get_searxng_instance_url(config) + + # Check config.json first + if config: + provider_config = config.get(provider, {}) + if isinstance(provider_config, dict): + key = provider_config.get("api_key") or provider_config.get("apiKey") + if key: + return key + + # Then check environment + if provider == "perplexity": + return os.environ.get("PERPLEXITY_API_KEY") or os.environ.get("KILOCODE_API_KEY") + key_map = { + "serper": "SERPER_API_KEY", + "tavily": "TAVILY_API_KEY", + "querit": "QUERIT_API_KEY", + "exa": "EXA_API_KEY", + "you": "YOU_API_KEY", + } + return os.environ.get(key_map.get(provider, "")) + + +def _validate_searxng_url(url: str) -> str: + """Validate and sanitize SearXNG instance URL to prevent SSRF. + + Enforces http/https scheme and blocks requests to private/internal networks + including cloud metadata endpoints, loopback, link-local, and RFC1918 ranges. 
+ """ + import ipaddress + import socket + from urllib.parse import urlparse + + parsed = urlparse(url) + if parsed.scheme not in ("http", "https"): + raise ValueError(f"SearXNG URL must use http or https scheme, got: {parsed.scheme}") + if not parsed.hostname: + raise ValueError("SearXNG URL must include a hostname") + + hostname = parsed.hostname + + # Block cloud metadata endpoints by hostname + BLOCKED_HOSTS = { + "169.254.169.254", # AWS/GCP/Azure metadata + "metadata.google.internal", + "metadata.internal", + } + if hostname in BLOCKED_HOSTS: + raise ValueError(f"SearXNG URL blocked: {hostname} is a cloud metadata endpoint") + + # Resolve hostname and check for private/internal IPs + # Operators who intentionally self-host on private networks can opt out + allow_private = os.environ.get("SEARXNG_ALLOW_PRIVATE", "").strip() == "1" + if not allow_private: + try: + resolved_ips = socket.getaddrinfo(hostname, parsed.port or 80, proto=socket.IPPROTO_TCP) + for family, _type, _proto, _canonname, sockaddr in resolved_ips: + ip = ipaddress.ip_address(sockaddr[0]) + if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved: + raise ValueError( + f"SearXNG URL blocked: {hostname} resolves to private/internal IP {ip}. " + f"If this is intentional, set SEARXNG_ALLOW_PRIVATE=1 in your environment." + ) + except socket.gaierror: + raise ValueError(f"SearXNG URL blocked: cannot resolve hostname {hostname}") + + return url + + +def get_searxng_instance_url(config: Dict[str, Any] = None) -> Optional[str]: + """Get SearXNG instance URL from config or environment. + + SearXNG is self-hosted, so no API key needed - just the instance URL. + Priority: config.json > SEARXNG_INSTANCE_URL environment variable + + Security: URL is validated to prevent SSRF via scheme enforcement. + Both config sources (config.json, env var) are operator-controlled, + not agent-controlled, so private IPs like localhost are permitted. 
+ """ + # Check config.json first + if config: + searxng_config = config.get("searxng", {}) + if isinstance(searxng_config, dict): + url = searxng_config.get("instance_url") + if url: + return _validate_searxng_url(url) + + # Then check environment + env_url = os.environ.get("SEARXNG_INSTANCE_URL") + if env_url: + return _validate_searxng_url(env_url) + return None + + +# Backward compatibility alias +def get_env_key(provider: str) -> Optional[str]: + """Get API key for provider from environment (legacy function).""" + return get_api_key(provider) + + +def validate_api_key(provider: str, config: Dict[str, Any] = None) -> str: + """Validate and return API key (or instance URL for SearXNG), with helpful error messages.""" + key = get_api_key(provider, config) + + # Special handling for SearXNG - it needs instance URL, not API key + if provider == "searxng": + if not key: + error_msg = { + "error": "Missing SearXNG instance URL", + "env_var": "SEARXNG_INSTANCE_URL", + "how_to_fix": [ + "1. Set up your own SearXNG instance: https://docs.searxng.org/admin/installation.html", + "2. Add to config.json: \"searxng\": {\"instance_url\": \"https://your-instance.example.com\"}", + "3. 
Or set environment variable: export SEARXNG_INSTANCE_URL=\"https://your-instance.example.com\"", + "Note: SearXNG requires a self-hosted instance with JSON format enabled.", + ], + "provider": provider + } + raise ProviderConfigError(json.dumps(error_msg)) + + # Validate URL format + if not key.startswith(("http://", "https://")): + raise ProviderConfigError(json.dumps({ + "error": "SearXNG instance URL must start with http:// or https://", + "provided": key, + "provider": provider + })) + + return key + + if not key: + env_var = { + "serper": "SERPER_API_KEY", + "tavily": "TAVILY_API_KEY", + "querit": "QUERIT_API_KEY", + "exa": "EXA_API_KEY", + "you": "YOU_API_KEY", + "perplexity": "KILOCODE_API_KEY" + }[provider] + + urls = { + "serper": "https://serper.dev", + "tavily": "https://tavily.com", + "querit": "https://querit.ai", + "exa": "https://exa.ai", + "you": "https://api.you.com", + "perplexity": "https://api.kilo.ai" + } + + error_msg = { + "error": f"Missing API key for {provider}", + "env_var": env_var, + "how_to_fix": [ + f"1. Get your API key from {urls[provider]}", + f"2. Add to config.json: \"{provider}\": {{\"api_key\": \"your-key\"}}", + f"3. Or set environment variable: export {env_var}=\"your-key\"", + ], + "provider": provider + } + raise ProviderConfigError(json.dumps(error_msg)) + + if len(key) < 10: + raise ProviderConfigError(json.dumps({ + "error": f"API key for {provider} appears invalid (too short)", + "provider": provider + })) + + return key + + +# ============================================================================= +# Intelligent Auto-Routing Engine +# ============================================================================= + +class QueryAnalyzer: + """ + Intelligent query analysis for smart provider routing. 
+ + Uses multi-signal analysis: + - Intent classification (shopping, research, discovery, local, news) + - Linguistic patterns (question structure, phrase patterns) + - Entity detection (products, brands, URLs, dates) + - Complexity assessment + """ + + # Intent signal patterns with weights + # Higher weight = stronger signal for that provider + + SHOPPING_SIGNALS = { + # Price patterns (very strong) + r'\bhow much\b': 4.0, + r'\bprice of\b': 4.0, + r'\bcost of\b': 4.0, + r'\bprices?\b': 3.0, + r'\$\d+|\d+\s*dollars?': 3.0, + r'€\d+|\d+\s*euros?': 3.0, + r'Β£\d+|\d+\s*pounds?': 3.0, + + # German price patterns (sehr stark) + r'\bpreis(e)?\b': 3.5, + r'\bkosten\b': 3.0, + r'\bwieviel\b': 3.5, + r'\bwie viel\b': 3.5, + r'\bwas kostet\b': 4.0, + + # Purchase intent (strong) + r'\bbuy\b': 3.5, + r'\bpurchase\b': 3.5, + r'\border\b(?!\s+by)': 3.0, # "order" but not "order by" + r'\bshopping\b': 3.5, + r'\bshop for\b': 3.5, + r'\bwhere to (buy|get|purchase)\b': 4.0, + + # German purchase intent (stark) + r'\bkaufen\b': 3.5, + r'\bbestellen\b': 3.5, + r'\bwo kaufen\b': 4.0, + r'\bhΓ€ndler\b': 3.0, + r'\bshop\b': 2.5, + + # Deal/discount signals + r'\bdeal(s)?\b': 3.0, + r'\bdiscount(s)?\b': 3.0, + r'\bsale\b': 2.5, + r'\bcheap(er|est)?\b': 3.0, + r'\baffordable\b': 2.5, + r'\bbudget\b': 2.5, + r'\bbest price\b': 3.5, + r'\bcompare prices\b': 3.5, + r'\bcoupon\b': 3.0, + + # German deal/discount signals + r'\bgΓΌnstig(er|ste)?\b': 3.0, + r'\bbillig(er|ste)?\b': 3.0, + r'\bangebot(e)?\b': 3.0, + r'\brabatt\b': 3.0, + r'\baktion\b': 2.5, + r'\bschnΓ€ppchen\b': 3.0, + + # Product comparison + r'\bvs\.?\b': 2.0, + r'\bversus\b': 2.0, + r'\bor\b.*\bwhich\b': 2.0, + r'\bspecs?\b': 2.5, + r'\bspecifications?\b': 2.5, + r'\breview(s)?\b': 2.0, + r'\brating(s)?\b': 2.0, + r'\bunboxing\b': 2.5, + + # German product comparison + r'\btest\b': 2.5, + r'\bbewertung(en)?\b': 2.5, + r'\btechnische daten\b': 3.0, + r'\bspezifikationen\b': 2.5, + } + + RESEARCH_SIGNALS = { + # Explanation 
patterns (very strong) + r'\bhow does\b': 4.0, + r'\bhow do\b': 3.5, + r'\bwhy does\b': 4.0, + r'\bwhy do\b': 3.5, + r'\bwhy is\b': 3.5, + r'\bexplain\b': 4.0, + r'\bexplanation\b': 4.0, + r'\bwhat is\b': 3.0, + r'\bwhat are\b': 3.0, + r'\bdefine\b': 3.5, + r'\bdefinition of\b': 3.5, + r'\bmeaning of\b': 3.0, + + # Analysis patterns (strong) + r'\banalyze\b': 3.5, + r'\banalysis\b': 3.5, + r'\bcompare\b(?!\s*prices?)': 3.0, # compare but not "compare prices" + r'\bcomparison\b': 3.0, + r'\bstatus of\b': 3.5, + r'\bstatus\b': 2.5, + r'\bwhat happened with\b': 4.0, + r'\bpros and cons\b': 4.0, + r'\badvantages?\b': 3.0, + r'\bdisadvantages?\b': 3.0, + r'\bbenefits?\b': 2.5, + r'\bdrawbacks?\b': 3.0, + r'\bdifference between\b': 3.5, + + # Learning patterns + r'\bunderstand\b': 3.0, + r'\blearn(ing)?\b': 2.5, + r'\btutorial\b': 3.0, + r'\bguide\b': 2.5, + r'\bhow to\b': 2.0, # Lower weight - could be shopping too + r'\bstep by step\b': 3.0, + + # Depth signals + r'\bin[- ]depth\b': 3.0, + r'\bdetailed\b': 2.5, + r'\bcomprehensive\b': 3.0, + r'\bthorough\b': 2.5, + r'\bdeep dive\b': 3.5, + r'\boverall\b': 2.0, + r'\bsummary\b': 2.0, + + # Academic patterns + r'\bstudy\b': 2.5, + r'\bresearch shows\b': 3.5, + r'\baccording to\b': 2.5, + r'\bevidence\b': 3.0, + r'\bscientific\b': 3.0, + r'\bhistory of\b': 3.0, + r'\bbackground\b': 2.5, + r'\bcontext\b': 2.5, + r'\bimplications?\b': 3.0, + + # German explanation patterns (sehr stark) + r'\bwie funktioniert\b': 4.0, + r'\bwarum\b': 3.5, + r'\berklΓ€r(en|ung)?\b': 4.0, + r'\bwas ist\b': 3.0, + r'\bwas sind\b': 3.0, + r'\bbedeutung\b': 3.0, + + # German analysis patterns + r'\banalyse\b': 3.5, + r'\bvergleich(en)?\b': 3.0, + r'\bvor- und nachteile\b': 4.0, + r'\bvorteile\b': 3.0, + r'\bnachteile\b': 3.0, + r'\bunterschied(e)?\b': 3.5, + + # German learning patterns + r'\bverstehen\b': 3.0, + r'\blernen\b': 2.5, + r'\banleitung\b': 3.0, + r'\bΓΌbersicht\b': 2.5, + r'\bhintergrund\b': 2.5, + r'\bzusammenfassung\b': 2.5, + } + 
+ DISCOVERY_SIGNALS = { + # Similarity patterns (very strong) + r'\bsimilar to\b': 5.0, + r'\blike\s+\w+\.com': 4.5, # "like notion.com" + r'\balternatives? to\b': 5.0, + r'\bcompetitors? (of|to)\b': 4.5, + r'\bcompeting with\b': 4.0, + r'\brivals? (of|to)\b': 4.0, + r'\binstead of\b': 3.0, + r'\breplacement for\b': 3.5, + + # Company/startup patterns (strong) + r'\bcompanies (like|that|doing|building)\b': 4.5, + r'\bstartups? (like|that|doing|building)\b': 4.5, + r'\bwho else\b': 4.0, + r'\bother (companies|startups|tools|apps)\b': 3.5, + r'\bfind (companies|startups|tools|examples?)\b': 4.5, + r'\bevents? in\b': 4.0, + r'\bthings to do in\b': 4.5, + + # Funding/business patterns + r'\bseries [a-d]\b': 4.0, + r'\byc\b|y combinator': 4.0, + r'\bfund(ed|ing|raise)\b': 3.5, + r'\bventure\b': 3.0, + r'\bvaluation\b': 3.0, + + # Category patterns + r'\bresearch papers? (on|about)\b': 4.0, + r'\barxiv\b': 4.5, + r'\bgithub (projects?|repos?)\b': 4.5, + r'\bopen source\b.*\bprojects?\b': 4.0, + r'\btweets? (about|on)\b': 3.5, + r'\bblogs? (about|on|like)\b': 3.0, + + # URL detection (very strong signal for Exa similar) + r'https?://[^\s]+': 5.0, + r'\b\w+\.(com|org|io|ai|co|dev)\b': 3.5, + } + + LOCAL_NEWS_SIGNALS = { + # Local patterns β†’ Serper + r'\bnear me\b': 4.0, + r'\bnearby\b': 3.5, + r'\blocal\b': 3.0, + r'\bin (my )?(city|area|town|neighborhood)\b': 3.5, + r'\brestaurants?\b': 2.5, + r'\bhotels?\b': 2.5, + r'\bcafes?\b': 2.5, + r'\bstores?\b': 2.0, + r'\bdirections? to\b': 3.5, + r'\bmap of\b': 3.0, + r'\bphone number\b': 3.0, + r'\baddress of\b': 3.0, + r'\bopen(ing)? 
hours\b': 3.0, + + # Weather/time + r'\bweather\b': 4.0, + r'\bforecast\b': 3.5, + r'\btemperature\b': 3.0, + r'\btime in\b': 3.0, + + # News/recency patterns β†’ Serper (or Tavily for news depth) + r'\blatest\b': 2.5, + r'\brecent\b': 2.5, + r'\btoday\b': 2.5, + r'\bbreaking\b': 3.5, + r'\bnews\b': 2.5, + r'\bheadlines?\b': 3.0, + r'\b202[4-9]\b': 2.0, # Current year mentions + r'\blast (week|month|year)\b': 2.0, + + # German local patterns + r'\bin der nΓ€he\b': 4.0, + r'\bin meiner nΓ€he\b': 4.0, + r'\bΓΆffnungszeiten\b': 3.0, + r'\badresse von\b': 3.0, + r'\bweg(beschreibung)? nach\b': 3.5, + + # German news/recency patterns + r'\bheute\b': 2.5, + r'\bmorgen\b': 2.0, + r'\baktuell\b': 2.5, + r'\bnachrichten\b': 3.0, + } + + # RAG/AI signals β†’ You.com + # You.com excels at providing LLM-ready snippets and combined web+news + RAG_SIGNALS = { + # RAG/context patterns (strong signal for You.com) + r'\brag\b': 4.5, + r'\bcontext for\b': 4.0, + r'\bsummarize\b': 3.5, + r'\bbrief(ly)?\b': 3.0, + r'\bquick overview\b': 3.5, + r'\btl;?dr\b': 4.0, + r'\bkey (points|facts|info)\b': 3.5, + r'\bmain (points|takeaways)\b': 3.5, + + # Combined web + news queries + r'\b(web|online)\s+and\s+news\b': 4.0, + r'\ball sources\b': 3.5, + r'\bcomprehensive (search|overview)\b': 3.5, + r'\blatest\s+(news|updates)\b': 3.0, + r'\bcurrent (events|situation|status)\b': 3.5, + + # Real-time information needs + r'\bright now\b': 3.0, + r'\bas of today\b': 3.5, + r'\bup.to.date\b': 3.5, + r'\breal.time\b': 4.0, + r'\blive\b': 2.5, + + # Information synthesis + r'\bwhat\'?s happening with\b': 3.5, + r'\bwhat\'?s the latest\b': 4.0, + r'\bupdates?\s+on\b': 3.5, + r'\bstatus of\b': 3.0, + r'\bsituation (in|with|around)\b': 3.5, + } + + # Direct answer / synthesis signals β†’ Perplexity via Kilo Gateway + DIRECT_ANSWER_SIGNALS = { + r'\bwhat is\b': 3.0, + r'\bwhat are\b': 2.5, + r'\bcurrent status\b': 4.0, + r'\bstatus of\b': 3.5, + r'\bstatus\b': 2.5, + r'\bwhat happened with\b': 4.0, + 
r"\bwhat'?s happening with\b": 4.0, + r'\bas of (today|now)\b': 4.0, + r'\bthis weekend\b': 3.5, + r'\bevents? in\b': 3.5, + r'\bthings to do in\b': 4.0, + r'\bnear me\b': 3.0, + r'\bcan you (tell me|summarize|explain)\b': 3.5, + # German + r'\bwann\b': 3.0, + r'\bwer\b': 3.0, + r'\bwo\b': 2.5, + r'\bwie viele\b': 3.0, + } + + # Privacy/Multi-source signals β†’ SearXNG (self-hosted meta-search) + # SearXNG is ideal for privacy-focused queries and aggregating multiple sources + PRIVACY_SIGNALS = { + # Privacy signals (very strong) + r'\bprivate(ly)?\b': 4.0, + r'\banonymous(ly)?\b': 4.0, + r'\bwithout tracking\b': 4.5, + r'\bno track(ing)?\b': 4.5, + r'\bprivacy\b': 3.5, + r'\bprivacy.?focused\b': 4.5, + r'\bprivacy.?first\b': 4.5, + r'\bduckduckgo alternative\b': 4.5, + r'\bprivate search\b': 5.0, + + # German privacy signals + r'\bprivat\b': 4.0, + r'\banonym\b': 4.0, + r'\bohne tracking\b': 4.5, + r'\bdatenschutz\b': 4.0, + + # Multi-source aggregation signals + r'\baggregate results?\b': 4.0, + r'\bmultiple sources?\b': 4.0, + r'\bdiverse (results|perspectives|sources)\b': 4.0, + r'\bfrom (all|multiple|different) (engines?|sources?)\b': 4.5, + r'\bmeta.?search\b': 5.0, + r'\ball engines?\b': 4.0, + + # German multi-source signals + r'\bverschiedene quellen\b': 4.0, + r'\baus mehreren quellen\b': 4.0, + r'\balle suchmaschinen\b': 4.5, + + # Budget/free signals (SearXNG is self-hosted = $0 API cost) + r'\bfree search\b': 3.5, + r'\bno api cost\b': 4.0, + r'\bself.?hosted search\b': 5.0, + r'\bzero cost\b': 3.5, + r'\bbudget\b(?!\s*(laptop|phone|option))\b': 2.5, # "budget" alone, not "budget laptop" + + # German budget signals + r'\bkostenlos(e)?\s+suche\b': 3.5, + r'\bkeine api.?kosten\b': 4.0, + } + + # Exa Deep Search signals β†’ deep multi-source synthesis + EXA_DEEP_SIGNALS = { + r'\bsynthesi[sz]e\b': 5.0, + r'\bdeep research\b': 5.0, + r'\bcomprehensive (analysis|report|overview|survey)\b': 4.5, + r'\bacross (multiple|many|several) 
(sources|documents|papers)\b': 4.5, + r'\baggregat(e|ing) (information|data|results)\b': 4.0, + r'\bcross.?referenc': 4.5, + r'\bsec filings?\b': 4.5, + r'\bannual reports?\b': 4.0, + r'\bearnings (call|report|transcript)\b': 4.5, + r'\bfinancial analysis\b': 4.0, + r'\bliterature (review|survey)\b': 5.0, + r'\bacademic literature\b': 4.5, + r'\bstate of the (art|field|industry)\b': 4.0, + r'\bcompile (a |the )?(report|findings|results)\b': 4.5, + r'\bsummariz(e|ing) (research|papers|studies)\b': 4.0, + r'\bmultiple documents?\b': 4.0, + r'\bdossier\b': 4.5, + r'\bdue diligence\b': 4.5, + r'\bstructured (output|data|report)\b': 4.0, + r'\bmarket research\b': 4.0, + r'\bindustry (report|analysis|overview)\b': 4.0, + r'\bresearch (on|about|into)\b': 4.0, + r'\bwhitepaper\b': 4.5, + r'\btechnical report\b': 4.0, + r'\bsurvey of\b': 4.5, + r'\bmeta.?analysis\b': 5.0, + r'\bsystematic review\b': 5.0, + r'\bcase study\b': 3.5, + r'\bbenchmark(s|ing)?\b': 3.5, + # German + r'\btiefenrecherche\b': 5.0, + r'\bumfassende (analyse|ΓΌbersicht|recherche)\b': 4.5, + r'\baus mehreren quellen zusammenfassen\b': 4.5, + r'\bmarktforschung\b': 4.0, + } + + # Exa Deep Reasoning signals β†’ complex cross-reference analysis + EXA_DEEP_REASONING_SIGNALS = { + r'\bdeep.?reasoning\b': 6.0, + r'\bcomplex (analysis|reasoning|research)\b': 4.5, + r'\bcontradictions?\b': 4.5, + r'\breconcil(e|ing)\b': 5.0, + r'\bcritical(ly)? analyz': 4.5, + r'\bweigh(ing)? 
(the )?evidence\b': 4.5, + r'\bcompeting (claims|theories|perspectives)\b': 4.5, + r'\bcomplex financial\b': 4.5, + r'\bregulatory (analysis|compliance|landscape)\b': 4.5, + r'\blegal analysis\b': 4.5, + r'\bcomprehensive (due diligence|investigation)\b': 5.0, + r'\bpatent (landscape|analysis|search)\b': 4.5, + r'\bmarket intelligence\b': 4.5, + r'\bcompetitive (intelligence|landscape)\b': 4.5, + r'\btrade.?offs?\b': 4.0, + r'\bpros and cons of\b': 4.0, + r'\bshould I (use|choose|pick)\b': 3.5, + r'\bwhich is better\b': 4.0, + # German + r'\bkomplexe analyse\b': 4.5, + r'\bwidersprΓΌche\b': 4.5, + r'\bquellen abwΓ€gen\b': 4.5, + r'\brechtliche analyse\b': 4.5, + r'\bvergleich(e|en)?\b': 3.5, + } + + + # Brand/product patterns for shopping detection + BRAND_PATTERNS = [ + # Tech brands + r'\b(apple|iphone|ipad|macbook|airpods?)\b', + r'\b(samsung|galaxy)\b', + r'\b(google|pixel)\b', + r'\b(microsoft|surface|xbox)\b', + r'\b(sony|playstation)\b', + r'\b(nvidia|geforce|rtx)\b', + r'\b(amd|ryzen|radeon)\b', + r'\b(intel|core i[3579])\b', + r'\b(dell|hp|lenovo|asus|acer)\b', + r'\b(lg|tcl|hisense)\b', + + # Product categories + r'\b(laptop|phone|tablet|tv|monitor|headphones?|earbuds?)\b', + r'\b(camera|lens|drone)\b', + r'\b(watch|smartwatch|fitbit|garmin)\b', + r'\b(router|modem|wifi)\b', + r'\b(keyboard|mouse|gaming)\b', + ] + + def __init__(self, config: Dict[str, Any]): + self.config = config + self.auto_config = config.get("auto_routing", DEFAULT_CONFIG["auto_routing"]) + + def _calculate_signal_score( + self, + query: str, + signals: Dict[str, float] + ) -> Tuple[float, List[Dict[str, Any]]]: + """ + Calculate score for a signal category. + Returns (total_score, list of matched signals with details). 
+ """ + query_lower = query.lower() + matches = [] + total_score = 0.0 + + for pattern, weight in signals.items(): + regex = re.compile(pattern, re.IGNORECASE) + found = regex.findall(query_lower) + if found: + # Normalize found matches + match_text = found[0] if isinstance(found[0], str) else found[0][0] if found[0] else pattern + matches.append({ + "pattern": pattern, + "matched": match_text, + "weight": weight + }) + total_score += weight + + return total_score, matches + + def _detect_product_brand_combo(self, query: str) -> float: + """ + Detect product + brand combinations which strongly indicate shopping intent. + Returns a bonus score. + """ + query_lower = query.lower() + brand_found = False + product_found = False + + for pattern in self.BRAND_PATTERNS: + if re.search(pattern, query_lower, re.IGNORECASE): + brand_found = True + break + + # Check for product indicators + product_indicators = [ + r'\b(buy|price|specs?|review|vs|compare)\b', + r'\b(pro|max|plus|mini|ultra|lite)\b', # Product tier names + r'\b\d+\s*(gb|tb|inch|mm|hz)\b', # Specifications + ] + for pattern in product_indicators: + if re.search(pattern, query_lower, re.IGNORECASE): + product_found = True + break + + if brand_found and product_found: + return 3.0 # Strong shopping signal + elif brand_found: + return 1.5 # Moderate shopping signal + return 0.0 + + def _detect_url(self, query: str) -> Optional[str]: + """Detect URLs in query - strong signal for Exa similar search.""" + url_pattern = r'https?://[^\s]+' + match = re.search(url_pattern, query) + if match: + return match.group() + + # Also check for domain-like patterns + domain_pattern = r'\b(\w+\.(com|org|io|ai|co|dev|net|app))\b' + match = re.search(domain_pattern, query, re.IGNORECASE) + if match: + return match.group() + + return None + + def _assess_query_complexity(self, query: str) -> Dict[str, Any]: + """ + Assess query complexity - complex queries favor Tavily. 
+ """ + words = query.split() + word_count = len(words) + + # Count question words + question_words = len(re.findall( + r'\b(what|why|how|when|where|which|who|whose|whom)\b', + query, re.IGNORECASE + )) + + # Check for multiple clauses + clause_markers = len(re.findall( + r'\b(and|but|or|because|since|while|although|if|when)\b', + query, re.IGNORECASE + )) + + complexity_score = 0.0 + if word_count > 10: + complexity_score += 1.5 + if word_count > 20: + complexity_score += 1.0 + if question_words > 1: + complexity_score += 1.0 + if clause_markers > 0: + complexity_score += 0.5 * clause_markers + + return { + "word_count": word_count, + "question_words": question_words, + "clause_markers": clause_markers, + "complexity_score": complexity_score, + "is_complex": complexity_score > 2.0 + } + + def _detect_recency_intent(self, query: str) -> Tuple[bool, float]: + """ + Detect if query wants recent/timely information. + Returns (is_recency_focused, score). + """ + recency_patterns = [ + (r'\b(latest|newest|recent|current)\b', 2.5), + (r'\b(today|yesterday|this week|this month)\b', 3.0), + (r'\b(202[4-9]|2030)\b', 2.0), + (r'\b(breaking|live|just|now)\b', 3.0), + (r'\blast (hour|day|week|month)\b', 2.5), + ] + + total = 0.0 + for pattern, weight in recency_patterns: + if re.search(pattern, query, re.IGNORECASE): + total += weight + + return total > 2.0, total + + def analyze(self, query: str) -> Dict[str, Any]: + """ + Perform comprehensive query analysis. + Returns detailed analysis with scores for each provider. 
+ """ + # Calculate scores for each intent category + shopping_score, shopping_matches = self._calculate_signal_score( + query, self.SHOPPING_SIGNALS + ) + research_score, research_matches = self._calculate_signal_score( + query, self.RESEARCH_SIGNALS + ) + discovery_score, discovery_matches = self._calculate_signal_score( + query, self.DISCOVERY_SIGNALS + ) + local_news_score, local_news_matches = self._calculate_signal_score( + query, self.LOCAL_NEWS_SIGNALS + ) + rag_score, rag_matches = self._calculate_signal_score( + query, self.RAG_SIGNALS + ) + privacy_score, privacy_matches = self._calculate_signal_score( + query, self.PRIVACY_SIGNALS + ) + direct_answer_score, direct_answer_matches = self._calculate_signal_score( + query, self.DIRECT_ANSWER_SIGNALS + ) + exa_deep_score, exa_deep_matches = self._calculate_signal_score( + query, self.EXA_DEEP_SIGNALS + ) + exa_deep_reasoning_score, exa_deep_reasoning_matches = self._calculate_signal_score( + query, self.EXA_DEEP_REASONING_SIGNALS + ) + + # Apply product/brand bonus to shopping + brand_bonus = self._detect_product_brand_combo(query) + if brand_bonus > 0: + shopping_score += brand_bonus + shopping_matches.append({ + "pattern": "product_brand_combo", + "matched": "brand + product detected", + "weight": brand_bonus + }) + + # Detect URL β†’ strong Exa signal + detected_url = self._detect_url(query) + if detected_url: + discovery_score += 5.0 + discovery_matches.append({ + "pattern": "url_detected", + "matched": detected_url, + "weight": 5.0 + }) + + # Assess complexity β†’ favors Tavily + complexity = self._assess_query_complexity(query) + if complexity["is_complex"]: + research_score += complexity["complexity_score"] + research_matches.append({ + "pattern": "query_complexity", + "matched": f"complex query ({complexity['word_count']} words)", + "weight": complexity["complexity_score"] + }) + + # Check recency intent + is_recency, recency_score = self._detect_recency_intent(query) + + # Map intents to providers 
with final scores + provider_scores = { + "serper": shopping_score + local_news_score + (recency_score * 0.35), + "tavily": research_score + (complexity["complexity_score"] if not complexity["is_complex"] else 0) + (0.2 * recency_score), + "querit": (research_score * 0.65) + (rag_score * 0.35) + (recency_score * 0.45), + "exa": discovery_score + (1.0 if re.search(r"\b(similar|alternatives?|examples?)\b", query, re.IGNORECASE) else 0.0) + (exa_deep_score * 0.5) + (exa_deep_reasoning_score * 0.5), + "perplexity": direct_answer_score + (local_news_score * 0.4) + (recency_score * 0.55), + "you": rag_score + (recency_score * 0.25), # You.com good for real-time + RAG + "searxng": privacy_score, # SearXNG for privacy/multi-source queries + } + + # Build match details per provider + provider_matches = { + "serper": shopping_matches + local_news_matches, + "tavily": research_matches, + "querit": research_matches, + "exa": discovery_matches + exa_deep_matches + exa_deep_reasoning_matches, + "perplexity": direct_answer_matches, + "you": rag_matches, + "searxng": privacy_matches, + } + + return { + "query": query, + "provider_scores": provider_scores, + "provider_matches": provider_matches, + "detected_url": detected_url, + "complexity": complexity, + "recency_focused": is_recency, + "recency_score": recency_score, + "exa_deep_score": exa_deep_score, + "exa_deep_reasoning_score": exa_deep_reasoning_score, + } + + def route(self, query: str) -> Dict[str, Any]: + """ + Route query to optimal provider with confidence scoring. 
+ """ + analysis = self.analyze(query) + scores = analysis["provider_scores"] + + # Filter to available providers + disabled = set(self.auto_config.get("disabled_providers", [])) + available = { + p: s for p, s in scores.items() + if p not in disabled and get_api_key(p, self.config) + } + + if not available: + # No providers available, use fallback + fallback = self.auto_config.get("fallback_provider", "serper") + return { + "provider": fallback, + "confidence": 0.0, + "confidence_level": "low", + "reason": "no_available_providers", + "scores": scores, + "top_signals": [], + "analysis": analysis, + } + + # Find the winner + max_score = max(available.values()) + total_score = sum(available.values()) or 1.0 + + # Handle ties using priority + priority = self.auto_config.get("provider_priority", ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"]) + winners = [p for p, s in available.items() if s == max_score] + + if len(winners) > 1: + # Use priority to break tie + for p in priority: + if p in winners: + winner = p + break + else: + winner = winners[0] + else: + winner = winners[0] + + # Calculate confidence + # High confidence = clear winner with good margin + if max_score == 0: + confidence = 0.0 + reason = "no_signals_matched" + else: + # Confidence based on: + # 1. Absolute score (is it strong enough?) + # 2. Relative margin (is there a clear winner?) 
+ second_best = sorted(available.values(), reverse=True)[1] if len(available) > 1 else 0 + margin = (max_score - second_best) / max_score if max_score > 0 else 0 + + # Normalize score to 0-1 range (assuming max reasonable score ~15) + normalized_score = min(max_score / 15.0, 1.0) + + # Confidence is combination of absolute strength and relative margin + confidence = round((normalized_score * 0.6 + margin * 0.4), 3) + + if confidence >= 0.7: + reason = "high_confidence_match" + elif confidence >= 0.4: + reason = "moderate_confidence_match" + else: + reason = "low_confidence_match" + + # Get top signals for the winning provider + matches = analysis["provider_matches"].get(winner, []) + top_signals = sorted(matches, key=lambda x: x["weight"], reverse=True)[:5] + + # Special case: URL detected and Exa available β†’ strong recommendation + if analysis["detected_url"] and "exa" in available: + if winner != "exa": + # Override if URL is present but didn't win + # (user might want similar search) + pass # Keep current winner but note it + + # Determine Exa search depth when routed to Exa + exa_depth = "normal" + if winner == "exa": + deep_r_score = analysis.get("exa_deep_reasoning_score", 0) + deep_score = analysis.get("exa_deep_score", 0) + if deep_r_score >= 4.0: + exa_depth = "deep-reasoning" + elif deep_score >= 4.0: + exa_depth = "deep" + + # Build detailed routing result + threshold = self.auto_config.get("confidence_threshold", 0.3) + + return { + "provider": winner, + "confidence": confidence, + "confidence_level": "high" if confidence >= 0.7 else "medium" if confidence >= 0.4 else "low", + "reason": reason, + "exa_depth": exa_depth, + "scores": {p: round(s, 2) for p, s in available.items()}, + "winning_score": round(max_score, 2), + "top_signals": [ + {"matched": s["matched"], "weight": s["weight"]} + for s in top_signals + ], + "below_threshold": confidence < threshold, + "analysis_summary": { + "query_length": len(query.split()), + "is_complex": 
analysis["complexity"]["is_complex"], + "has_url": analysis["detected_url"] is not None, + "recency_focused": analysis["recency_focused"], + } + } + + +def auto_route_provider(query: str, config: Dict[str, Any]) -> Dict[str, Any]: + """ + Intelligently route query to the best provider. + Returns detailed routing decision with confidence. + """ + analyzer = QueryAnalyzer(config) + return analyzer.route(query) + + +def explain_routing(query: str, config: Dict[str, Any]) -> Dict[str, Any]: + """ + Provide detailed explanation of routing decision for debugging. + """ + analyzer = QueryAnalyzer(config) + analysis = analyzer.analyze(query) + routing = analyzer.route(query) + + return { + "query": query, + "routing_decision": { + "provider": routing["provider"], + "confidence": routing["confidence"], + "confidence_level": routing["confidence_level"], + "reason": routing["reason"], + "exa_depth": routing.get("exa_depth", "normal"), + }, + "scores": routing["scores"], + "top_signals": routing["top_signals"], + "intent_breakdown": { + "shopping_signals": len(analysis["provider_matches"]["serper"]), + "research_signals": len(analysis["provider_matches"]["tavily"]), + "querit_signals": len(analysis["provider_matches"]["querit"]), + "discovery_signals": len(analysis["provider_matches"]["exa"]), + "rag_signals": len(analysis["provider_matches"]["you"]), + "exa_deep_score": round(analysis.get("exa_deep_score", 0), 2), + "exa_deep_reasoning_score": round(analysis.get("exa_deep_reasoning_score", 0), 2), + }, + "query_analysis": { + "word_count": analysis["complexity"]["word_count"], + "is_complex": analysis["complexity"]["is_complex"], + "complexity_score": round(analysis["complexity"]["complexity_score"], 2), + "has_url": analysis["detected_url"], + "recency_focused": analysis["recency_focused"], + }, + "all_matches": { + provider: [ + {"matched": m["matched"], "weight": m["weight"]} + for m in matches + ] + for provider, matches in analysis["provider_matches"].items() + if 
matches + }, + "available_providers": [ + p for p in ["serper", "tavily", "querit", "exa", "perplexity", "you", "searxng"] + if get_api_key(p, config) and p not in config.get("auto_routing", {}).get("disabled_providers", []) + ] + } + + + + +class ProviderConfigError(Exception): + """Raised when a provider is missing or has an invalid API key/config.""" + pass + + +class ProviderRequestError(Exception): + """Structured provider error with retry/cooldown metadata.""" + + def __init__(self, message: str, status_code: Optional[int] = None, transient: bool = False): + super().__init__(message) + self.status_code = status_code + self.transient = transient + + +TRANSIENT_HTTP_CODES = {429, 503} +COOLDOWN_STEPS_SECONDS = [60, 300, 1500, 3600] # 1m -> 5m -> 25m -> 1h cap +RETRY_BACKOFF_SECONDS = [1, 3, 9] + + +def _ensure_parent(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + + +def _load_provider_health() -> Dict[str, Any]: + if not PROVIDER_HEALTH_FILE.exists(): + return {} + try: + with open(PROVIDER_HEALTH_FILE, "r", encoding="utf-8") as f: + data = json.load(f) + return data if isinstance(data, dict) else {} + except (json.JSONDecodeError, IOError): + return {} + + +def _save_provider_health(state: Dict[str, Any]) -> None: + _ensure_parent(PROVIDER_HEALTH_FILE) + with open(PROVIDER_HEALTH_FILE, "w", encoding="utf-8") as f: + json.dump(state, f, ensure_ascii=False, indent=2) + + +def provider_in_cooldown(provider: str) -> Tuple[bool, int]: + state = _load_provider_health() + pstate = state.get(provider, {}) + cooldown_until = int(pstate.get("cooldown_until", 0) or 0) + remaining = cooldown_until - int(time.time()) + return (remaining > 0, max(0, remaining)) + + +def mark_provider_failure(provider: str, error_message: str) -> Dict[str, Any]: + state = _load_provider_health() + now = int(time.time()) + pstate = state.get(provider, {}) + fail_count = int(pstate.get("failure_count", 0)) + 1 + cooldown_seconds = COOLDOWN_STEPS_SECONDS[min(fail_count 
- 1, len(COOLDOWN_STEPS_SECONDS) - 1)] + state[provider] = { + "failure_count": fail_count, + "cooldown_until": now + cooldown_seconds, + "cooldown_seconds": cooldown_seconds, + "last_error": error_message, + "last_failure_at": now, + } + _save_provider_health(state) + return state[provider] + + +def reset_provider_health(provider: str) -> None: + state = _load_provider_health() + if provider in state: + state.pop(provider, None) + _save_provider_health(state) + + +def _title_from_url(url: str) -> str: + """Derive a readable title from a URL when none is provided.""" + try: + parsed = urlparse(url) + domain = parsed.netloc.replace("www.", "") + # Use last meaningful path segment as context + segments = [s for s in parsed.path.strip("/").split("/") if s] + if segments: + last = segments[-1].replace("-", " ").replace("_", " ") + # Strip file extensions + last = re.sub(r'\.\w{2,4}$', '', last) + if last: + return f"{domain} β€” {last[:80]}" + return domain + except Exception: + return url[:60] + + +def normalize_result_url(url: str) -> str: + if not url: + return "" + parsed = urlparse(url.strip()) + netloc = (parsed.netloc or "").lower() + if netloc.startswith("www."): + netloc = netloc[4:] + path = parsed.path.rstrip("/") + return f"{netloc}{path}" + + +def deduplicate_results_across_providers(results_by_provider: List[Tuple[str, Dict[str, Any]]], max_results: int) -> Tuple[List[Dict[str, Any]], int]: + deduped = [] + seen = set() + dedup_count = 0 + for provider_name, data in results_by_provider: + for item in data.get("results", []): + norm = normalize_result_url(item.get("url", "")) + if norm and norm in seen: + dedup_count += 1 + continue + if norm: + seen.add(norm) + item = item.copy() + item.setdefault("provider", provider_name) + deduped.append(item) + if len(deduped) >= max_results: + return deduped, dedup_count + return deduped, dedup_count + +# ============================================================================= +# HTTP Client +# 
============================================================================= + +def make_request(url: str, headers: dict, body: dict, timeout: int = 30) -> dict: + """Make HTTP POST request and return JSON response.""" + # Ensure User-Agent is set (required by some APIs like Exa/Cloudflare) + if "User-Agent" not in headers: + headers["User-Agent"] = "ClawdBot-WebSearchPlus/2.1" + data = json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method="POST") + + try: + with urlopen(req, timeout=timeout) as response: + return json.loads(response.read().decode("utf-8")) + except HTTPError as e: + error_body = e.read().decode("utf-8") if e.fp else str(e) + try: + error_json = json.loads(error_body) + error_detail = error_json.get("error") or error_json.get("message") or error_body + except json.JSONDecodeError: + error_detail = error_body[:500] + + error_messages = { + 401: "Invalid or expired API key. Please check your credentials.", + 403: "Access forbidden. Your API key may not have permission for this operation.", + 429: "Rate limit exceeded. Please wait a moment and try again.", + 500: "Server error. The search provider is experiencing issues.", + 503: "Service unavailable. The search provider may be down." + } + + friendly_msg = error_messages.get(e.code, f"API error: {error_detail}") + raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES) + except URLError as e: + reason = str(getattr(e, "reason", e)) + is_timeout = "timed out" in reason.lower() + raise ProviderRequestError(f"Network error: {reason}. Check your internet connection.", transient=is_timeout) + except IncompleteRead as e: + partial_len = len(getattr(e, "partial", b"") or b"") + raise ProviderRequestError( + f"Connection interrupted while reading response ({partial_len} bytes received). 
Please retry.", + transient=True, + ) + except TimeoutError: + raise ProviderRequestError(f"Request timed out after {timeout}s. Try again or reduce max_results.", transient=True) + + +# ============================================================================= +# Serper (Google Search API) +# ============================================================================= + +def search_serper( + query: str, + api_key: str, + max_results: int = 5, + country: str = "us", + language: str = "en", + search_type: str = "search", + time_range: Optional[str] = None, + include_images: bool = False, +) -> dict: + """Search using Serper (Google Search API).""" + endpoint = f"https://google.serper.dev/{search_type}" + + body = { + "q": query, + "gl": country, + "hl": language, + "num": max_results, + "autocorrect": True, + } + + if time_range and time_range != "none": + tbs_map = { + "hour": "qdr:h", + "day": "qdr:d", + "week": "qdr:w", + "month": "qdr:m", + "year": "qdr:y", + } + if time_range in tbs_map: + body["tbs"] = tbs_map[time_range] + + headers = { + "X-API-KEY": api_key, + "Content-Type": "application/json", + } + + data = make_request(endpoint, headers, body) + + results = [] + for i, item in enumerate(data.get("organic", [])[:max_results]): + results.append({ + "title": item.get("title", ""), + "url": item.get("link", ""), + "snippet": item.get("snippet", ""), + "score": round(1.0 - i * 0.1, 2), + "date": item.get("date"), + }) + + answer = "" + if data.get("answerBox", {}).get("answer"): + answer = data["answerBox"]["answer"] + elif data.get("answerBox", {}).get("snippet"): + answer = data["answerBox"]["snippet"] + elif data.get("knowledgeGraph", {}).get("description"): + answer = data["knowledgeGraph"]["description"] + elif results: + answer = results[0]["snippet"] + + images = [] + if include_images: + try: + img_data = make_request( + "https://google.serper.dev/images", + headers, + {"q": query, "gl": country, "hl": language, "num": 5}, + ) + images = 
[img.get("imageUrl", "") for img in img_data.get("images", [])[:5] if img.get("imageUrl")] + except Exception: + pass + + return { + "provider": "serper", + "query": query, + "results": results, + "images": images, + "answer": answer, + "knowledge_graph": data.get("knowledgeGraph"), + "related_searches": [r.get("query") for r in data.get("relatedSearches", [])] + } + + +# ============================================================================= +# Tavily (Research Search) +# ============================================================================= + +def search_tavily( + query: str, + api_key: str, + max_results: int = 5, + depth: str = "basic", + topic: str = "general", + include_domains: Optional[List[str]] = None, + exclude_domains: Optional[List[str]] = None, + include_images: bool = False, + include_raw_content: bool = False, +) -> dict: + """Search using Tavily (AI Research Search).""" + endpoint = "https://api.tavily.com/search" + + body = { + "api_key": api_key, + "query": query, + "max_results": max_results, + "search_depth": depth, + "topic": topic, + "include_images": include_images, + "include_answer": True, + "include_raw_content": include_raw_content, + } + + if include_domains: + body["include_domains"] = include_domains + if exclude_domains: + body["exclude_domains"] = exclude_domains + + headers = {"Content-Type": "application/json"} + + data = make_request(endpoint, headers, body) + + results = [] + for item in data.get("results", [])[:max_results]: + result = { + "title": item.get("title", ""), + "url": item.get("url", ""), + "snippet": item.get("content", ""), + "score": round(item.get("score", 0.0), 3), + } + if include_raw_content and item.get("raw_content"): + result["raw_content"] = item["raw_content"] + results.append(result) + + return { + "provider": "tavily", + "query": query, + "results": results, + "images": data.get("images", []), + "answer": data.get("answer", ""), + } + + +# 
=============================================================================
# Querit (Multi-lingual search API for AI, with rich metadata and real-time information)
# =============================================================================

def _map_querit_time_range(time_range: Optional[str]) -> Optional[str]:
    """Map generic time ranges to Querit's compact date filter format."""
    if not time_range:
        return None
    # Unknown values pass through unchanged so callers may supply Querit-native
    # codes (e.g. "d7") directly.
    return {
        "day": "d1",
        "week": "w1",
        "month": "m1",
        "year": "y1",
    }.get(time_range, time_range)


def search_querit(
    query: str,
    api_key: str,
    max_results: int = 5,
    language: str = "en",
    country: str = "us",
    time_range: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    base_url: str = "https://api.querit.ai",
    base_path: str = "/v1/search",
    timeout: int = 30,
) -> dict:
    """Search using Querit.

    Mirrors the Querit Python SDK payload shape:
    - query
    - count
    - optional filters: languages, geo, sites, timeRange

    Returns the unified provider dict (provider/query/results/images/answer/
    metadata). Raises ProviderRequestError when Querit reports an error.
    """
    endpoint = base_url.rstrip("/") + base_path

    # Build the optional filters object only from the parameters actually set.
    filters: Dict[str, Any] = {}
    if language:
        filters["languages"] = {"include": [language.lower()]}
    if country:
        filters["geo"] = {"countries": {"include": [country.upper()]}}
    if include_domains or exclude_domains:
        sites: Dict[str, List[str]] = {}
        if include_domains:
            sites["include"] = include_domains
        if exclude_domains:
            sites["exclude"] = exclude_domains
        filters["sites"] = sites

    querit_time_range = _map_querit_time_range(time_range)
    if querit_time_range:
        filters["timeRange"] = {"date": querit_time_range}

    body: Dict[str, Any] = {
        "query": query,
        "count": max_results,
    }
    if filters:
        body["filters"] = filters

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    data = make_request(endpoint, headers, body, timeout=timeout)

    # Querit quirk: a successful response may carry error_code=200, so only
    # None/0/200 count as success; any error_msg is always treated as failure.
    error_code = data.get("error_code")
    error_msg = data.get("error_msg")
    if error_msg or (error_code not in (None, 0, 200)):
        message = error_msg or f"Querit request failed with error_code={error_code}"
        raise ProviderRequestError(message)

    # Response nests hits under results.result; tolerate either level missing.
    raw_results = ((data.get("results") or {}).get("result")) or []
    results = []
    for i, item in enumerate(raw_results[:max_results]):
        snippet = item.get("snippet") or item.get("page_age") or ""
        result = {
            "title": item.get("title") or _title_from_url(item.get("url", "")),
            "url": item.get("url", ""),
            "snippet": snippet,
            # Positional score: Querit does not return a relevance score here.
            "score": round(1.0 - i * 0.05, 3),
        }
        if item.get("page_time") is not None:
            result["page_time"] = item["page_time"]
        if item.get("page_age"):
            result["date"] = item["page_age"]
        if item.get("language") is not None:
            result["language"] = item["language"]
        results.append(result)

    answer = results[0]["snippet"] if results else ""

    return {
        "provider": "querit",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "metadata": {
            "search_id": data.get("search_id"),
            "time_range": querit_time_range,
        }
    }


# =============================================================================
# Exa (Neural/Semantic/Deep Search)
# =============================================================================

def search_exa(
    query: str,
    api_key: str,
    max_results: int = 5,
    search_type: str = "neural",
    exa_depth: str = "normal",
    category: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    similar_url: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    text_verbosity: str = "standard",
) -> dict:
    """Search using Exa (Neural/Semantic/Deep Search).

    exa_depth controls synthesis level:
    - "normal": standard search (neural/fast/auto/keyword/instant)
    - "deep": multi-source synthesis with grounding (4-12s, $12/1k)
    - "deep-reasoning": cross-reference reasoning with grounding (12-50s, $15/1k)

    When similar_url is set the findSimilar endpoint is used instead of
    /search and the deep modes are ignored (findSimilar doesn't support them).
    """
    is_deep = exa_depth in ("deep", "deep-reasoning")

    # Three request shapes: findSimilar, deep synthesis, or standard search.
    if similar_url:
        # findSimilar does not support deep search types
        endpoint = "https://api.exa.ai/findSimilar"
        body: Dict[str, Any] = {
            "url": similar_url,
            "numResults": max_results,
            "contents": {
                "text": {"maxCharacters": 2000, "verbosity": text_verbosity},
                "highlights": {"numSentences": 3, "highlightsPerUrl": 2},
            },
        }
    elif is_deep:
        endpoint = "https://api.exa.ai/search"
        body = {
            "query": query,
            "numResults": max_results,
            "type": exa_depth,
            "contents": {
                # Deep mode always pulls full, longer content for synthesis.
                "text": {"maxCharacters": 5000, "verbosity": "full"},
            },
        }
    else:
        endpoint = "https://api.exa.ai/search"
        body = {
            "query": query,
            "numResults": max_results,
            "type": search_type,
            "contents": {
                "text": {"maxCharacters": 2000, "verbosity": text_verbosity},
                "highlights": {"numSentences": 3, "highlightsPerUrl": 2},
            },
        }

    if category:
        body["category"] = category
    if start_date:
        body["startPublishedDate"] = start_date
    if end_date:
        body["endPublishedDate"] = end_date
    if include_domains:
        body["includeDomains"] = include_domains
    if exclude_domains:
        body["excludeDomains"] = exclude_domains

    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json",
    }

    # Deep synthesis can take up to ~50s per the docstring, so widen timeout.
    timeout = 55 if is_deep else 30
    data = make_request(endpoint, headers, body, timeout=timeout)

    results = []

    # Deep search: primary content in output field with grounding citations
    # (this branch builds its own return dict and exits early).
    if is_deep:
        deep_output = data.get("output", {})
        synthesized_text = ""
        grounding_citations: List[Dict[str, Any]] = []

        if isinstance(deep_output.get("content"), str):
            synthesized_text = deep_output["content"]
        elif isinstance(deep_output.get("content"), dict):
            # Structured content is flattened to JSON text for the snippet.
            synthesized_text = json.dumps(deep_output["content"], ensure_ascii=False)

        for field_citation in deep_output.get("grounding", []):
            for cite in field_citation.get("citations", []):
                grounding_citations.append({
                    "url": cite.get("url", ""),
                    "title": cite.get("title", ""),
                    "confidence": field_citation.get("confidence", ""),
                    "field": field_citation.get("field", ""),
                })

        # Primary synthesized result
        if synthesized_text:
            results.append({
                "title": f"Exa {exa_depth.replace('-', ' ').title()} Synthesis",
                "url": "",
                "snippet": synthesized_text,
                "full_synthesis": synthesized_text,
                "score": 1.0,
                "grounding": grounding_citations[:10],
                "type": "synthesis",
            })

        # Supporting source documents
        for item in data.get("results", [])[:max_results]:
            text_content = item.get("text", "") or ""
            highlights = item.get("highlights", [])
            snippet = text_content[:800] if text_content else (highlights[0] if highlights else "")
            results.append({
                "title": item.get("title", ""),
                "url": item.get("url", ""),
                "snippet": snippet,
                "score": round(item.get("score", 0.0), 3),
                "published_date": item.get("publishedDate"),
                "author": item.get("author"),
                "type": "source",
            })

        # Prefer the synthesis; otherwise fall back to the first source doc
        # (index 1 would be the first source if a synthesis entry existed).
        answer = synthesized_text if synthesized_text else (results[1]["snippet"] if len(results) > 1 else "")

        return {
            "provider": "exa",
            "query": query,
            "exa_depth": exa_depth,
            "results": results,
            "images": [],
            "answer": answer,
            "grounding": grounding_citations,
            "metadata": {
                "synthesis_length": len(synthesized_text),
                "source_count": len(data.get("results", [])),
            },
        }

    # Standard search result parsing
    for item in data.get("results", [])[:max_results]:
        text_content = item.get("text", "") or ""
        highlights = item.get("highlights", [])
        if text_content:
            snippet = text_content[:800]
        elif highlights:
            snippet = " ... ".join(highlights[:2])
        else:
            snippet = ""

        results.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": snippet,
            "score": round(item.get("score", 0.0), 3),
            "published_date": item.get("publishedDate"),
            "author": item.get("author"),
        })

    answer = results[0]["snippet"] if results else ""

    return {
        "provider": "exa",
        "query": query if not similar_url else f"Similar to: {similar_url}",
        "results": results,
        "images": [],
        "answer": answer,
    }


# =============================================================================
# Perplexity via Kilo Gateway (Synthesized Direct Answers)
# =============================================================================

def search_perplexity(
    query: str,
    api_key: str,
    max_results: int = 5,
    model: str = "perplexity/sonar-pro",
    api_url: str = "https://api.kilo.ai/api/gateway/chat/completions",
    freshness: Optional[str] = None,
) -> dict:
    """Search/answer using Perplexity Sonar Pro via Kilo Gateway.

    Args:
        query: Search query
        api_key: Kilo Gateway API key
        max_results: Maximum results to return
        model: Perplexity model to use
        api_url: Kilo Gateway endpoint
        freshness: Filter by recency — 'day', 'week', 'month', 'year' (maps to
            Perplexity's search_recency_filter parameter)
    """
    # Map generic freshness values to Perplexity's search_recency_filter
    # (pd/pw/pm/py are accepted as Google-style aliases).
    recency_map = {"day": "day", "pd": "day", "week": "week", "pw": "week", "month": "month", "pm": "month", "year": "year", "py": "year"}
    recency_filter = recency_map.get(freshness or "", None)

    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": "Answer with concise factual summary and include source URLs."},
            {"role": "user", "content": query},
        ],
        "temperature": 0.2,
    }
    if recency_filter:
        body["search_recency_filter"] = recency_filter

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    data = make_request(api_url, headers, body)
    choices = data.get("choices", [])
    message = choices[0].get("message", {}) if choices else {}
    answer = (message.get("content") or "").strip()

    # Prefer the structured citations array from Perplexity API response
    api_citations = data.get("citations", [])

    # Fallback: extract URLs from answer text if API doesn't provide citations
    if not api_citations:
        api_citations = []
        seen = set()
        for u in re.findall(r"https?://[^\s)\]}>\"']+", answer):
            if u not in seen:
                seen.add(u)
                api_citations.append(u)

    results = []

    # Primary result: the synthesized answer itself
    if answer:
        # Clean citation markers [1][2] for the snippet
        clean_answer = re.sub(r'\[\d+\]', '', answer).strip()
        results.append({
            "title": f"Perplexity Answer: {query[:80]}",
            "url": "https://www.perplexity.ai",
            "snippet": clean_answer[:500],
            "score": 1.0,
        })

    # Source results from citations
    # (max_results - 1 leaves room for the primary answer entry above).
    for i, citation in enumerate(api_citations[:max_results - 1]):
        # citations can be plain URL strings or dicts with url/title
        if isinstance(citation, str):
            url = citation
            title = _title_from_url(url)
        else:
            url = citation.get("url", "")
            title = citation.get("title") or _title_from_url(url)
        results.append({
            "title": title,
            "url": url,
            "snippet": f"Source cited in Perplexity answer [citation {i+1}]",
            "score": round(0.9 - i * 0.1, 3),
        })

    return {
        "provider": "perplexity",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "metadata": {
            "model": model,
            "usage": data.get("usage", {}),
        }
    }



# =============================================================================
# You.com (LLM-Ready Web & News Search)
# =============================================================================

def search_you(
    query: str,
    api_key: str,
    max_results: int = 5,
    country: str = "US",
    language: str = "en",
    freshness: Optional[str] = None,
    safesearch: str = "moderate",
    include_news: bool = True,
    livecrawl: Optional[str] = None,
) -> dict:
    """Search using You.com (LLM-Ready Web & News Search).

    You.com excels at:
    - RAG applications with pre-extracted snippets
    - Combined web + news results in one call
    - Real-time information with automatic news classification
    - Clean, structured JSON optimized for AI consumption

    Args:
        query: Search query
        api_key: You.com API key
        max_results: Maximum results to return (default 5, max 100)
        country: ISO 3166-2 country code (e.g., US, GB, DE)
        language: BCP 47 language code (e.g., en, de, fr)
        freshness: Filter by recency: day, week, month, year, or YYYY-MM-DDtoYYYY-MM-DD
        safesearch: Content filter: off, moderate (default), strict
        include_news: Include news results when relevant (default True)
        livecrawl: Fetch full page content: "web", "news", or "all"
    """
    endpoint = "https://ydc-index.io/v1/search"

    # Build query parameters
    params = {
        "query": query,
        "count": max_results,
        "safesearch": safesearch,
    }

    if country:
        params["country"] = country.upper()
    if language:
        # NOTE(review): BCP 47 codes are conventionally lowercase ("en");
        # confirm You.com actually expects the uppercased form sent here.
        params["language"] = language.upper()
    if freshness:
        params["freshness"] = freshness
    if livecrawl:
        params["livecrawl"] = livecrawl
        params["livecrawl_formats"] = "markdown"

    # Build URL with query params (URL-encode values)
    query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
    url = f"{endpoint}?{query_string}"

    headers = {
        "X-API-KEY": api_key,
        "Accept": "application/json",
        "User-Agent": "ClawdBot-WebSearchPlus/2.4",
    }

    # Make GET request (You.com uses GET, not POST)
    # NOTE(review): Request/urlopen appear to be available at module level
    # (make_request uses them); this local import is likely redundant.
    from urllib.request import Request, urlopen
    req = Request(url, headers=headers, method="GET")

    try:
        with urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        error_body = e.read().decode("utf-8") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
            error_detail = error_json.get("error") or error_json.get("message") or error_body
        except json.JSONDecodeError:
            error_detail = error_body[:500]

        error_messages = {
            401: "Invalid or expired API key. Get one at https://api.you.com",
            403: "Access forbidden. Check your API key permissions.",
            429: "Rate limit exceeded. Please wait and try again.",
            500: "You.com server error. Try again later.",
            503: "You.com service unavailable."
        }
        friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
        raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(f"Network error: {reason}. Check your internet connection.", transient=is_timeout)
    except TimeoutError:
        raise ProviderRequestError("You.com request timed out after 30s.", transient=True)

    # Parse results
    results_data = data.get("results", {})
    web_results = results_data.get("web", [])
    news_results = results_data.get("news", []) if include_news else []
    metadata = data.get("metadata", {})

    # Normalize web results
    results = []
    for i, item in enumerate(web_results[:max_results]):
        snippets = item.get("snippets", [])
        snippet = snippets[0] if snippets else item.get("description", "")

        result = {
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": snippet,
            "score": round(1.0 - i * 0.05, 3),  # Assign descending score
            "date": item.get("page_age"),
            "source": "web",
        }

        # Include additional snippets if available (great for RAG)
        if len(snippets) > 1:
            result["additional_snippets"] = snippets[1:3]

        # Include thumbnail and favicon for UI display
        if item.get("thumbnail_url"):
            result["thumbnail"] = item["thumbnail_url"]
        if item.get("favicon_url"):
            result["favicon"] = item["favicon_url"]

        # Include live-crawled content if available
        if item.get("contents"):
            result["raw_content"] = item["contents"].get("markdown") or item["contents"].get("html", "")

        results.append(result)

    # Add news results (if any)
    news = []
    for item in news_results[:5]:
        news.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("description", ""),
            "date": item.get("page_age"),
            "thumbnail": item.get("thumbnail_url"),
            "source": "news",
        })

    # Build answer from best snippets
    answer = ""
    if results:
        # Combine top snippets for LLM context
        top_snippets = []
        for r in results[:3]:
            if r.get("snippet"):
                top_snippets.append(r["snippet"])
        answer = " ".join(top_snippets)[:1000]

    return {
        "provider": "you",
        "query": query,
        "results": results,
        "news": news,
        "images": [],
        "answer": answer,
        "metadata": {
            "search_uuid": metadata.get("search_uuid"),
            "latency": metadata.get("latency"),
        }
    }


# =============================================================================
# SearXNG (Privacy-First Meta-Search)
# =============================================================================

def search_searxng(
    query: str,
    instance_url: str,
    max_results: int = 5,
    categories: Optional[List[str]] = None,
    engines: Optional[List[str]] = None,
    language: str = "en",
    time_range: Optional[str] = None,
    safesearch: int = 0,
) -> dict:
    """Search using SearXNG (self-hosted privacy-first meta-search).

    SearXNG excels at:
    - Privacy-preserving search (no tracking, no profiling)
    - Multi-source aggregation (70+ upstream engines)
    - $0 API cost (self-hosted)
    - Diverse perspectives from multiple search engines

    Args:
        query: Search query
        instance_url: URL of your SearXNG instance (required)
        max_results: Maximum results to return (default 5)
        categories: Search categories (general, images, news, videos, etc.)
        engines: Specific engines to use (google, bing, duckduckgo, etc.)
        language: Language code (e.g., en, de, fr)
        time_range: Filter by recency: day, week, month, year
        safesearch: Content filter: 0=off, 1=moderate, 2=strict

    Note:
        Requires a self-hosted SearXNG instance with JSON format enabled.
        See: https://docs.searxng.org/admin/installation.html
    """
    # Build URL with query parameters
    params = {
        "q": query,
        "format": "json",
        "language": language,
        "safesearch": str(safesearch),
    }

    if categories:
        params["categories"] = ",".join(categories)
    if engines:
        params["engines"] = ",".join(engines)
    if time_range:
        params["time_range"] = time_range

    # Build URL — instance_url comes from operator-controlled config/env only
    # (validated by _validate_searxng_url), not from agent/LLM input
    base_url = instance_url.rstrip("/")
    query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
    url = f"{base_url}/search?{query_string}"

    headers = {
        "User-Agent": "ClawdBot-WebSearchPlus/2.5",
        "Accept": "application/json",
    }

    # Make GET request
    req = Request(url, headers=headers, method="GET")

    try:
        with urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        error_body = e.read().decode("utf-8") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
            error_detail = error_json.get("error") or error_json.get("message") or error_body
        except json.JSONDecodeError:
            error_detail = error_body[:500]

        error_messages = {
            403: "JSON API disabled on this SearXNG instance. Enable 'json' in search.formats in settings.yml",
            404: "SearXNG instance not found. Check your instance URL.",
            500: "SearXNG server error. Check instance health.",
            503: "SearXNG service unavailable."
        }
        friendly_msg = error_messages.get(e.code, f"SearXNG error: {error_detail}")
        raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(f"Cannot reach SearXNG instance at {instance_url}. Error: {reason}", transient=is_timeout)
    except TimeoutError:
        raise ProviderRequestError(f"SearXNG request timed out after 30s. Check instance health.", transient=True)

    # Parse results
    raw_results = data.get("results", [])

    # Normalize results to unified format
    results = []
    engines_used = set()
    for i, item in enumerate(raw_results[:max_results]):
        engine = item.get("engine", "unknown")
        engines_used.add(engine)

        results.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("content", ""),
            # Use the engine-reported score when present; fall back to position.
            "score": round(item.get("score", 1.0 - i * 0.05), 3),
            "engine": engine,
            "category": item.get("category", "general"),
            "date": item.get("publishedDate"),
        })

    # Build answer from answers, infoboxes, or first result
    answer = ""
    if data.get("answers"):
        answer = data["answers"][0] if isinstance(data["answers"][0], str) else str(data["answers"][0])
    elif data.get("infoboxes"):
        infobox = data["infoboxes"][0]
        answer = infobox.get("content", "") or infobox.get("infobox", "")
    elif results:
        answer = results[0]["snippet"]

    return {
        "provider": "searxng",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "suggestions": data.get("suggestions", []),
        "corrections": data.get("corrections", []),
        "metadata": {
            "number_of_results": data.get("number_of_results"),
            "engines_used": list(engines_used),
            "instance_url": instance_url,
        }
    }


# =============================================================================
# CLI
# 
============================================================================= + +def main(): + config = load_config() + + parser = argparse.ArgumentParser( + description="Web Search Plus β€” Intelligent multi-provider search with smart auto-routing", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Intelligent Auto-Routing: + The query is analyzed using multi-signal detection to find the optimal provider: + + Shopping Intent β†’ Serper (Google) + "how much", "price of", "buy", product+brand combos, deals, specs + + Research Intent β†’ Tavily + "how does", "explain", "what is", analysis, pros/cons, tutorials + + Multilingual + Real-Time AI Search β†’ Querit + multilingual search, metadata-rich results, current information for AI workflows + + Discovery Intent β†’ Exa (Neural) + "similar to", "companies like", "alternatives", URLs, startups, papers + + Direct Answer Intent β†’ Perplexity (via Kilo Gateway) + "what is", "current status", local events, synthesized up-to-date answers + +Examples: + python3 search.py -q "iPhone 16 Pro Max price" # β†’ Serper (shopping) + python3 search.py -q "how does HTTPS encryption work" # β†’ Tavily (research) + python3 search.py -q "startups similar to Notion" # β†’ Exa (discovery) + python3 search.py --explain-routing -q "your query" # Debug routing + +Full docs: See README.md and SKILL.md + """, + ) + + # Common arguments + parser.add_argument( + "--provider", "-p", + choices=["serper", "tavily", "querit", "exa", "perplexity", "you", "searxng", "auto"], + help="Search provider (auto=intelligent routing)" + ) + parser.add_argument( + "--query", "-q", + help="Search query" + ) + parser.add_argument( + "--max-results", "-n", + type=int, + default=config.get("defaults", {}).get("max_results", 5), + help="Maximum results (default: 5)" + ) + parser.add_argument( + "--images", + action="store_true", + help="Include images (Serper/Tavily)" + ) + + # Auto-routing options + parser.add_argument( + "--auto", "-a", + 
action="store_true", + help="Use intelligent auto-routing (default when no provider specified)" + ) + parser.add_argument( + "--explain-routing", + action="store_true", + help="Show detailed routing analysis (debug mode)" + ) + + # Serper-specific + serper_config = config.get("serper", {}) + parser.add_argument("--country", default=serper_config.get("country", "us")) + parser.add_argument("--language", default=serper_config.get("language", "en")) + parser.add_argument( + "--type", + dest="search_type", + default=serper_config.get("type", "search"), + choices=["search", "news", "images", "videos", "places", "shopping"] + ) + parser.add_argument( + "--time-range", + choices=["hour", "day", "week", "month", "year"] + ) + + # Tavily-specific + tavily_config = config.get("tavily", {}) + parser.add_argument( + "--depth", + default=tavily_config.get("depth", "basic"), + choices=["basic", "advanced"] + ) + parser.add_argument( + "--topic", + default=tavily_config.get("topic", "general"), + choices=["general", "news"] + ) + parser.add_argument("--raw-content", action="store_true") + + # Querit-specific + querit_config = config.get("querit", {}) + parser.add_argument( + "--querit-base-url", + default=querit_config.get("base_url", "https://api.querit.ai"), + help="Querit API base URL" + ) + parser.add_argument( + "--querit-base-path", + default=querit_config.get("base_path", "/v1/search"), + help="Querit API path" + ) + + # Exa-specific + exa_config = config.get("exa", {}) + parser.add_argument( + "--exa-type", + default=exa_config.get("type", "neural"), + choices=["neural", "fast", "auto", "keyword", "instant"], + help="Exa search type (for standard search, ignored when --exa-depth is set)" + ) + parser.add_argument( + "--exa-depth", + default=exa_config.get("depth", "normal"), + choices=["normal", "deep", "deep-reasoning"], + help="Exa search depth: deep (synthesized, 4-12s), deep-reasoning (cross-reference, 12-50s)" + ) + parser.add_argument( + "--exa-verbosity", + 
default=exa_config.get("verbosity", "standard"), + choices=["compact", "standard", "full"], + help="Exa text verbosity for content extraction" + ) + parser.add_argument( + "--category", + choices=[ + "company", "research paper", "news", "pdf", "github", + "tweet", "personal site", "linkedin profile" + ] + ) + parser.add_argument("--start-date") + parser.add_argument("--end-date") + parser.add_argument("--similar-url") + + # You.com-specific + you_config = config.get("you", {}) + parser.add_argument( + "--you-safesearch", + default=you_config.get("safesearch", "moderate"), + choices=["off", "moderate", "strict"], + help="You.com SafeSearch filter" + ) + parser.add_argument( + "--freshness", + choices=["day", "week", "month", "year"], + help="Filter results by recency (You.com/Serper)" + ) + parser.add_argument( + "--livecrawl", + choices=["web", "news", "all"], + help="You.com: fetch full page content" + ) + parser.add_argument( + "--no-news", + action="store_true", + help="You.com: exclude news results (included by default)" + ) + + # SearXNG-specific + searxng_config = config.get("searxng", {}) + parser.add_argument( + "--searxng-url", + default=searxng_config.get("instance_url"), + help="SearXNG instance URL (e.g., https://searx.example.com)" + ) + parser.add_argument( + "--searxng-safesearch", + type=int, + default=searxng_config.get("safesearch", 0), + choices=[0, 1, 2], + help="SearXNG SafeSearch: 0=off, 1=moderate, 2=strict" + ) + parser.add_argument( + "--engines", + nargs="+", + default=searxng_config.get("engines"), + help="SearXNG: specific engines to use (e.g., google bing duckduckgo)" + ) + parser.add_argument( + "--categories", + nargs="+", + help="SearXNG: search categories (general, images, news, videos, etc.)" + ) + + # Domain filters + parser.add_argument("--include-domains", nargs="+") + parser.add_argument("--exclude-domains", nargs="+") + + # Output + parser.add_argument("--compact", action="store_true") + + # Caching options + 
parser.add_argument( + "--cache-ttl", + type=int, + default=DEFAULT_CACHE_TTL, + help=f"Cache TTL in seconds (default: {DEFAULT_CACHE_TTL} = 1 hour)" + ) + parser.add_argument( + "--no-cache", + action="store_true", + help="Bypass cache (always fetch fresh results)" + ) + parser.add_argument( + "--clear-cache", + action="store_true", + help="Clear all cached results and exit" + ) + parser.add_argument( + "--cache-stats", + action="store_true", + help="Show cache statistics and exit" + ) + + args = parser.parse_args() + + # Handle cache management commands first (before query validation) + if args.clear_cache: + result = cache_clear() + indent = None if args.compact else 2 + print(json.dumps(result, indent=indent, ensure_ascii=False)) + return + + if args.cache_stats: + result = cache_stats() + indent = None if args.compact else 2 + print(json.dumps(result, indent=indent, ensure_ascii=False)) + return + + if not args.query and not args.similar_url: + parser.error("--query is required (unless using --similar-url with Exa)") + + # Handle --explain-routing + if args.explain_routing: + if not args.query: + parser.error("--query is required for --explain-routing") + explanation = explain_routing(args.query, config) + indent = None if args.compact else 2 + print(json.dumps(explanation, indent=indent, ensure_ascii=False)) + return + + # Determine provider + if args.provider == "auto" or (args.provider is None and not args.similar_url): + if args.query: + routing = auto_route_provider(args.query, config) + provider = routing["provider"] + routing_info = { + "auto_routed": True, + "provider": provider, + "confidence": routing["confidence"], + "confidence_level": routing["confidence_level"], + "reason": routing["reason"], + "top_signals": routing["top_signals"], + "scores": routing["scores"], + } + else: + provider = "exa" + routing_info = { + "auto_routed": True, + "provider": "exa", + "confidence": 1.0, + "confidence_level": "high", + "reason": "similar_url_specified", + } 
+ else: + provider = args.provider or "serper" + routing_info = {"auto_routed": False, "provider": provider} + + # Build provider fallback list + auto_config = config.get("auto_routing", {}) + provider_priority = auto_config.get("provider_priority", ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"]) + disabled_providers = auto_config.get("disabled_providers", []) + + # Start with the selected provider, then try others in priority order + # Only include providers that have a configured API key (except the primary, + # which gets a clear error if unconfigured and no fallback succeeds) + providers_to_try = [provider] + for p in provider_priority: + if p not in providers_to_try and p not in disabled_providers and get_api_key(p, config): + providers_to_try.append(p) + + # Skip providers currently in cooldown + eligible_providers = [] + cooldown_skips = [] + for p in providers_to_try: + in_cd, remaining = provider_in_cooldown(p) + if in_cd: + cooldown_skips.append({"provider": p, "cooldown_remaining_seconds": remaining}) + else: + eligible_providers.append(p) + + if not eligible_providers: + eligible_providers = providers_to_try[:1] + + # Helper function to execute search for a provider + def execute_search(prov: str) -> Dict[str, Any]: + key = validate_api_key(prov, config) + if prov == "serper": + return search_serper( + query=args.query, + api_key=key, + max_results=args.max_results, + country=args.country, + language=args.language, + search_type=args.search_type, + time_range=args.time_range, + include_images=args.images, + ) + elif prov == "tavily": + return search_tavily( + query=args.query, + api_key=key, + max_results=args.max_results, + depth=args.depth, + topic=args.topic, + include_domains=args.include_domains, + exclude_domains=args.exclude_domains, + include_images=args.images, + include_raw_content=args.raw_content, + ) + elif prov == "querit": + return search_querit( + query=args.query, + api_key=key, + max_results=args.max_results, + 
language=args.language, + country=args.country, + time_range=args.time_range or args.freshness, + include_domains=args.include_domains, + exclude_domains=args.exclude_domains, + base_url=args.querit_base_url, + base_path=args.querit_base_path, + timeout=int(querit_config.get("timeout", 30)), + ) + elif prov == "exa": + # CLI --exa-depth overrides; fallback to auto-routing suggestion + exa_depth = args.exa_depth + if exa_depth == "normal" and routing_info.get("exa_depth") in ("deep", "deep-reasoning"): + exa_depth = routing_info["exa_depth"] + return search_exa( + query=args.query or "", + api_key=key, + max_results=args.max_results, + search_type=args.exa_type, + exa_depth=exa_depth, + category=args.category, + start_date=args.start_date, + end_date=args.end_date, + similar_url=args.similar_url, + include_domains=args.include_domains, + exclude_domains=args.exclude_domains, + text_verbosity=args.exa_verbosity, + ) + elif prov == "perplexity": + perplexity_config = config.get("perplexity", {}) + return search_perplexity( + query=args.query, + api_key=key, + max_results=args.max_results, + model=perplexity_config.get("model", "perplexity/sonar-pro"), + api_url=perplexity_config.get("api_url", "https://api.kilo.ai/api/gateway/chat/completions"), + freshness=getattr(args, "freshness", None), + ) + elif prov == "you": + return search_you( + query=args.query, + api_key=key, + max_results=args.max_results, + country=args.country, + language=args.language, + freshness=args.freshness, + safesearch=args.you_safesearch, + include_news=not args.no_news, + livecrawl=args.livecrawl, + ) + elif prov == "searxng": + # For SearXNG, 'key' is actually the instance URL + instance_url = args.searxng_url or key + if instance_url: + instance_url = _validate_searxng_url(instance_url) + return search_searxng( + query=args.query, + instance_url=instance_url, + max_results=args.max_results, + categories=args.categories, + engines=args.engines, + language=args.language, + 
time_range=args.time_range, + safesearch=args.searxng_safesearch, + ) + else: + raise ValueError(f"Unknown provider: {prov}") + + def execute_with_retry(prov: str) -> Dict[str, Any]: + last_error = None + for attempt in range(0, 3): + try: + return execute_search(prov) + except ProviderRequestError as e: + last_error = e + if e.status_code in {401, 403}: + break + if not e.transient: + break + if attempt < 2: + time.sleep(RETRY_BACKOFF_SECONDS[attempt]) + continue + break + except Exception as e: + last_error = e + break + raise last_error if last_error else Exception("Unknown provider execution error") + + cache_context = { + "locale": f"{args.country}:{args.language}", + "freshness": args.freshness, + "time_range": args.time_range, + "include_domains": sorted(args.include_domains) if args.include_domains else None, + "exclude_domains": sorted(args.exclude_domains) if args.exclude_domains else None, + "topic": args.topic, + "search_engines": sorted(args.engines) if args.engines else None, + "include_news": not args.no_news, + "search_type": args.search_type, + "exa_type": args.exa_type, + "exa_depth": args.exa_depth, + "exa_verbosity": args.exa_verbosity, + "category": args.category, + "similar_url": args.similar_url, + } + + # Check cache first (unless --no-cache is set) + cached_result = None + cache_hit = False + if not args.no_cache and args.query: + cached_result = cache_get( + query=args.query, + provider=provider, + max_results=args.max_results, + ttl=args.cache_ttl, + params=cache_context, + ) + if cached_result: + cache_hit = True + result = {k: v for k, v in cached_result.items() if not k.startswith("_cache_")} + result["cached"] = True + result["cache_age_seconds"] = int(time.time() - cached_result.get("_cache_timestamp", 0)) + + errors = [] + successful_provider = None + successful_results: List[Tuple[str, Dict[str, Any]]] = [] + result = None if not cache_hit else result + + for idx, current_provider in enumerate(eligible_providers): + if cache_hit: + 
successful_provider = provider + break + try: + provider_result = execute_with_retry(current_provider) + reset_provider_health(current_provider) + successful_results.append((current_provider, provider_result)) + successful_provider = current_provider + + # If we have enough results, stop. + if len(provider_result.get("results", [])) >= args.max_results: + break + + # Only continue collecting from lower-priority providers when fallback was needed. + if not errors: + break + except Exception as e: + error_msg = str(e) + cooldown_info = mark_provider_failure(current_provider, error_msg) + errors.append({ + "provider": current_provider, + "error": error_msg, + "cooldown_seconds": cooldown_info.get("cooldown_seconds"), + }) + if len(eligible_providers) > 1: + remaining = eligible_providers[idx + 1:] + if remaining: + print(json.dumps({ + "fallback": True, + "failed_provider": current_provider, + "error": error_msg, + "trying_next": remaining[0], + }), file=sys.stderr) + continue + + if successful_results: + if len(successful_results) == 1: + result = successful_results[0][1] + else: + primary = successful_results[0][1].copy() + deduped_results, dedup_count = deduplicate_results_across_providers(successful_results, args.max_results) + primary["results"] = deduped_results + primary["deduplicated"] = dedup_count > 0 + primary.setdefault("metadata", {}) + primary["metadata"]["dedup_count"] = dedup_count + primary["metadata"]["providers_merged"] = [p for p, _ in successful_results] + result = primary + + if result is not None: + if successful_provider != provider: + routing_info["fallback_used"] = True + routing_info["original_provider"] = provider + routing_info["provider"] = successful_provider + routing_info["fallback_errors"] = errors + + if cooldown_skips: + routing_info["cooldown_skips"] = cooldown_skips + + result["routing"] = routing_info + + if not cache_hit and not args.no_cache and args.query: + cache_put( + query=args.query, + provider=successful_provider or 
provider, + max_results=args.max_results, + result=result, + params=cache_context, + ) + + result["cached"] = bool(cache_hit) + if "deduplicated" not in result: + result["deduplicated"] = False + result.setdefault("metadata", {}) + result["metadata"].setdefault("dedup_count", 0) + + indent = None if args.compact else 2 + print(json.dumps(result, indent=indent, ensure_ascii=False)) + else: + error_result = { + "error": "All providers failed", + "provider": provider, + "query": args.query, + "routing": routing_info, + "provider_errors": errors, + "cooldown_skips": cooldown_skips, + } + print(json.dumps(error_result, indent=2), file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/setup.py b/scripts/setup.py new file mode 100644 index 0000000..ec89d44 --- /dev/null +++ b/scripts/setup.py @@ -0,0 +1,453 @@ +#!/usr/bin/env python3 +""" +Web Search Plus - Interactive Setup Wizard +========================================== + +Runs on first use (when no config.json exists) to configure providers and API keys. +Creates config.json with your settings. API keys are stored locally only. 
+ +Usage: + python3 scripts/setup.py # Interactive setup + python3 scripts/setup.py --reset # Reset and reconfigure +""" + +import json +import os +import sys +from pathlib import Path + +# ANSI colors for terminal output +class Colors: + HEADER = '\033[95m' + BLUE = '\033[94m' + CYAN = '\033[96m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + BOLD = '\033[1m' + DIM = '\033[2m' + RESET = '\033[0m' + +def color(text: str, c: str) -> str: + """Wrap text in color codes.""" + return f"{c}{text}{Colors.RESET}" + +def print_header(): + """Print the setup wizard header.""" + print() + print(color("╔════════════════════════════════════════════════════════════╗", Colors.CYAN)) + print(color("β•‘ πŸ” Web Search Plus - Setup Wizard β•‘", Colors.CYAN)) + print(color("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•", Colors.CYAN)) + print() + print(color("This wizard will help you configure your search providers.", Colors.DIM)) + print(color("API keys are stored locally in config.json (gitignored).", Colors.DIM)) + print() + +def print_provider_info(): + """Print information about each provider.""" + print(color("πŸ“š Available Providers:", Colors.BOLD)) + print() + + providers = [ + { + "name": "Serper", + "emoji": "πŸ”Ž", + "best_for": "Google results, shopping, local businesses, news", + "free_tier": "2,500 queries/month", + "signup": "https://serper.dev", + "strengths": ["Fastest response times", "Product prices & specs", "Knowledge Graph", "Local business data"] + }, + { + "name": "Tavily", + "emoji": "πŸ“–", + "best_for": "Research, explanations, in-depth analysis", + "free_tier": "1,000 queries/month", + "signup": "https://tavily.com", + "strengths": ["AI-synthesized answers", "Full page content", "Domain filtering", "Academic research"] + }, + { + "name": "Exa", + "emoji": "🧠", + "best_for": "Semantic search, 
finding similar content, discovery", + "free_tier": "1,000 queries/month", + "signup": "https://exa.ai", + "strengths": ["Neural/semantic understanding", "Similar page discovery", "Startup/company finder", "Date filtering"] + }, + { + "name": "You.com", + "emoji": "πŸ€–", + "best_for": "RAG applications, real-time info, LLM-ready snippets", + "free_tier": "Limited free tier", + "signup": "https://api.you.com", + "strengths": ["LLM-ready snippets", "Combined web + news", "Live page crawling", "Real-time information"] + }, + { + "name": "SearXNG", + "emoji": "πŸ”’", + "best_for": "Privacy-first search, multi-source aggregation, $0 API cost", + "free_tier": "FREE (self-hosted)", + "signup": "https://docs.searxng.org/admin/installation.html", + "strengths": ["Privacy-preserving (no tracking)", "70+ search engines", "Self-hosted = $0 API cost", "Diverse results"] + } + ] + + for p in providers: + print(f" {p['emoji']} {color(p['name'], Colors.BOLD)}") + print(f" Best for: {color(p['best_for'], Colors.GREEN)}") + print(f" Free tier: {p['free_tier']}") + print(f" Sign up: {color(p['signup'], Colors.BLUE)}") + print() + +def ask_yes_no(prompt: str, default: bool = True) -> bool: + """Ask a yes/no question.""" + suffix = "[Y/n]" if default else "[y/N]" + while True: + response = input(f"{prompt} {color(suffix, Colors.DIM)}: ").strip().lower() + if response == "": + return default + if response in ("y", "yes"): + return True + if response in ("n", "no"): + return False + print(color(" Please enter 'y' or 'n'", Colors.YELLOW)) + +def ask_choice(prompt: str, options: list, default: str = None) -> str: + """Ask user to choose from a list of options.""" + print(f"\n{prompt}") + for i, opt in enumerate(options, 1): + marker = color("β†’", Colors.GREEN) if opt == default else " " + print(f" {marker} {i}. 
{opt}") + + while True: + hint = f" [default: {default}]" if default else "" + response = input(f"Enter number (1-{len(options)}){color(hint, Colors.DIM)}: ").strip() + + if response == "" and default: + return default + + try: + idx = int(response) + if 1 <= idx <= len(options): + return options[idx - 1] + except ValueError: + pass + + print(color(f" Please enter a number between 1 and {len(options)}", Colors.YELLOW)) + +def ask_api_key(provider: str, signup_url: str) -> str: + """Ask for an API key with validation.""" + print() + print(f" {color(f'Get your {provider} API key:', Colors.DIM)} {color(signup_url, Colors.BLUE)}") + + while True: + key = input(f" Enter your {provider} API key: ").strip() + + if not key: + print(color(" ⚠️ No key entered. This provider will be disabled.", Colors.YELLOW)) + return None + + # Basic validation + if len(key) < 10: + print(color(" ⚠️ Key seems too short. Please check and try again.", Colors.YELLOW)) + continue + + # Mask key for confirmation + masked = key[:4] + "..." + key[-4:] if len(key) > 12 else key[:2] + "..." + print(color(f" βœ“ Key saved: {masked}", Colors.GREEN)) + return key + + +def ask_searxng_instance(docs_url: str) -> str: + """Ask for SearXNG instance URL with connection test.""" + print() + print(f" {color('SearXNG is self-hosted. You need your own instance.', Colors.DIM)}") + print(f" {color('Setup guide:', Colors.DIM)} {color(docs_url, Colors.BLUE)}") + print() + print(f" {color('Example URLs:', Colors.DIM)}") + print(f" β€’ http://localhost:8080 (local Docker)") + print(f" β€’ https://searx.your-domain.com (self-hosted)") + print() + + while True: + url = input(f" Enter your SearXNG instance URL: ").strip() + + if not url: + print(color(" ⚠️ No URL entered. 
SearXNG will be disabled.", Colors.YELLOW)) + return None + + # Basic URL validation + if not url.startswith(("http://", "https://")): + print(color(" ⚠️ URL must start with http:// or https://", Colors.YELLOW)) + continue + + # SSRF protection: validate URL before connecting + try: + import ipaddress + import socket + from urllib.parse import urlparse as _urlparse + _parsed = _urlparse(url) + _hostname = _parsed.hostname or "" + _blocked = {"169.254.169.254", "metadata.google.internal", "metadata.internal"} + if _hostname in _blocked: + print(color(f" ❌ Blocked: {_hostname} is a cloud metadata endpoint.", Colors.RED)) + continue + if not os.environ.get("SEARXNG_ALLOW_PRIVATE", "").strip() == "1": + _resolved = socket.getaddrinfo(_hostname, _parsed.port or 80, proto=socket.IPPROTO_TCP) + for _fam, _t, _p, _cn, _sa in _resolved: + _ip = ipaddress.ip_address(_sa[0]) + if _ip.is_loopback or _ip.is_private or _ip.is_link_local or _ip.is_reserved: + print(color(f" ❌ Blocked: {_hostname} resolves to private IP {_ip}.", Colors.RED)) + print(color(f" Set SEARXNG_ALLOW_PRIVATE=1 if intentional.", Colors.DIM)) + raise ValueError("private_ip") + except ValueError as _ve: + if str(_ve) == "private_ip": + continue + raise + except socket.gaierror: + print(color(f" ❌ Cannot resolve hostname: {_hostname}", Colors.RED)) + continue + + # Test connection + print(color(f" Testing connection to {url}...", Colors.DIM)) + try: + import urllib.request + import urllib.error + + test_url = f"{url.rstrip('/')}/search?q=test&format=json" + req = urllib.request.Request( + test_url, + headers={"User-Agent": "ClawdBot-WebSearchPlus/2.5", "Accept": "application/json"} + ) + + with urllib.request.urlopen(req, timeout=10) as response: + data = response.read().decode("utf-8") + import json + result = json.loads(data) + + # Check if it looks like SearXNG JSON response + if "results" in result or "query" in result: + print(color(f" βœ“ Connection successful! 
SearXNG instance is working.", Colors.GREEN)) + return url.rstrip("/") + else: + print(color(f" ⚠️ Connected but response doesn't look like SearXNG JSON.", Colors.YELLOW)) + if ask_yes_no(" Use this URL anyway?", default=False): + return url.rstrip("/") + + except urllib.error.HTTPError as e: + if e.code == 403: + print(color(f" ⚠️ JSON API is disabled (403 Forbidden).", Colors.YELLOW)) + print(color(f" Enable JSON in settings.yml: search.formats: [html, json]", Colors.DIM)) + else: + print(color(f" ⚠️ HTTP error: {e.code} {e.reason}", Colors.YELLOW)) + + if ask_yes_no(" Try a different URL?", default=True): + continue + return None + + except urllib.error.URLError as e: + print(color(f" ⚠️ Cannot reach instance: {e.reason}", Colors.YELLOW)) + if ask_yes_no(" Try a different URL?", default=True): + continue + return None + + except Exception as e: + print(color(f" ⚠️ Error: {e}", Colors.YELLOW)) + if ask_yes_no(" Try a different URL?", default=True): + continue + return None + +def ask_result_count() -> int: + """Ask for default result count.""" + options = ["3 (fast, minimal)", "5 (balanced - recommended)", "10 (comprehensive)"] + choice = ask_choice("Default number of results per search?", options, "5 (balanced - recommended)") + + if "3" in choice: + return 3 + elif "10" in choice: + return 10 + return 5 + +def run_setup(skill_dir: Path, force_reset: bool = False): + """Run the interactive setup wizard.""" + config_path = skill_dir / "config.json" + example_path = skill_dir / "config.example.json" + + # Check if config already exists + if config_path.exists() and not force_reset: + print(color("βœ“ config.json already exists!", Colors.GREEN)) + print() + if not ask_yes_no("Do you want to reconfigure?", default=False): + print(color("Setup cancelled. 
Your existing config is unchanged.", Colors.DIM)) + return False + print() + + print_header() + print_provider_info() + + # Load example config as base + if example_path.exists(): + with open(example_path) as f: + config = json.load(f) + else: + config = { + "defaults": {"provider": "serper", "max_results": 5}, + "auto_routing": {"enabled": True, "fallback_provider": "serper"}, + "serper": {}, + "tavily": {}, + "exa": {} + } + + # Remove any existing API keys from example + for provider in ["serper", "tavily", "exa"]: + if provider in config: + config[provider].pop("api_key", None) + + enabled_providers = [] + + # ===== Question 1: Which providers to enable ===== + print(color("─" * 60, Colors.DIM)) + print(color("\nπŸ“‹ Step 1: Choose Your Providers\n", Colors.BOLD)) + print("Select which search providers you want to enable.") + print(color("(You need at least one API key to use this skill)", Colors.DIM)) + print() + + providers_info = { + "serper": ("Serper", "https://serper.dev", "Google results, shopping, local"), + "tavily": ("Tavily", "https://tavily.com", "Research, explanations, analysis"), + "exa": ("Exa", "https://exa.ai", "Semantic search, similar content"), + "you": ("You.com", "https://api.you.com", "RAG applications, real-time info"), + "searxng": ("SearXNG", "https://docs.searxng.org/admin/installation.html", "Privacy-first, self-hosted, $0 cost") + } + + for provider, (name, url, desc) in providers_info.items(): + print(f" {color(name, Colors.BOLD)}: {desc}") + + # Special handling for SearXNG + if provider == "searxng": + print(color(" Note: SearXNG requires a self-hosted instance (no API key needed)", Colors.DIM)) + if ask_yes_no(f" Do you have a SearXNG instance?", default=False): + instance_url = ask_searxng_instance(url) + if instance_url: + if "searxng" not in config: + config["searxng"] = {} + config["searxng"]["instance_url"] = instance_url + enabled_providers.append(provider) + else: + print(color(f" β†’ {name} disabled (no instance URL)", 
Colors.DIM)) + else: + print(color(f" β†’ {name} skipped (no instance)", Colors.DIM)) + else: + if ask_yes_no(f" Enable {name}?", default=True): + # ===== Question 2: API key for each enabled provider ===== + api_key = ask_api_key(name, url) + if api_key: + config[provider]["api_key"] = api_key + enabled_providers.append(provider) + else: + print(color(f" β†’ {name} disabled (no API key)", Colors.DIM)) + else: + print(color(f" β†’ {name} disabled", Colors.DIM)) + print() + + if not enabled_providers: + print() + print(color("⚠️ No providers enabled!", Colors.RED)) + print("You need at least one API key to use web-search-plus.") + print("Run this setup again when you have an API key.") + return False + + # ===== Question 3: Default provider ===== + print(color("─" * 60, Colors.DIM)) + print(color("\nβš™οΈ Step 2: Default Settings\n", Colors.BOLD)) + + if len(enabled_providers) > 1: + default_provider = ask_choice( + "Which provider should be the default for general queries?", + enabled_providers, + enabled_providers[0] + ) + else: + default_provider = enabled_providers[0] + print(f"Default provider: {color(default_provider, Colors.GREEN)} (only one enabled)") + + config["defaults"]["provider"] = default_provider + config["auto_routing"]["fallback_provider"] = default_provider + + # ===== Question 4: Auto-routing ===== + print() + print(color("Auto-routing", Colors.BOLD) + " automatically picks the best provider for each query:") + print(color(" β€’ 'iPhone price' β†’ Serper (shopping intent)", Colors.DIM)) + print(color(" β€’ 'how does TCP work' β†’ Tavily (research intent)", Colors.DIM)) + print(color(" β€’ 'companies like Stripe' β†’ Exa (discovery intent)", Colors.DIM)) + print() + + auto_routing = ask_yes_no("Enable auto-routing?", default=True) + config["auto_routing"]["enabled"] = auto_routing + + if not auto_routing: + print(color(f" β†’ All queries will use {default_provider}", Colors.DIM)) + + # ===== Question 5: Result count ===== + print() + max_results 
= ask_result_count() + config["defaults"]["max_results"] = max_results + + # Set disabled providers + all_providers = ["serper", "tavily", "exa", "you", "searxng"] + disabled = [p for p in all_providers if p not in enabled_providers] + config["auto_routing"]["disabled_providers"] = disabled + + # ===== Save config ===== + print() + print(color("─" * 60, Colors.DIM)) + print(color("\nπŸ’Ύ Saving Configuration\n", Colors.BOLD)) + + with open(config_path, 'w') as f: + json.dump(config, f, indent=2) + + print(color(f"βœ“ Configuration saved to: {config_path}", Colors.GREEN)) + print() + + # ===== Summary ===== + print(color("πŸ“‹ Configuration Summary:", Colors.BOLD)) + print(f" Enabled providers: {', '.join(enabled_providers)}") + print(f" Default provider: {default_provider}") + print(f" Auto-routing: {'enabled' if auto_routing else 'disabled'}") + print(f" Results per search: {max_results}") + print() + + # ===== Test suggestion ===== + print(color("πŸš€ Ready to search! Try:", Colors.BOLD)) + print(color(f" python3 scripts/search.py -q \"your query here\"", Colors.CYAN)) + print() + + return True + +def check_first_run(skill_dir: Path) -> bool: + """Check if this is the first run (no config.json).""" + config_path = skill_dir / "config.json" + return not config_path.exists() + +def main(): + # Determine skill directory + script_path = Path(__file__).resolve() + skill_dir = script_path.parent.parent + + # Check for --reset flag + force_reset = "--reset" in sys.argv + + # Check for --check flag (just check if setup needed) + if "--check" in sys.argv: + if check_first_run(skill_dir): + print("Setup required: config.json not found") + sys.exit(1) + else: + print("Setup complete: config.json exists") + sys.exit(0) + + # Run setup + success = run_setup(skill_dir, force_reset) + sys.exit(0 if success else 1) + +if __name__ == "__main__": + main() diff --git a/test-auto-routing.sh b/test-auto-routing.sh new file mode 100644 index 0000000..8134fd4 --- /dev/null +++ 
#!/bin/bash
#
# Auto-routing smoke test: runs one query per routing intent (shopping,
# research, discovery) to verify the router picks sensible providers.

# Pick up API keys from a local .env file when one is present.
[ -f .env ] && source .env

# Serper is the baseline requirement; stop early if its key is missing.
if [ -z "$SERPER_API_KEY" ]; then
  echo "Error: SERPER_API_KEY not set. Copy .env.example to .env and add your keys."
  exit 1
fi

echo "Testing auto-routing..."
python3 scripts/search.py -q "buy iPhone 15 price" --auto
python3 scripts/search.py -q "how does quantum computing work" --auto
python3 scripts/search.py -q "companies like Stripe" --auto