#!/usr/bin/env python3
|
|
"""
|
|
Web Search Plus — Unified Multi-Provider Search with Intelligent Auto-Routing
|
|
Supports: Serper (Google), Tavily (Research), Querit (Multilingual AI Search),
|
|
Exa (Neural), Perplexity (Direct Answers)
|
|
|
|
Smart Routing uses multi-signal analysis:
|
|
- Query intent classification (shopping, research, discovery)
|
|
- Linguistic pattern detection (how much vs how does)
|
|
- Product/brand recognition
|
|
- URL detection
|
|
- Confidence scoring
|
|
|
|
Usage:
|
|
python3 search.py --query "..." # Auto-route based on query
|
|
python3 search.py --provider [serper|tavily|querit|exa] --query "..." [options]
|
|
|
|
Examples:
|
|
python3 search.py -q "iPhone 16 Pro price" # → Serper (shopping intent)
|
|
python3 search.py -q "how does quantum entanglement work" # → Tavily (research intent)
|
|
python3 search.py -q "startups similar to Notion" # → Exa (discovery intent)
|
|
"""
|
|
|
|
import argparse
|
|
from http.client import IncompleteRead
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Optional, List, Dict, Any, Tuple
|
|
from urllib.request import Request, urlopen
|
|
from urllib.error import HTTPError, URLError
|
|
from urllib.parse import quote, urlparse
|
|
|
|
|
|
# =============================================================================
|
|
# Result Caching
|
|
# =============================================================================
|
|
|
|
CACHE_DIR = Path(os.environ.get("WSP_CACHE_DIR", os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".cache")))
|
|
PROVIDER_HEALTH_FILE = CACHE_DIR / "provider_health.json"
|
|
DEFAULT_CACHE_TTL = 3600 # 1 hour in seconds
|
|
|
|
|
|
def _build_cache_payload(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
|
"""Build normalized payload used for cache key hashing."""
|
|
payload = {
|
|
"query": query,
|
|
"provider": provider,
|
|
"max_results": max_results,
|
|
}
|
|
if params:
|
|
payload.update(params)
|
|
return payload
|
|
|
|
|
|
def _get_cache_key(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> str:
    """Derive a stable 32-hex-char cache key from all query parameters.

    Canonical JSON (sorted keys, compact separators) guarantees the same
    inputs always hash to the same key regardless of dict insertion order.
    """
    normalized = _build_cache_payload(query, provider, max_results, params)
    serialized = json.dumps(normalized, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
    digest = hashlib.sha256(serialized.encode("utf-8"))
    return digest.hexdigest()[:32]
|
|
|
|
|
|
def _get_cache_path(cache_key: str) -> Path:
    """Map a cache key to its JSON file inside CACHE_DIR."""
    return CACHE_DIR.joinpath(cache_key + ".json")
|
|
|
|
|
|
def _ensure_cache_dir() -> None:
    """Idempotently create the cache directory, including missing parents."""
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def cache_get(query: str, provider: str, max_results: int, ttl: int = DEFAULT_CACHE_TTL, params: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
    """Return a cached search result, or None when absent, stale, or corrupt.

    Args:
        query: The search query
        provider: The search provider
        max_results: Maximum results requested
        ttl: Time-to-live in seconds (default: 1 hour)
        params: Extra provider parameters folded into the cache key

    Returns:
        Cached result dict or None if not found/expired
    """
    entry_path = _get_cache_path(_get_cache_key(query, provider, max_results, params))

    if not entry_path.exists():
        return None

    try:
        with open(entry_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)

        age = time.time() - payload.get("_cache_timestamp", 0)
        if age > ttl:
            # Stale entry: delete it so the next lookup misses cleanly.
            entry_path.unlink(missing_ok=True)
            return None

        return payload
    except (json.JSONDecodeError, IOError, KeyError):
        # Unreadable or corrupt cache file: discard and treat as a miss.
        entry_path.unlink(missing_ok=True)
        return None
|
|
|
|
|
|
def cache_put(query: str, provider: str, max_results: int, result: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> None:
    """Persist a search result to the on-disk cache (best effort).

    Args:
        query: The search query
        provider: The search provider
        max_results: Maximum results requested
        result: The search result to cache
        params: Extra provider parameters folded into the cache key
    """
    _ensure_cache_dir()

    key = _get_cache_key(query, provider, max_results, params)
    target = _get_cache_path(key)

    # Annotate a shallow copy with cache metadata; the caller's dict is
    # left untouched.
    entry = result.copy()
    entry["_cache_timestamp"] = time.time()
    entry["_cache_key"] = key
    entry["_cache_query"] = query
    entry["_cache_provider"] = provider
    entry["_cache_max_results"] = max_results
    entry["_cache_params"] = params or {}

    try:
        with open(target, "w", encoding="utf-8") as fh:
            json.dump(entry, fh, ensure_ascii=False, indent=2)
    except IOError as exc:
        # Caching is best-effort: report on stderr but never fail the search.
        print(json.dumps({"cache_write_error": str(exc)}), file=sys.stderr)
|
|
|
|
|
|
def cache_clear() -> Dict[str, Any]:
    """Delete every cached entry, keeping the provider-health ledger.

    Returns:
        Stats about what was cleared
    """
    if not CACHE_DIR.exists():
        return {"cleared": 0, "message": "Cache directory does not exist"}

    removed = 0
    freed = 0

    for entry in CACHE_DIR.glob("*.json"):
        # The provider-health file shares the directory but is not a cache
        # entry; never delete it.
        if entry.name == PROVIDER_HEALTH_FILE.name:
            continue
        try:
            freed += entry.stat().st_size
            entry.unlink()
            removed += 1
        except IOError:
            pass  # best-effort: skip files we cannot stat/remove

    return {
        "cleared": removed,
        "size_freed_bytes": freed,
        "size_freed_kb": round(freed / 1024, 2),
        "message": f"Cleared {removed} cached entries"
    }
|
|
|
|
|
|
def cache_stats() -> Dict[str, Any]:
    """
    Get statistics about the cache.

    Scans every *.json entry under CACHE_DIR (excluding the provider-health
    file), summing sizes, counting entries per provider, and tracking the
    oldest/newest entries by their "_cache_timestamp" metadata.

    Returns:
        Dict with cache statistics: total_entries, total_size_bytes/kb,
        per-provider counts, oldest/newest entry info, cache_dir, exists.
    """
    if not CACHE_DIR.exists():
        # No directory yet: report an empty, well-formed stats payload.
        return {
            "total_entries": 0,
            "total_size_bytes": 0,
            "total_size_kb": 0,
            "oldest": None,
            "newest": None,
            "cache_dir": str(CACHE_DIR),
            "exists": False
        }

    # The provider-health ledger lives alongside cache entries; exclude it.
    entries = [p for p in CACHE_DIR.glob("*.json") if p.name != PROVIDER_HEALTH_FILE.name]
    total_size = 0
    oldest_time = None
    newest_time = None
    oldest_query = None
    newest_query = None
    provider_counts = {}

    for cache_file in entries:
        try:
            stat = cache_file.stat()
            total_size += stat.st_size

            with open(cache_file, "r", encoding="utf-8") as f:
                cached = json.load(f)

            # Metadata written by cache_put(); defaults guard older/foreign files.
            ts = cached.get("_cache_timestamp", 0)
            query = cached.get("_cache_query", "unknown")
            provider = cached.get("_cache_provider", "unknown")

            provider_counts[provider] = provider_counts.get(provider, 0) + 1

            # Track extremes of the timestamp range and their queries.
            if oldest_time is None or ts < oldest_time:
                oldest_time = ts
                oldest_query = query
            if newest_time is None or ts > newest_time:
                newest_time = ts
                newest_query = query
        except (json.JSONDecodeError, IOError):
            # Corrupt or vanished entry: skip it; stats stay best-effort.
            pass

    return {
        "total_entries": len(entries),
        "total_size_bytes": total_size,
        "total_size_kb": round(total_size / 1024, 2),
        "providers": provider_counts,
        "oldest": {
            "timestamp": oldest_time,
            "age_seconds": int(time.time() - oldest_time) if oldest_time else None,
            "query": oldest_query
        } if oldest_time else None,
        "newest": {
            "timestamp": newest_time,
            "age_seconds": int(time.time() - newest_time) if newest_time else None,
            "query": newest_query
        } if newest_time else None,
        "cache_dir": str(CACHE_DIR),
        "exists": True
    }
|
|
|
|
|
|
# =============================================================================
|
|
# Auto-load .env from skill directory (if exists)
|
|
# =============================================================================
|
|
def _load_env_file():
    """Load .env file from skill root directory if it exists."""
    env_path = Path(__file__).parent.parent / ".env"
    if not env_path.exists():
        return
    with open(env_path) as fh:
        for raw in fh:
            raw = raw.strip()
            # Skip blanks, comments, and lines without an assignment.
            if not raw or raw.startswith("#") or "=" not in raw:
                continue
            # Accept both "export VAR=value" and plain "VAR=value".
            if raw.startswith("export "):
                raw = raw[7:]
            name, _, value = raw.partition("=")
            name = name.strip()
            value = value.strip().strip('"').strip("'")
            # Real environment variables always win over .env entries.
            if name and name not in os.environ:
                os.environ[name] = value


_load_env_file()
|
|
|
|
|
|
# =============================================================================
|
|
# Configuration
|
|
# =============================================================================
|
|
|
|
# Built-in configuration; values here are shallow-merged with config.json
# (user sections override these, key by key) in load_config().
DEFAULT_CONFIG = {
    # Used when neither the CLI nor auto-routing picks a provider/limit.
    "defaults": {
        "provider": "serper",
        "max_results": 5
    },
    "auto_routing": {
        "enabled": True,
        "fallback_provider": "serper",
        # Order consulted when several providers score similarly.
        "provider_priority": ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"],
        "disabled_providers": [],
        "confidence_threshold": 0.3,  # Below this, note low confidence
    },
    "serper": {
        "country": "us",
        "language": "en",
        "type": "search"
    },
    "tavily": {
        "depth": "basic",
        "topic": "general"
    },
    "querit": {
        "base_url": "https://api.querit.ai",
        "base_path": "/v1/search",
        "timeout": 10
    },
    "exa": {
        "type": "neural",
        "depth": "normal",
        "verbosity": "standard"
    },
    # Perplexity is reached through the Kilo gateway, not directly.
    "perplexity": {
        "api_url": "https://api.kilo.ai/api/gateway/chat/completions",
        "model": "perplexity/sonar-pro"
    },
    "you": {
        "country": "us",
        "safesearch": "moderate"
    },
    # SearXNG is self-hosted: no API key, only an operator-supplied URL.
    "searxng": {
        "instance_url": None,  # Required - user must set their own instance
        "safesearch": 0,  # 0=off, 1=moderate, 2=strict
        "engines": None,  # Optional list of engines to use
        "language": "en"
    }
}
|
|
|
|
|
|
def load_config() -> Dict[str, Any]:
    """Load configuration, overlaying config.json (if present) on defaults."""
    merged = DEFAULT_CONFIG.copy()
    user_path = Path(__file__).parent.parent / "config.json"

    if user_path.exists():
        try:
            with open(user_path) as fh:
                overrides = json.load(fh)
            for section, value in overrides.items():
                # Dict-valued sections that exist in the defaults merge
                # shallowly over them; anything else replaces wholesale.
                if isinstance(value, dict) and section in merged:
                    merged[section] = {**merged.get(section, {}), **value}
                else:
                    merged[section] = value
        except (json.JSONDecodeError, IOError) as exc:
            # A broken config.json must not kill the CLI; warn and continue.
            print(json.dumps({
                "warning": f"Could not load config.json: {exc}",
                "using": "default configuration"
            }), file=sys.stderr)

    return merged
|
|
|
|
|
|
def get_api_key(provider: str, config: Dict[str, Any] = None) -> Optional[str]:
    """Get API key for provider from config.json or environment.

    Priority: config.json > .env > environment variable

    Note: SearXNG doesn't require an API key, but returns instance_url if configured.
    """
    # SearXNG is keyless; its "credential" is the instance URL.
    if provider == "searxng":
        return get_searxng_instance_url(config)

    # 1) config.json section (accepts both api_key and apiKey spellings).
    section = (config or {}).get(provider, {})
    if isinstance(section, dict):
        configured = section.get("api_key") or section.get("apiKey")
        if configured:
            return configured

    # 2) Environment. Perplexity accepts either its own key or the Kilo one.
    if provider == "perplexity":
        return os.environ.get("PERPLEXITY_API_KEY") or os.environ.get("KILOCODE_API_KEY")

    env_names = {
        "serper": "SERPER_API_KEY",
        "tavily": "TAVILY_API_KEY",
        "querit": "QUERIT_API_KEY",
        "exa": "EXA_API_KEY",
        "you": "YOU_API_KEY",
    }
    return os.environ.get(env_names.get(provider, ""))
|
|
|
|
|
|
def _validate_searxng_url(url: str) -> str:
|
|
"""Validate and sanitize SearXNG instance URL to prevent SSRF.
|
|
|
|
Enforces http/https scheme and blocks requests to private/internal networks
|
|
including cloud metadata endpoints, loopback, link-local, and RFC1918 ranges.
|
|
"""
|
|
import ipaddress
|
|
import socket
|
|
from urllib.parse import urlparse
|
|
|
|
parsed = urlparse(url)
|
|
if parsed.scheme not in ("http", "https"):
|
|
raise ValueError(f"SearXNG URL must use http or https scheme, got: {parsed.scheme}")
|
|
if not parsed.hostname:
|
|
raise ValueError("SearXNG URL must include a hostname")
|
|
|
|
hostname = parsed.hostname
|
|
|
|
# Block cloud metadata endpoints by hostname
|
|
BLOCKED_HOSTS = {
|
|
"169.254.169.254", # AWS/GCP/Azure metadata
|
|
"metadata.google.internal",
|
|
"metadata.internal",
|
|
}
|
|
if hostname in BLOCKED_HOSTS:
|
|
raise ValueError(f"SearXNG URL blocked: {hostname} is a cloud metadata endpoint")
|
|
|
|
# Resolve hostname and check for private/internal IPs
|
|
# Operators who intentionally self-host on private networks can opt out
|
|
allow_private = os.environ.get("SEARXNG_ALLOW_PRIVATE", "").strip() == "1"
|
|
if not allow_private:
|
|
try:
|
|
resolved_ips = socket.getaddrinfo(hostname, parsed.port or 80, proto=socket.IPPROTO_TCP)
|
|
for family, _type, _proto, _canonname, sockaddr in resolved_ips:
|
|
ip = ipaddress.ip_address(sockaddr[0])
|
|
if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved:
|
|
raise ValueError(
|
|
f"SearXNG URL blocked: {hostname} resolves to private/internal IP {ip}. "
|
|
f"If this is intentional, set SEARXNG_ALLOW_PRIVATE=1 in your environment."
|
|
)
|
|
except socket.gaierror:
|
|
raise ValueError(f"SearXNG URL blocked: cannot resolve hostname {hostname}")
|
|
|
|
return url
|
|
|
|
|
|
def get_searxng_instance_url(config: Dict[str, Any] = None) -> Optional[str]:
    """Get SearXNG instance URL from config or environment.

    SearXNG is self-hosted, so no API key needed - just the instance URL.
    Priority: config.json > SEARXNG_INSTANCE_URL environment variable

    Security: URL is validated to prevent SSRF via scheme enforcement.
    Both config sources (config.json, env var) are operator-controlled,
    not agent-controlled, so private IPs like localhost are permitted.
    """
    # 1) config.json
    if config:
        section = config.get("searxng", {})
        if isinstance(section, dict) and section.get("instance_url"):
            return _validate_searxng_url(section["instance_url"])

    # 2) environment
    env_url = os.environ.get("SEARXNG_INSTANCE_URL")
    return _validate_searxng_url(env_url) if env_url else None
|
|
|
|
|
|
# Backward compatibility alias
def get_env_key(provider: str) -> Optional[str]:
    """Legacy shim: delegate to get_api_key() with no config."""
    return get_api_key(provider)
|
|
|
|
|
|
def validate_api_key(provider: str, config: Optional[Dict[str, Any]] = None) -> str:
    """Validate and return API key (or instance URL for SearXNG), with helpful error messages.

    Raises:
        ProviderConfigError: When the credential is missing or malformed.
            (Defined elsewhere in this module; its message is a JSON payload
            with remediation steps.)
    """
    key = get_api_key(provider, config)

    # Special handling for SearXNG - it needs instance URL, not API key
    if provider == "searxng":
        if not key:
            error_msg = {
                "error": "Missing SearXNG instance URL",
                "env_var": "SEARXNG_INSTANCE_URL",
                "how_to_fix": [
                    "1. Set up your own SearXNG instance: https://docs.searxng.org/admin/installation.html",
                    "2. Add to config.json: \"searxng\": {\"instance_url\": \"https://your-instance.example.com\"}",
                    "3. Or set environment variable: export SEARXNG_INSTANCE_URL=\"https://your-instance.example.com\"",
                    "Note: SearXNG requires a self-hosted instance with JSON format enabled.",
                ],
                "provider": provider
            }
            raise ProviderConfigError(json.dumps(error_msg))

        # Validate URL format
        if not key.startswith(("http://", "https://")):
            raise ProviderConfigError(json.dumps({
                "error": "SearXNG instance URL must start with http:// or https://",
                "provided": key,
                "provider": provider
            }))

        return key

    if not key:
        # NOTE(review): direct [provider] indexing raises KeyError for an
        # unknown provider name — presumably callers pre-validate; confirm.
        env_var = {
            "serper": "SERPER_API_KEY",
            "tavily": "TAVILY_API_KEY",
            "querit": "QUERIT_API_KEY",
            "exa": "EXA_API_KEY",
            "you": "YOU_API_KEY",
            "perplexity": "KILOCODE_API_KEY"
        }[provider]

        # Sign-up pages shown in the remediation steps below.
        urls = {
            "serper": "https://serper.dev",
            "tavily": "https://tavily.com",
            "querit": "https://querit.ai",
            "exa": "https://exa.ai",
            "you": "https://api.you.com",
            "perplexity": "https://api.kilo.ai"
        }

        error_msg = {
            "error": f"Missing API key for {provider}",
            "env_var": env_var,
            "how_to_fix": [
                f"1. Get your API key from {urls[provider]}",
                f"2. Add to config.json: \"{provider}\": {{\"api_key\": \"your-key\"}}",
                f"3. Or set environment variable: export {env_var}=\"your-key\"",
            ],
            "provider": provider
        }
        raise ProviderConfigError(json.dumps(error_msg))

    # Cheap sanity check: every supported provider issues keys longer than this.
    if len(key) < 10:
        raise ProviderConfigError(json.dumps({
            "error": f"API key for {provider} appears invalid (too short)",
            "provider": provider
        }))

    return key
|
|
|
|
|
|
# =============================================================================
|
|
# Intelligent Auto-Routing Engine
|
|
# =============================================================================
|
|
|
|
class QueryAnalyzer:
    """
    Intelligent query analysis for smart provider routing.

    Uses multi-signal analysis:
    - Intent classification (shopping, research, discovery, local, news)
    - Linguistic patterns (question structure, phrase patterns)
    - Entity detection (products, brands, URLs, dates)
    - Complexity assessment
    """

    # Intent signal patterns with weights.
    # Higher weight = stronger signal for that provider.
    # Each dict maps a regex (matched case-insensitively against the query)
    # to a weight; _calculate_signal_score() sums the weights of all matches.

    SHOPPING_SIGNALS = {
        # Price patterns (very strong)
        r'\bhow much\b': 4.0,
        r'\bprice of\b': 4.0,
        r'\bcost of\b': 4.0,
        r'\bprices?\b': 3.0,
        r'\$\d+|\d+\s*dollars?': 3.0,
        r'€\d+|\d+\s*euros?': 3.0,
        r'£\d+|\d+\s*pounds?': 3.0,

        # German price patterns (very strong)
        r'\bpreis(e)?\b': 3.5,
        r'\bkosten\b': 3.0,
        r'\bwieviel\b': 3.5,
        r'\bwie viel\b': 3.5,
        r'\bwas kostet\b': 4.0,

        # Purchase intent (strong)
        r'\bbuy\b': 3.5,
        r'\bpurchase\b': 3.5,
        r'\border\b(?!\s+by)': 3.0,  # "order" but not "order by"
        r'\bshopping\b': 3.5,
        r'\bshop for\b': 3.5,
        r'\bwhere to (buy|get|purchase)\b': 4.0,

        # German purchase intent (strong)
        r'\bkaufen\b': 3.5,
        r'\bbestellen\b': 3.5,
        r'\bwo kaufen\b': 4.0,
        r'\bhändler\b': 3.0,
        r'\bshop\b': 2.5,

        # Deal/discount signals
        r'\bdeal(s)?\b': 3.0,
        r'\bdiscount(s)?\b': 3.0,
        r'\bsale\b': 2.5,
        r'\bcheap(er|est)?\b': 3.0,
        r'\baffordable\b': 2.5,
        r'\bbudget\b': 2.5,
        r'\bbest price\b': 3.5,
        r'\bcompare prices\b': 3.5,
        r'\bcoupon\b': 3.0,

        # German deal/discount signals
        r'\bgünstig(er|ste)?\b': 3.0,
        r'\bbillig(er|ste)?\b': 3.0,
        r'\bangebot(e)?\b': 3.0,
        r'\brabatt\b': 3.0,
        r'\baktion\b': 2.5,
        r'\bschnäppchen\b': 3.0,

        # Product comparison
        r'\bvs\.?\b': 2.0,
        r'\bversus\b': 2.0,
        r'\bor\b.*\bwhich\b': 2.0,
        r'\bspecs?\b': 2.5,
        r'\bspecifications?\b': 2.5,
        r'\breview(s)?\b': 2.0,
        r'\brating(s)?\b': 2.0,
        r'\bunboxing\b': 2.5,

        # German product comparison
        r'\btest\b': 2.5,
        r'\bbewertung(en)?\b': 2.5,
        r'\btechnische daten\b': 3.0,
        r'\bspezifikationen\b': 2.5,
    }

    RESEARCH_SIGNALS = {
        # Explanation patterns (very strong)
        r'\bhow does\b': 4.0,
        r'\bhow do\b': 3.5,
        r'\bwhy does\b': 4.0,
        r'\bwhy do\b': 3.5,
        r'\bwhy is\b': 3.5,
        r'\bexplain\b': 4.0,
        r'\bexplanation\b': 4.0,
        r'\bwhat is\b': 3.0,
        r'\bwhat are\b': 3.0,
        r'\bdefine\b': 3.5,
        r'\bdefinition of\b': 3.5,
        r'\bmeaning of\b': 3.0,

        # Analysis patterns (strong)
        r'\banalyze\b': 3.5,
        r'\banalysis\b': 3.5,
        r'\bcompare\b(?!\s*prices?)': 3.0,  # compare but not "compare prices"
        r'\bcomparison\b': 3.0,
        r'\bstatus of\b': 3.5,
        r'\bstatus\b': 2.5,
        r'\bwhat happened with\b': 4.0,
        r'\bpros and cons\b': 4.0,
        r'\badvantages?\b': 3.0,
        r'\bdisadvantages?\b': 3.0,
        r'\bbenefits?\b': 2.5,
        r'\bdrawbacks?\b': 3.0,
        r'\bdifference between\b': 3.5,

        # Learning patterns
        r'\bunderstand\b': 3.0,
        r'\blearn(ing)?\b': 2.5,
        r'\btutorial\b': 3.0,
        r'\bguide\b': 2.5,
        r'\bhow to\b': 2.0,  # Lower weight - could be shopping too
        r'\bstep by step\b': 3.0,

        # Depth signals
        r'\bin[- ]depth\b': 3.0,
        r'\bdetailed\b': 2.5,
        r'\bcomprehensive\b': 3.0,
        r'\bthorough\b': 2.5,
        r'\bdeep dive\b': 3.5,
        r'\boverall\b': 2.0,
        r'\bsummary\b': 2.0,

        # Academic patterns
        r'\bstudy\b': 2.5,
        r'\bresearch shows\b': 3.5,
        r'\baccording to\b': 2.5,
        r'\bevidence\b': 3.0,
        r'\bscientific\b': 3.0,
        r'\bhistory of\b': 3.0,
        r'\bbackground\b': 2.5,
        r'\bcontext\b': 2.5,
        r'\bimplications?\b': 3.0,

        # German explanation patterns (very strong)
        r'\bwie funktioniert\b': 4.0,
        r'\bwarum\b': 3.5,
        r'\berklär(en|ung)?\b': 4.0,
        r'\bwas ist\b': 3.0,
        r'\bwas sind\b': 3.0,
        r'\bbedeutung\b': 3.0,

        # German analysis patterns
        r'\banalyse\b': 3.5,
        r'\bvergleich(en)?\b': 3.0,
        r'\bvor- und nachteile\b': 4.0,
        r'\bvorteile\b': 3.0,
        r'\bnachteile\b': 3.0,
        r'\bunterschied(e)?\b': 3.5,

        # German learning patterns
        r'\bverstehen\b': 3.0,
        r'\blernen\b': 2.5,
        r'\banleitung\b': 3.0,
        r'\bübersicht\b': 2.5,
        r'\bhintergrund\b': 2.5,
        r'\bzusammenfassung\b': 2.5,
    }

    DISCOVERY_SIGNALS = {
        # Similarity patterns (very strong)
        r'\bsimilar to\b': 5.0,
        r'\blike\s+\w+\.com': 4.5,  # "like notion.com"
        r'\balternatives? to\b': 5.0,
        r'\bcompetitors? (of|to)\b': 4.5,
        r'\bcompeting with\b': 4.0,
        r'\brivals? (of|to)\b': 4.0,
        r'\binstead of\b': 3.0,
        r'\breplacement for\b': 3.5,

        # Company/startup patterns (strong)
        r'\bcompanies (like|that|doing|building)\b': 4.5,
        r'\bstartups? (like|that|doing|building)\b': 4.5,
        r'\bwho else\b': 4.0,
        r'\bother (companies|startups|tools|apps)\b': 3.5,
        r'\bfind (companies|startups|tools|examples?)\b': 4.5,
        r'\bevents? in\b': 4.0,
        r'\bthings to do in\b': 4.5,

        # Funding/business patterns
        r'\bseries [a-d]\b': 4.0,
        r'\byc\b|y combinator': 4.0,
        r'\bfund(ed|ing|raise)\b': 3.5,
        r'\bventure\b': 3.0,
        r'\bvaluation\b': 3.0,

        # Category patterns
        r'\bresearch papers? (on|about)\b': 4.0,
        r'\barxiv\b': 4.5,
        r'\bgithub (projects?|repos?)\b': 4.5,
        r'\bopen source\b.*\bprojects?\b': 4.0,
        r'\btweets? (about|on)\b': 3.5,
        r'\bblogs? (about|on|like)\b': 3.0,

        # URL detection (very strong signal for Exa similar)
        r'https?://[^\s]+': 5.0,
        r'\b\w+\.(com|org|io|ai|co|dev)\b': 3.5,
    }

    LOCAL_NEWS_SIGNALS = {
        # Local patterns → Serper
        r'\bnear me\b': 4.0,
        r'\bnearby\b': 3.5,
        r'\blocal\b': 3.0,
        r'\bin (my )?(city|area|town|neighborhood)\b': 3.5,
        r'\brestaurants?\b': 2.5,
        r'\bhotels?\b': 2.5,
        r'\bcafes?\b': 2.5,
        r'\bstores?\b': 2.0,
        r'\bdirections? to\b': 3.5,
        r'\bmap of\b': 3.0,
        r'\bphone number\b': 3.0,
        r'\baddress of\b': 3.0,
        r'\bopen(ing)? hours\b': 3.0,

        # Weather/time
        r'\bweather\b': 4.0,
        r'\bforecast\b': 3.5,
        r'\btemperature\b': 3.0,
        r'\btime in\b': 3.0,

        # News/recency patterns → Serper (or Tavily for news depth)
        r'\blatest\b': 2.5,
        r'\brecent\b': 2.5,
        r'\btoday\b': 2.5,
        r'\bbreaking\b': 3.5,
        r'\bnews\b': 2.5,
        r'\bheadlines?\b': 3.0,
        r'\b202[4-9]\b': 2.0,  # Current year mentions
        r'\blast (week|month|year)\b': 2.0,

        # German local patterns
        r'\bin der nähe\b': 4.0,
        r'\bin meiner nähe\b': 4.0,
        r'\böffnungszeiten\b': 3.0,
        r'\badresse von\b': 3.0,
        r'\bweg(beschreibung)? nach\b': 3.5,

        # German news/recency patterns
        r'\bheute\b': 2.5,
        r'\bmorgen\b': 2.0,
        r'\baktuell\b': 2.5,
        r'\bnachrichten\b': 3.0,
    }

    # RAG/AI signals → You.com
    # You.com excels at providing LLM-ready snippets and combined web+news
    RAG_SIGNALS = {
        # RAG/context patterns (strong signal for You.com)
        r'\brag\b': 4.5,
        r'\bcontext for\b': 4.0,
        r'\bsummarize\b': 3.5,
        r'\bbrief(ly)?\b': 3.0,
        r'\bquick overview\b': 3.5,
        r'\btl;?dr\b': 4.0,
        r'\bkey (points|facts|info)\b': 3.5,
        r'\bmain (points|takeaways)\b': 3.5,

        # Combined web + news queries
        r'\b(web|online)\s+and\s+news\b': 4.0,
        r'\ball sources\b': 3.5,
        r'\bcomprehensive (search|overview)\b': 3.5,
        r'\blatest\s+(news|updates)\b': 3.0,
        r'\bcurrent (events|situation|status)\b': 3.5,

        # Real-time information needs
        r'\bright now\b': 3.0,
        r'\bas of today\b': 3.5,
        r'\bup.to.date\b': 3.5,
        r'\breal.time\b': 4.0,
        r'\blive\b': 2.5,

        # Information synthesis
        r'\bwhat\'?s happening with\b': 3.5,
        r'\bwhat\'?s the latest\b': 4.0,
        r'\bupdates?\s+on\b': 3.5,
        r'\bstatus of\b': 3.0,
        r'\bsituation (in|with|around)\b': 3.5,
    }

    # Direct answer / synthesis signals → Perplexity via Kilo Gateway
    DIRECT_ANSWER_SIGNALS = {
        r'\bwhat is\b': 3.0,
        r'\bwhat are\b': 2.5,
        r'\bcurrent status\b': 4.0,
        r'\bstatus of\b': 3.5,
        r'\bstatus\b': 2.5,
        r'\bwhat happened with\b': 4.0,
        r"\bwhat'?s happening with\b": 4.0,
        r'\bas of (today|now)\b': 4.0,
        r'\bthis weekend\b': 3.5,
        r'\bevents? in\b': 3.5,
        r'\bthings to do in\b': 4.0,
        r'\bnear me\b': 3.0,
        r'\bcan you (tell me|summarize|explain)\b': 3.5,
        # German
        r'\bwann\b': 3.0,
        r'\bwer\b': 3.0,
        r'\bwo\b': 2.5,
        r'\bwie viele\b': 3.0,
    }

    # Privacy/Multi-source signals → SearXNG (self-hosted meta-search)
    # SearXNG is ideal for privacy-focused queries and aggregating multiple sources
    PRIVACY_SIGNALS = {
        # Privacy signals (very strong)
        r'\bprivate(ly)?\b': 4.0,
        r'\banonymous(ly)?\b': 4.0,
        r'\bwithout tracking\b': 4.5,
        r'\bno track(ing)?\b': 4.5,
        r'\bprivacy\b': 3.5,
        r'\bprivacy.?focused\b': 4.5,
        r'\bprivacy.?first\b': 4.5,
        r'\bduckduckgo alternative\b': 4.5,
        r'\bprivate search\b': 5.0,

        # German privacy signals
        r'\bprivat\b': 4.0,
        r'\banonym\b': 4.0,
        r'\bohne tracking\b': 4.5,
        r'\bdatenschutz\b': 4.0,

        # Multi-source aggregation signals
        r'\baggregate results?\b': 4.0,
        r'\bmultiple sources?\b': 4.0,
        r'\bdiverse (results|perspectives|sources)\b': 4.0,
        r'\bfrom (all|multiple|different) (engines?|sources?)\b': 4.5,
        r'\bmeta.?search\b': 5.0,
        r'\ball engines?\b': 4.0,

        # German multi-source signals
        r'\bverschiedene quellen\b': 4.0,
        r'\baus mehreren quellen\b': 4.0,
        r'\balle suchmaschinen\b': 4.5,

        # Budget/free signals (SearXNG is self-hosted = $0 API cost)
        r'\bfree search\b': 3.5,
        r'\bno api cost\b': 4.0,
        r'\bself.?hosted search\b': 5.0,
        r'\bzero cost\b': 3.5,
        r'\bbudget\b(?!\s*(laptop|phone|option))\b': 2.5,  # "budget" alone, not "budget laptop"

        # German budget signals
        r'\bkostenlos(e)?\s+suche\b': 3.5,
        r'\bkeine api.?kosten\b': 4.0,
    }

    # Exa Deep Search signals → deep multi-source synthesis
    EXA_DEEP_SIGNALS = {
        r'\bsynthesi[sz]e\b': 5.0,
        r'\bdeep research\b': 5.0,
        r'\bcomprehensive (analysis|report|overview|survey)\b': 4.5,
        r'\bacross (multiple|many|several) (sources|documents|papers)\b': 4.5,
        r'\baggregat(e|ing) (information|data|results)\b': 4.0,
        r'\bcross.?referenc': 4.5,
        r'\bsec filings?\b': 4.5,
        r'\bannual reports?\b': 4.0,
        r'\bearnings (call|report|transcript)\b': 4.5,
        r'\bfinancial analysis\b': 4.0,
        r'\bliterature (review|survey)\b': 5.0,
        r'\bacademic literature\b': 4.5,
        r'\bstate of the (art|field|industry)\b': 4.0,
        r'\bcompile (a |the )?(report|findings|results)\b': 4.5,
        r'\bsummariz(e|ing) (research|papers|studies)\b': 4.0,
        r'\bmultiple documents?\b': 4.0,
        r'\bdossier\b': 4.5,
        r'\bdue diligence\b': 4.5,
        r'\bstructured (output|data|report)\b': 4.0,
        r'\bmarket research\b': 4.0,
        r'\bindustry (report|analysis|overview)\b': 4.0,
        r'\bresearch (on|about|into)\b': 4.0,
        r'\bwhitepaper\b': 4.5,
        r'\btechnical report\b': 4.0,
        r'\bsurvey of\b': 4.5,
        r'\bmeta.?analysis\b': 5.0,
        r'\bsystematic review\b': 5.0,
        r'\bcase study\b': 3.5,
        r'\bbenchmark(s|ing)?\b': 3.5,
        # German
        r'\btiefenrecherche\b': 5.0,
        r'\bumfassende (analyse|übersicht|recherche)\b': 4.5,
        r'\baus mehreren quellen zusammenfassen\b': 4.5,
        r'\bmarktforschung\b': 4.0,
    }

    # Exa Deep Reasoning signals → complex cross-reference analysis
    EXA_DEEP_REASONING_SIGNALS = {
        r'\bdeep.?reasoning\b': 6.0,
        r'\bcomplex (analysis|reasoning|research)\b': 4.5,
        r'\bcontradictions?\b': 4.5,
        r'\breconcil(e|ing)\b': 5.0,
        r'\bcritical(ly)? analyz': 4.5,
        r'\bweigh(ing)? (the )?evidence\b': 4.5,
        r'\bcompeting (claims|theories|perspectives)\b': 4.5,
        r'\bcomplex financial\b': 4.5,
        r'\bregulatory (analysis|compliance|landscape)\b': 4.5,
        r'\blegal analysis\b': 4.5,
        r'\bcomprehensive (due diligence|investigation)\b': 5.0,
        r'\bpatent (landscape|analysis|search)\b': 4.5,
        r'\bmarket intelligence\b': 4.5,
        r'\bcompetitive (intelligence|landscape)\b': 4.5,
        r'\btrade.?offs?\b': 4.0,
        r'\bpros and cons of\b': 4.0,
        r'\bshould I (use|choose|pick)\b': 3.5,
        r'\bwhich is better\b': 4.0,
        # German
        r'\bkomplexe analyse\b': 4.5,
        r'\bwidersprüche\b': 4.5,
        r'\bquellen abwägen\b': 4.5,
        r'\brechtliche analyse\b': 4.5,
        r'\bvergleich(e|en)?\b': 3.5,
    }

    # Brand/product patterns for shopping detection
    # (used by _detect_product_brand_combo; matching any one marks the query
    # as brand-bearing)
    BRAND_PATTERNS = [
        # Tech brands
        r'\b(apple|iphone|ipad|macbook|airpods?)\b',
        r'\b(samsung|galaxy)\b',
        r'\b(google|pixel)\b',
        r'\b(microsoft|surface|xbox)\b',
        r'\b(sony|playstation)\b',
        r'\b(nvidia|geforce|rtx)\b',
        r'\b(amd|ryzen|radeon)\b',
        r'\b(intel|core i[3579])\b',
        r'\b(dell|hp|lenovo|asus|acer)\b',
        r'\b(lg|tcl|hisense)\b',

        # Product categories
        r'\b(laptop|phone|tablet|tv|monitor|headphones?|earbuds?)\b',
        r'\b(camera|lens|drone)\b',
        r'\b(watch|smartwatch|fitbit|garmin)\b',
        r'\b(router|modem|wifi)\b',
        r'\b(keyboard|mouse|gaming)\b',
    ]
|
|
|
|
def __init__(self, config: Dict[str, Any]):
|
|
self.config = config
|
|
self.auto_config = config.get("auto_routing", DEFAULT_CONFIG["auto_routing"])
|
|
|
|
def _calculate_signal_score(
|
|
self,
|
|
query: str,
|
|
signals: Dict[str, float]
|
|
) -> Tuple[float, List[Dict[str, Any]]]:
|
|
"""
|
|
Calculate score for a signal category.
|
|
Returns (total_score, list of matched signals with details).
|
|
"""
|
|
query_lower = query.lower()
|
|
matches = []
|
|
total_score = 0.0
|
|
|
|
for pattern, weight in signals.items():
|
|
regex = re.compile(pattern, re.IGNORECASE)
|
|
found = regex.findall(query_lower)
|
|
if found:
|
|
# Normalize found matches
|
|
match_text = found[0] if isinstance(found[0], str) else found[0][0] if found[0] else pattern
|
|
matches.append({
|
|
"pattern": pattern,
|
|
"matched": match_text,
|
|
"weight": weight
|
|
})
|
|
total_score += weight
|
|
|
|
return total_score, matches
|
|
|
|
def _detect_product_brand_combo(self, query: str) -> float:
|
|
"""
|
|
Detect product + brand combinations which strongly indicate shopping intent.
|
|
Returns a bonus score.
|
|
"""
|
|
query_lower = query.lower()
|
|
brand_found = False
|
|
product_found = False
|
|
|
|
for pattern in self.BRAND_PATTERNS:
|
|
if re.search(pattern, query_lower, re.IGNORECASE):
|
|
brand_found = True
|
|
break
|
|
|
|
# Check for product indicators
|
|
product_indicators = [
|
|
r'\b(buy|price|specs?|review|vs|compare)\b',
|
|
r'\b(pro|max|plus|mini|ultra|lite)\b', # Product tier names
|
|
r'\b\d+\s*(gb|tb|inch|mm|hz)\b', # Specifications
|
|
]
|
|
for pattern in product_indicators:
|
|
if re.search(pattern, query_lower, re.IGNORECASE):
|
|
product_found = True
|
|
break
|
|
|
|
if brand_found and product_found:
|
|
return 3.0 # Strong shopping signal
|
|
elif brand_found:
|
|
return 1.5 # Moderate shopping signal
|
|
return 0.0
|
|
|
|
def _detect_url(self, query: str) -> Optional[str]:
|
|
"""Detect URLs in query - strong signal for Exa similar search."""
|
|
url_pattern = r'https?://[^\s]+'
|
|
match = re.search(url_pattern, query)
|
|
if match:
|
|
return match.group()
|
|
|
|
# Also check for domain-like patterns
|
|
domain_pattern = r'\b(\w+\.(com|org|io|ai|co|dev|net|app))\b'
|
|
match = re.search(domain_pattern, query, re.IGNORECASE)
|
|
if match:
|
|
return match.group()
|
|
|
|
return None
|
|
|
|
def _assess_query_complexity(self, query: str) -> Dict[str, Any]:
|
|
"""
|
|
Assess query complexity - complex queries favor Tavily.
|
|
"""
|
|
words = query.split()
|
|
word_count = len(words)
|
|
|
|
# Count question words
|
|
question_words = len(re.findall(
|
|
r'\b(what|why|how|when|where|which|who|whose|whom)\b',
|
|
query, re.IGNORECASE
|
|
))
|
|
|
|
# Check for multiple clauses
|
|
clause_markers = len(re.findall(
|
|
r'\b(and|but|or|because|since|while|although|if|when)\b',
|
|
query, re.IGNORECASE
|
|
))
|
|
|
|
complexity_score = 0.0
|
|
if word_count > 10:
|
|
complexity_score += 1.5
|
|
if word_count > 20:
|
|
complexity_score += 1.0
|
|
if question_words > 1:
|
|
complexity_score += 1.0
|
|
if clause_markers > 0:
|
|
complexity_score += 0.5 * clause_markers
|
|
|
|
return {
|
|
"word_count": word_count,
|
|
"question_words": question_words,
|
|
"clause_markers": clause_markers,
|
|
"complexity_score": complexity_score,
|
|
"is_complex": complexity_score > 2.0
|
|
}
|
|
|
|
def _detect_recency_intent(self, query: str) -> Tuple[bool, float]:
|
|
"""
|
|
Detect if query wants recent/timely information.
|
|
Returns (is_recency_focused, score).
|
|
"""
|
|
recency_patterns = [
|
|
(r'\b(latest|newest|recent|current)\b', 2.5),
|
|
(r'\b(today|yesterday|this week|this month)\b', 3.0),
|
|
(r'\b(202[4-9]|2030)\b', 2.0),
|
|
(r'\b(breaking|live|just|now)\b', 3.0),
|
|
(r'\blast (hour|day|week|month)\b', 2.5),
|
|
]
|
|
|
|
total = 0.0
|
|
for pattern, weight in recency_patterns:
|
|
if re.search(pattern, query, re.IGNORECASE):
|
|
total += weight
|
|
|
|
return total > 2.0, total
|
|
|
|
    def analyze(self, query: str) -> Dict[str, Any]:
        """
        Perform comprehensive query analysis.

        Scores the query against each intent-signal category, applies
        bonus heuristics (brand+product combo, embedded URL, complexity),
        then blends the category scores into one score per provider.

        Returns detailed analysis with scores for each provider:
        provider_scores, provider_matches (per-provider matched signals),
        detected_url, complexity details, and recency/Exa-depth scores.
        """
        # Calculate scores for each intent category
        shopping_score, shopping_matches = self._calculate_signal_score(
            query, self.SHOPPING_SIGNALS
        )
        research_score, research_matches = self._calculate_signal_score(
            query, self.RESEARCH_SIGNALS
        )
        discovery_score, discovery_matches = self._calculate_signal_score(
            query, self.DISCOVERY_SIGNALS
        )
        local_news_score, local_news_matches = self._calculate_signal_score(
            query, self.LOCAL_NEWS_SIGNALS
        )
        rag_score, rag_matches = self._calculate_signal_score(
            query, self.RAG_SIGNALS
        )
        privacy_score, privacy_matches = self._calculate_signal_score(
            query, self.PRIVACY_SIGNALS
        )
        direct_answer_score, direct_answer_matches = self._calculate_signal_score(
            query, self.DIRECT_ANSWER_SIGNALS
        )
        exa_deep_score, exa_deep_matches = self._calculate_signal_score(
            query, self.EXA_DEEP_SIGNALS
        )
        exa_deep_reasoning_score, exa_deep_reasoning_matches = self._calculate_signal_score(
            query, self.EXA_DEEP_REASONING_SIGNALS
        )

        # Apply product/brand bonus to shopping
        brand_bonus = self._detect_product_brand_combo(query)
        if brand_bonus > 0:
            shopping_score += brand_bonus
            shopping_matches.append({
                "pattern": "product_brand_combo",
                "matched": "brand + product detected",
                "weight": brand_bonus
            })

        # Detect URL → strong Exa signal
        detected_url = self._detect_url(query)
        if detected_url:
            discovery_score += 5.0
            discovery_matches.append({
                "pattern": "url_detected",
                "matched": detected_url,
                "weight": 5.0
            })

        # Assess complexity → favors Tavily
        # When complex, the bonus is folded into research_score here;
        # otherwise Tavily gets the raw complexity_score directly below.
        complexity = self._assess_query_complexity(query)
        if complexity["is_complex"]:
            research_score += complexity["complexity_score"]
            research_matches.append({
                "pattern": "query_complexity",
                "matched": f"complex query ({complexity['word_count']} words)",
                "weight": complexity["complexity_score"]
            })

        # Check recency intent
        is_recency, recency_score = self._detect_recency_intent(query)

        # Map intents to providers with final scores
        # NOTE(review): the blend coefficients below are hand-tuned
        # weights, not derived from data — adjust with care.
        provider_scores = {
            "serper": shopping_score + local_news_score + (recency_score * 0.35),
            "tavily": research_score + (complexity["complexity_score"] if not complexity["is_complex"] else 0) + (0.2 * recency_score),
            "querit": (research_score * 0.65) + (rag_score * 0.35) + (recency_score * 0.45),
            "exa": discovery_score + (1.0 if re.search(r"\b(similar|alternatives?|examples?)\b", query, re.IGNORECASE) else 0.0) + (exa_deep_score * 0.5) + (exa_deep_reasoning_score * 0.5),
            "perplexity": direct_answer_score + (local_news_score * 0.4) + (recency_score * 0.55),
            "you": rag_score + (recency_score * 0.25),  # You.com good for real-time + RAG
            "searxng": privacy_score,  # SearXNG for privacy/multi-source queries
        }

        # Build match details per provider
        # (querit intentionally shares the research match list with tavily)
        provider_matches = {
            "serper": shopping_matches + local_news_matches,
            "tavily": research_matches,
            "querit": research_matches,
            "exa": discovery_matches + exa_deep_matches + exa_deep_reasoning_matches,
            "perplexity": direct_answer_matches,
            "you": rag_matches,
            "searxng": privacy_matches,
        }

        return {
            "query": query,
            "provider_scores": provider_scores,
            "provider_matches": provider_matches,
            "detected_url": detected_url,
            "complexity": complexity,
            "recency_focused": is_recency,
            "recency_score": recency_score,
            "exa_deep_score": exa_deep_score,
            "exa_deep_reasoning_score": exa_deep_reasoning_score,
        }
|
|
|
|
    def route(self, query: str) -> Dict[str, Any]:
        """
        Route query to optimal provider with confidence scoring.

        Picks the available provider with the highest analyze() score,
        breaking ties via the configured provider_priority list.
        Confidence blends absolute score strength with the margin over
        the runner-up. Returns a dict with the chosen provider,
        confidence/level/reason, exa_depth, per-provider scores, top
        matched signals, and a short analysis summary.
        """
        analysis = self.analyze(query)
        scores = analysis["provider_scores"]

        # Filter to available providers (not disabled, API key present)
        disabled = set(self.auto_config.get("disabled_providers", []))
        available = {
            p: s for p, s in scores.items()
            if p not in disabled and get_api_key(p, self.config)
        }

        if not available:
            # No providers available, use fallback
            fallback = self.auto_config.get("fallback_provider", "serper")
            return {
                "provider": fallback,
                "confidence": 0.0,
                "confidence_level": "low",
                "reason": "no_available_providers",
                "scores": scores,
                "top_signals": [],
                "analysis": analysis,
            }

        # Find the winner
        max_score = max(available.values())
        # NOTE(review): total_score is computed but never used below.
        total_score = sum(available.values()) or 1.0

        # Handle ties using priority
        priority = self.auto_config.get("provider_priority", ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"])
        winners = [p for p, s in available.items() if s == max_score]

        if len(winners) > 1:
            # Use priority to break tie
            for p in priority:
                if p in winners:
                    winner = p
                    break
            else:
                # No tied provider appears in the priority list.
                winner = winners[0]
        else:
            winner = winners[0]

        # Calculate confidence
        # High confidence = clear winner with good margin
        if max_score == 0:
            confidence = 0.0
            reason = "no_signals_matched"
        else:
            # Confidence based on:
            # 1. Absolute score (is it strong enough?)
            # 2. Relative margin (is there a clear winner?)
            second_best = sorted(available.values(), reverse=True)[1] if len(available) > 1 else 0
            margin = (max_score - second_best) / max_score if max_score > 0 else 0

            # Normalize score to 0-1 range (assuming max reasonable score ~15)
            normalized_score = min(max_score / 15.0, 1.0)

            # Confidence is combination of absolute strength and relative margin
            confidence = round((normalized_score * 0.6 + margin * 0.4), 3)

            if confidence >= 0.7:
                reason = "high_confidence_match"
            elif confidence >= 0.4:
                reason = "moderate_confidence_match"
            else:
                reason = "low_confidence_match"

        # Get top signals for the winning provider
        matches = analysis["provider_matches"].get(winner, [])
        top_signals = sorted(matches, key=lambda x: x["weight"], reverse=True)[:5]

        # Special case: URL detected and Exa available → strong recommendation
        # NOTE(review): deliberate no-op placeholder — a URL never
        # overrides the scored winner; the URL still surfaces via
        # analysis_summary["has_url"].
        if analysis["detected_url"] and "exa" in available:
            if winner != "exa":
                # Override if URL is present but didn't win
                # (user might want similar search)
                pass  # Keep current winner but note it

        # Determine Exa search depth when routed to Exa
        exa_depth = "normal"
        if winner == "exa":
            deep_r_score = analysis.get("exa_deep_reasoning_score", 0)
            deep_score = analysis.get("exa_deep_score", 0)
            # Reasoning depth takes precedence over plain deep search.
            if deep_r_score >= 4.0:
                exa_depth = "deep-reasoning"
            elif deep_score >= 4.0:
                exa_depth = "deep"

        # Build detailed routing result
        threshold = self.auto_config.get("confidence_threshold", 0.3)

        return {
            "provider": winner,
            "confidence": confidence,
            "confidence_level": "high" if confidence >= 0.7 else "medium" if confidence >= 0.4 else "low",
            "reason": reason,
            "exa_depth": exa_depth,
            "scores": {p: round(s, 2) for p, s in available.items()},
            "winning_score": round(max_score, 2),
            "top_signals": [
                {"matched": s["matched"], "weight": s["weight"]}
                for s in top_signals
            ],
            "below_threshold": confidence < threshold,
            "analysis_summary": {
                "query_length": len(query.split()),
                "is_complex": analysis["complexity"]["is_complex"],
                "has_url": analysis["detected_url"] is not None,
                "recency_focused": analysis["recency_focused"],
            }
        }
|
|
|
|
|
|
def auto_route_provider(query: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Intelligently route query to the best provider.

    Thin convenience wrapper: builds a QueryAnalyzer from *config* and
    returns its detailed routing decision with confidence.
    """
    return QueryAnalyzer(config).route(query)
|
|
|
|
|
|
def explain_routing(query: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Provide detailed explanation of routing decision for debugging.

    Runs both analyze() and route() and flattens the interesting parts
    into one report: the decision, per-provider scores, signal counts,
    query stats, all matched signals, and which providers are usable.

    NOTE(review): route() calls analyze() internally, so the analysis
    runs twice per call — acceptable for a debug helper.
    """
    analyzer = QueryAnalyzer(config)
    analysis = analyzer.analyze(query)
    routing = analyzer.route(query)

    return {
        "query": query,
        "routing_decision": {
            "provider": routing["provider"],
            "confidence": routing["confidence"],
            "confidence_level": routing["confidence_level"],
            "reason": routing["reason"],
            "exa_depth": routing.get("exa_depth", "normal"),
        },
        "scores": routing["scores"],
        "top_signals": routing["top_signals"],
        # Signal counts keyed by the provider each intent maps to.
        "intent_breakdown": {
            "shopping_signals": len(analysis["provider_matches"]["serper"]),
            "research_signals": len(analysis["provider_matches"]["tavily"]),
            "querit_signals": len(analysis["provider_matches"]["querit"]),
            "discovery_signals": len(analysis["provider_matches"]["exa"]),
            "rag_signals": len(analysis["provider_matches"]["you"]),
            "exa_deep_score": round(analysis.get("exa_deep_score", 0), 2),
            "exa_deep_reasoning_score": round(analysis.get("exa_deep_reasoning_score", 0), 2),
        },
        "query_analysis": {
            "word_count": analysis["complexity"]["word_count"],
            "is_complex": analysis["complexity"]["is_complex"],
            "complexity_score": round(analysis["complexity"]["complexity_score"], 2),
            "has_url": analysis["detected_url"],
            "recency_focused": analysis["recency_focused"],
        },
        # Only providers with at least one matched signal appear here.
        "all_matches": {
            provider: [
                {"matched": m["matched"], "weight": m["weight"]}
                for m in matches
            ]
            for provider, matches in analysis["provider_matches"].items()
            if matches
        },
        "available_providers": [
            p for p in ["serper", "tavily", "querit", "exa", "perplexity", "you", "searxng"]
            if get_api_key(p, config) and p not in config.get("auto_routing", {}).get("disabled_providers", [])
        ]
    }
|
|
|
|
|
|
|
|
|
|
class ProviderConfigError(Exception):
    """Raised when a provider is missing or has an invalid API key/config."""
|
|
|
|
|
|
class ProviderRequestError(Exception):
    """Provider request failure carrying retry/cooldown metadata.

    Attributes:
        status_code: HTTP status of the failed call, when known.
        transient: True when the failure is considered retryable
            (rate limit, timeout, temporary outage).
    """

    def __init__(self, message: str, status_code: Optional[int] = None, transient: bool = False):
        super().__init__(message)
        self.transient = transient
        self.status_code = status_code
|
|
|
|
|
# HTTP statuses treated as transient: failures with these codes are
# flagged retryable rather than permanent.
TRANSIENT_HTTP_CODES = {429, 503}

# Escalating per-provider cooldown applied on consecutive failures.
COOLDOWN_STEPS_SECONDS = [60, 300, 1500, 3600]  # 1m -> 5m -> 25m -> 1h cap

# Sleep durations between in-process retry attempts.
RETRY_BACKOFF_SECONDS = [1, 3, 9]
|
|
|
|
|
|
def _ensure_parent(path: Path) -> None:
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def _load_provider_health() -> Dict[str, Any]:
    """Read provider health state from disk; return {} on any problem.

    Best-effort: a missing, unreadable, or malformed health file — or
    one whose top-level value is not a JSON object — yields an empty
    state rather than raising.
    """
    if not PROVIDER_HEALTH_FILE.exists():
        return {}
    try:
        raw = PROVIDER_HEALTH_FILE.read_text(encoding="utf-8")
        loaded = json.loads(raw)
    except (json.JSONDecodeError, IOError):
        return {}
    return loaded if isinstance(loaded, dict) else {}
|
|
|
|
|
|
def _save_provider_health(state: Dict[str, Any]) -> None:
    """Persist provider health *state* as pretty-printed UTF-8 JSON."""
    _ensure_parent(PROVIDER_HEALTH_FILE)
    payload = json.dumps(state, ensure_ascii=False, indent=2)
    with open(PROVIDER_HEALTH_FILE, "w", encoding="utf-8") as fh:
        fh.write(payload)
|
|
|
|
|
|
def provider_in_cooldown(provider: str) -> Tuple[bool, int]:
    """Check whether *provider* is currently cooling down after failures.

    Returns (in_cooldown, seconds_remaining); seconds_remaining is 0
    when the cooldown has expired or was never set.
    """
    health = _load_provider_health().get(provider, {})
    until = int(health.get("cooldown_until", 0) or 0)
    seconds_left = until - int(time.time())
    if seconds_left > 0:
        return True, seconds_left
    return False, 0
|
|
|
|
|
|
def mark_provider_failure(provider: str, error_message: str) -> Dict[str, Any]:
    """Record a failure for *provider* and start/extend its cooldown.

    The cooldown escalates along COOLDOWN_STEPS_SECONDS with each
    consecutive failure and is capped at the final step. Persists the
    updated state and returns the provider's new health record.
    """
    state = _load_provider_health()
    now = int(time.time())

    failures = int(state.get(provider, {}).get("failure_count", 0)) + 1
    step_index = min(failures - 1, len(COOLDOWN_STEPS_SECONDS) - 1)
    cooldown = COOLDOWN_STEPS_SECONDS[step_index]

    record = {
        "failure_count": failures,
        "cooldown_until": now + cooldown,
        "cooldown_seconds": cooldown,
        "last_error": error_message,
        "last_failure_at": now,
    }
    state[provider] = record
    _save_provider_health(state)
    return record
|
|
|
|
|
|
def reset_provider_health(provider: str) -> None:
    """Clear any recorded failures/cooldown for *provider*.

    Persists only when an entry actually existed, avoiding a redundant
    disk write.
    """
    state = _load_provider_health()
    if provider in state:
        del state[provider]
        _save_provider_health(state)
|
|
|
|
|
|
def _title_from_url(url: str) -> str:
|
|
"""Derive a readable title from a URL when none is provided."""
|
|
try:
|
|
parsed = urlparse(url)
|
|
domain = parsed.netloc.replace("www.", "")
|
|
# Use last meaningful path segment as context
|
|
segments = [s for s in parsed.path.strip("/").split("/") if s]
|
|
if segments:
|
|
last = segments[-1].replace("-", " ").replace("_", " ")
|
|
# Strip file extensions
|
|
last = re.sub(r'\.\w{2,4}$', '', last)
|
|
if last:
|
|
return f"{domain} — {last[:80]}"
|
|
return domain
|
|
except Exception:
|
|
return url[:60]
|
|
|
|
|
|
def normalize_result_url(url: str) -> str:
    """Normalize a URL to "host/path" form for cross-provider dedup.

    Lowercases the host, drops a leading "www.", discards the scheme,
    query, and fragment, and strips any trailing slash from the path.
    Empty/falsy input yields "".
    """
    if not url:
        return ""
    parts = urlparse(url.strip())
    host = (parts.netloc or "").lower()
    host = host[4:] if host.startswith("www.") else host
    trimmed_path = parts.path.rstrip("/")
    return f"{host}{trimmed_path}"
|
|
|
|
|
|
def deduplicate_results_across_providers(results_by_provider: List[Tuple[str, Dict[str, Any]]], max_results: int) -> Tuple[List[Dict[str, Any]], int]:
    """Merge per-provider result lists, dropping duplicate URLs.

    Providers are consumed in order, so earlier providers win ties.
    Results whose normalized URL is empty bypass dedup and are kept.
    Each kept result is a shallow copy tagged with its source provider
    (unless it already carries one). Returns (merged_results,
    duplicates_dropped); merging stops once *max_results* are collected.
    """
    merged: List[Dict[str, Any]] = []
    seen_urls = set()
    duplicates = 0

    for source_name, payload in results_by_provider:
        for entry in payload.get("results", []):
            key = normalize_result_url(entry.get("url", ""))
            if key:
                if key in seen_urls:
                    duplicates += 1
                    continue
                seen_urls.add(key)
            # Copy so we can tag provenance without mutating the input.
            tagged = entry.copy()
            tagged.setdefault("provider", source_name)
            merged.append(tagged)
            if len(merged) >= max_results:
                return merged, duplicates
    return merged, duplicates
|
|
|
|
# =============================================================================
|
|
# HTTP Client
|
|
# =============================================================================
|
|
|
|
def make_request(url: str, headers: dict, body: dict, timeout: int = 30) -> dict:
    """Make HTTP POST request and return JSON response.

    Sends *body* as JSON and decodes the response as JSON. All failure
    modes are converted into ProviderRequestError with a friendly
    message and a ``transient`` flag for retry/cooldown logic.

    NOTE(review): mutates the caller's *headers* dict when injecting the
    default User-Agent.

    Raises:
        ProviderRequestError: on HTTP errors, network/URL errors,
            interrupted reads, or timeouts.
    """
    # Ensure User-Agent is set (required by some APIs like Exa/Cloudflare)
    if "User-Agent" not in headers:
        headers["User-Agent"] = "ClawdBot-WebSearchPlus/2.1"
    data = json.dumps(body).encode("utf-8")
    req = Request(url, data=data, headers=headers, method="POST")

    try:
        with urlopen(req, timeout=timeout) as response:
            return json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        # Try to surface the provider's own error message if JSON.
        error_body = e.read().decode("utf-8") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
            error_detail = error_json.get("error") or error_json.get("message") or error_body
        except json.JSONDecodeError:
            error_detail = error_body[:500]

        # Friendly phrasing for the common status codes.
        error_messages = {
            401: "Invalid or expired API key. Please check your credentials.",
            403: "Access forbidden. Your API key may not have permission for this operation.",
            429: "Rate limit exceeded. Please wait a moment and try again.",
            500: "Server error. The search provider is experiencing issues.",
            503: "Service unavailable. The search provider may be down."
        }

        friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
        # 429/503 are retryable per TRANSIENT_HTTP_CODES.
        raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        # Heuristic: only timeout-flavored network errors are retryable.
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(f"Network error: {reason}. Check your internet connection.", transient=is_timeout)
    except IncompleteRead as e:
        # Partial body received before the connection dropped.
        partial_len = len(getattr(e, "partial", b"") or b"")
        raise ProviderRequestError(
            f"Connection interrupted while reading response ({partial_len} bytes received). Please retry.",
            transient=True,
        )
    except TimeoutError:
        raise ProviderRequestError(f"Request timed out after {timeout}s. Try again or reduce max_results.", transient=True)
|
|
|
|
|
|
# =============================================================================
|
|
# Serper (Google Search API)
|
|
# =============================================================================
|
|
|
|
def search_serper(
    query: str,
    api_key: str,
    max_results: int = 5,
    country: str = "us",
    language: str = "en",
    search_type: str = "search",
    time_range: Optional[str] = None,
    include_images: bool = False,
) -> dict:
    """Search using Serper (Google Search API).

    Args:
        query: Search query string.
        api_key: Serper API key, sent via the X-API-KEY header.
        max_results: Cap on organic results returned.
        country: Google "gl" country code.
        language: Google "hl" language code.
        search_type: Serper endpoint suffix (e.g. "search", "news").
        time_range: hour/day/week/month/year → Google "tbs" freshness
            filter; None or "none" disables it.
        include_images: When True, issues a second best-effort request
            to the images endpoint.

    Returns:
        Normalized dict: provider, query, results, images, answer, plus
        Serper's knowledge graph and related searches.
    """
    endpoint = f"https://google.serper.dev/{search_type}"

    body = {
        "q": query,
        "gl": country,
        "hl": language,
        "num": max_results,
        "autocorrect": True,
    }

    # Map generic time ranges onto Google's "tbs" query parameter.
    if time_range and time_range != "none":
        tbs_map = {
            "hour": "qdr:h",
            "day": "qdr:d",
            "week": "qdr:w",
            "month": "qdr:m",
            "year": "qdr:y",
        }
        if time_range in tbs_map:
            body["tbs"] = tbs_map[time_range]

    headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json",
    }

    data = make_request(endpoint, headers, body)

    results = []
    for i, item in enumerate(data.get("organic", [])[:max_results]):
        results.append({
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
            # Serper returns no relevance score; synthesize one from rank.
            "score": round(1.0 - i * 0.1, 2),
            "date": item.get("date"),
        })

    # Answer preference: answer box > answer-box snippet > knowledge
    # graph description > first organic snippet.
    answer = ""
    if data.get("answerBox", {}).get("answer"):
        answer = data["answerBox"]["answer"]
    elif data.get("answerBox", {}).get("snippet"):
        answer = data["answerBox"]["snippet"]
    elif data.get("knowledgeGraph", {}).get("description"):
        answer = data["knowledgeGraph"]["description"]
    elif results:
        answer = results[0]["snippet"]

    # Optional second call for images; failures are deliberately ignored
    # so image problems never break the main search.
    images = []
    if include_images:
        try:
            img_data = make_request(
                "https://google.serper.dev/images",
                headers,
                {"q": query, "gl": country, "hl": language, "num": 5},
            )
            images = [img.get("imageUrl", "") for img in img_data.get("images", [])[:5] if img.get("imageUrl")]
        except Exception:
            pass

    return {
        "provider": "serper",
        "query": query,
        "results": results,
        "images": images,
        "answer": answer,
        "knowledge_graph": data.get("knowledgeGraph"),
        "related_searches": [r.get("query") for r in data.get("relatedSearches", [])]
    }
|
|
|
|
|
|
# =============================================================================
|
|
# Tavily (Research Search)
|
|
# =============================================================================
|
|
|
|
def search_tavily(
    query: str,
    api_key: str,
    max_results: int = 5,
    depth: str = "basic",
    topic: str = "general",
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    include_images: bool = False,
    include_raw_content: bool = False,
) -> dict:
    """Search using Tavily (AI Research Search).

    Posts to Tavily's /search endpoint (API key travels in the JSON
    body) and normalizes the response into the common
    {provider, query, results, images, answer} shape. Raw page content
    is attached per-result only when requested and present.
    """
    payload: Dict[str, Any] = {
        "api_key": api_key,
        "query": query,
        "max_results": max_results,
        "search_depth": depth,
        "topic": topic,
        "include_images": include_images,
        "include_answer": True,
        "include_raw_content": include_raw_content,
    }

    # Optional domain filters are only sent when provided.
    if include_domains:
        payload["include_domains"] = include_domains
    if exclude_domains:
        payload["exclude_domains"] = exclude_domains

    response = make_request(
        "https://api.tavily.com/search",
        {"Content-Type": "application/json"},
        payload,
    )

    normalized: List[Dict[str, Any]] = []
    for hit in response.get("results", [])[:max_results]:
        entry: Dict[str, Any] = {
            "title": hit.get("title", ""),
            "url": hit.get("url", ""),
            "snippet": hit.get("content", ""),
            "score": round(hit.get("score", 0.0), 3),
        }
        raw = hit.get("raw_content")
        if include_raw_content and raw:
            entry["raw_content"] = raw
        normalized.append(entry)

    return {
        "provider": "tavily",
        "query": query,
        "results": normalized,
        "images": response.get("images", []),
        "answer": response.get("answer", ""),
    }
|
|
|
|
|
|
# =============================================================================
|
|
# Querit (Multi-lingual search API for AI, with rich metadata and real-time information)
|
|
# =============================================================================
|
|
|
|
def _map_querit_time_range(time_range: Optional[str]) -> Optional[str]:
|
|
"""Map generic time ranges to Querit's compact date filter format."""
|
|
if not time_range:
|
|
return None
|
|
return {
|
|
"day": "d1",
|
|
"week": "w1",
|
|
"month": "m1",
|
|
"year": "y1",
|
|
}.get(time_range, time_range)
|
|
|
|
|
|
def search_querit(
    query: str,
    api_key: str,
    max_results: int = 5,
    language: str = "en",
    country: str = "us",
    time_range: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    base_url: str = "https://api.querit.ai",
    base_path: str = "/v1/search",
    timeout: int = 30,
) -> dict:
    """Search using Querit.

    Mirrors the Querit Python SDK payload shape:
    - query
    - count
    - optional filters: languages, geo, sites, timeRange

    Auth is a Bearer token. Querit reports failures inside a 200
    response via error_code/error_msg, which are converted into
    ProviderRequestError. Querit returns no relevance scores, so a
    synthetic rank-based score is attached to each result.

    Raises:
        ProviderRequestError: on transport errors or Querit-level
            error_code/error_msg responses.
    """
    endpoint = base_url.rstrip("/") + base_path

    # Build the optional filters object piecemeal; only send it if any
    # filter is actually set.
    filters: Dict[str, Any] = {}
    if language:
        filters["languages"] = {"include": [language.lower()]}
    if country:
        filters["geo"] = {"countries": {"include": [country.upper()]}}
    if include_domains or exclude_domains:
        sites: Dict[str, List[str]] = {}
        if include_domains:
            sites["include"] = include_domains
        if exclude_domains:
            sites["exclude"] = exclude_domains
        filters["sites"] = sites

    querit_time_range = _map_querit_time_range(time_range)
    if querit_time_range:
        filters["timeRange"] = {"date": querit_time_range}

    body: Dict[str, Any] = {
        "query": query,
        "count": max_results,
    }
    if filters:
        body["filters"] = filters

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    data = make_request(endpoint, headers, body, timeout=timeout)

    # Querit signals application-level failure inside the JSON payload.
    error_code = data.get("error_code")
    error_msg = data.get("error_msg")
    if error_msg or (error_code not in (None, 0, 200)):
        message = error_msg or f"Querit request failed with error_code={error_code}"
        raise ProviderRequestError(message)

    # Response nests the hits under results.result.
    raw_results = ((data.get("results") or {}).get("result")) or []
    results = []
    for i, item in enumerate(raw_results[:max_results]):
        # Fall back to page_age when no snippet is provided.
        snippet = item.get("snippet") or item.get("page_age") or ""
        result = {
            "title": item.get("title") or _title_from_url(item.get("url", "")),
            "url": item.get("url", ""),
            "snippet": snippet,
            # Synthetic rank-based score (Querit provides none).
            "score": round(1.0 - i * 0.05, 3),
        }
        if item.get("page_time") is not None:
            result["page_time"] = item["page_time"]
        if item.get("page_age"):
            result["date"] = item["page_age"]
        if item.get("language") is not None:
            result["language"] = item["language"]
        results.append(result)

    # No synthesized answer from Querit; surface the top snippet.
    answer = results[0]["snippet"] if results else ""

    return {
        "provider": "querit",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "metadata": {
            "search_id": data.get("search_id"),
            "time_range": querit_time_range,
        }
    }
|
|
|
|
|
|
# =============================================================================
|
|
# Exa (Neural/Semantic/Deep Search)
|
|
# =============================================================================
|
|
|
|
def search_exa(
    query: str,
    api_key: str,
    max_results: int = 5,
    search_type: str = "neural",
    exa_depth: str = "normal",
    category: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    similar_url: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    text_verbosity: str = "standard",
) -> dict:
    """Search using Exa (Neural/Semantic/Deep Search).

    exa_depth controls synthesis level:
    - "normal": standard search (neural/fast/auto/keyword/instant)
    - "deep": multi-source synthesis with grounding (4-12s, $12/1k)
    - "deep-reasoning": cross-reference reasoning with grounding (12-50s, $15/1k)

    When *similar_url* is set, the findSimilar endpoint is used instead
    (deep modes do not apply there). Deep responses carry a synthesized
    answer plus grounding citations; the synthesis is returned as the
    first pseudo-result ("type": "synthesis") followed by the source
    documents ("type": "source").
    """
    is_deep = exa_depth in ("deep", "deep-reasoning")

    if similar_url:
        # findSimilar does not support deep search types
        endpoint = "https://api.exa.ai/findSimilar"
        body: Dict[str, Any] = {
            "url": similar_url,
            "numResults": max_results,
            "contents": {
                "text": {"maxCharacters": 2000, "verbosity": text_verbosity},
                "highlights": {"numSentences": 3, "highlightsPerUrl": 2},
            },
        }
    elif is_deep:
        # Deep modes request longer, full-verbosity text for synthesis.
        endpoint = "https://api.exa.ai/search"
        body = {
            "query": query,
            "numResults": max_results,
            "type": exa_depth,
            "contents": {
                "text": {"maxCharacters": 5000, "verbosity": "full"},
            },
        }
    else:
        endpoint = "https://api.exa.ai/search"
        body = {
            "query": query,
            "numResults": max_results,
            "type": search_type,
            "contents": {
                "text": {"maxCharacters": 2000, "verbosity": text_verbosity},
                "highlights": {"numSentences": 3, "highlightsPerUrl": 2},
            },
        }

    # Optional filters shared by all three request shapes.
    if category:
        body["category"] = category
    if start_date:
        body["startPublishedDate"] = start_date
    if end_date:
        body["endPublishedDate"] = end_date
    if include_domains:
        body["includeDomains"] = include_domains
    if exclude_domains:
        body["excludeDomains"] = exclude_domains

    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json",
    }

    # Deep synthesis can take up to ~50s; allow a longer timeout.
    timeout = 55 if is_deep else 30
    data = make_request(endpoint, headers, body, timeout=timeout)

    results = []

    # Deep search: primary content in output field with grounding citations
    if is_deep:
        # NOTE(review): assumes "output" is absent or a dict — a JSON
        # null here would raise on .get(); confirm against the Exa API.
        deep_output = data.get("output", {})
        synthesized_text = ""
        grounding_citations: List[Dict[str, Any]] = []

        # Synthesis may arrive as a string or a structured object.
        if isinstance(deep_output.get("content"), str):
            synthesized_text = deep_output["content"]
        elif isinstance(deep_output.get("content"), dict):
            synthesized_text = json.dumps(deep_output["content"], ensure_ascii=False)

        # Flatten per-field grounding into a single citation list.
        for field_citation in deep_output.get("grounding", []):
            for cite in field_citation.get("citations", []):
                grounding_citations.append({
                    "url": cite.get("url", ""),
                    "title": cite.get("title", ""),
                    "confidence": field_citation.get("confidence", ""),
                    "field": field_citation.get("field", ""),
                })

        # Primary synthesized result
        if synthesized_text:
            results.append({
                "title": f"Exa {exa_depth.replace('-', ' ').title()} Synthesis",
                "url": "",
                "snippet": synthesized_text,
                "full_synthesis": synthesized_text,
                "score": 1.0,
                "grounding": grounding_citations[:10],
                "type": "synthesis",
            })

        # Supporting source documents
        for item in data.get("results", [])[:max_results]:
            text_content = item.get("text", "") or ""
            highlights = item.get("highlights", [])
            snippet = text_content[:800] if text_content else (highlights[0] if highlights else "")
            results.append({
                "title": item.get("title", ""),
                "url": item.get("url", ""),
                "snippet": snippet,
                "score": round(item.get("score", 0.0), 3),
                "published_date": item.get("publishedDate"),
                "author": item.get("author"),
                "type": "source",
            })

        # Prefer the synthesis as the answer; otherwise the first source
        # snippet (index 1 because index 0 would be the synthesis).
        answer = synthesized_text if synthesized_text else (results[1]["snippet"] if len(results) > 1 else "")

        return {
            "provider": "exa",
            "query": query,
            "exa_depth": exa_depth,
            "results": results,
            "images": [],
            "answer": answer,
            "grounding": grounding_citations,
            "metadata": {
                "synthesis_length": len(synthesized_text),
                "source_count": len(data.get("results", [])),
            },
        }

    # Standard search result parsing
    for item in data.get("results", [])[:max_results]:
        text_content = item.get("text", "") or ""
        highlights = item.get("highlights", [])
        # Prefer full text (truncated) over joined highlights.
        if text_content:
            snippet = text_content[:800]
        elif highlights:
            snippet = " ... ".join(highlights[:2])
        else:
            snippet = ""

        results.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": snippet,
            "score": round(item.get("score", 0.0), 3),
            "published_date": item.get("publishedDate"),
            "author": item.get("author"),
        })

    answer = results[0]["snippet"] if results else ""

    return {
        "provider": "exa",
        "query": query if not similar_url else f"Similar to: {similar_url}",
        "results": results,
        "images": [],
        "answer": answer,
    }
|
|
|
|
|
|
# =============================================================================
|
|
# Perplexity via Kilo Gateway (Synthesized Direct Answers)
|
|
# =============================================================================
|
|
|
|
def search_perplexity(
    query: str,
    api_key: str,
    max_results: int = 5,
    model: str = "perplexity/sonar-pro",
    api_url: str = "https://api.kilo.ai/api/gateway/chat/completions",
    freshness: Optional[str] = None,
) -> dict:
    """Search/answer using Perplexity Sonar Pro via Kilo Gateway.

    Issues a chat-completion request and normalizes the synthesized answer
    plus its citations into the unified provider result shape.

    Args:
        query: Search query
        api_key: Kilo Gateway API key
        max_results: Maximum results to return (synthesized answer + cited sources)
        model: Perplexity model to use
        api_url: Kilo Gateway endpoint
        freshness: Filter by recency — 'day', 'week', 'month', 'year' (maps to
            Perplexity's search_recency_filter parameter)

    Returns:
        Dict with provider/query/results/images/answer/metadata keys.
    """
    # Map generic freshness values (including Serper-style pd/pw/pm/py aliases)
    # to Perplexity's search_recency_filter vocabulary.
    recency_map = {"day": "day", "pd": "day", "week": "week", "pw": "week", "month": "month", "pm": "month", "year": "year", "py": "year"}
    recency_filter = recency_map.get(freshness or "", None)

    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": "Answer with concise factual summary and include source URLs."},
            {"role": "user", "content": query},
        ],
        "temperature": 0.2,  # low temperature: favor factual, stable answers
    }
    if recency_filter:
        body["search_recency_filter"] = recency_filter

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    data = make_request(api_url, headers, body)
    choices = data.get("choices", [])
    message = choices[0].get("message", {}) if choices else {}
    answer = (message.get("content") or "").strip()

    # Prefer the structured citations array from Perplexity API response
    api_citations = data.get("citations", [])

    # Fallback: extract URLs from answer text if API doesn't provide citations.
    # dict.fromkeys deduplicates while preserving first-seen order.
    if not api_citations:
        api_citations = list(dict.fromkeys(re.findall(r"https?://[^\s)\]}>\"']+", answer)))

    results = []

    # Primary result: the synthesized answer itself
    if answer:
        # Clean citation markers [1][2] for the snippet
        clean_answer = re.sub(r'\[\d+\]', '', answer).strip()
        results.append({
            "title": f"Perplexity Answer: {query[:80]}",
            "url": "https://www.perplexity.ai",
            "snippet": clean_answer[:500],
            "score": 1.0,
        })

    # Source results from citations. Fill only the *remaining* slots so that
    # max_results is honored whether or not the answer result was added.
    # (Previously a slot was always reserved via [:max_results - 1], so an
    # empty answer silently produced one fewer result than requested.)
    remaining_slots = max(0, max_results - len(results))
    for i, citation in enumerate(api_citations[:remaining_slots]):
        # citations can be plain URL strings or dicts with url/title
        if isinstance(citation, str):
            url = citation
            title = _title_from_url(url)
        else:
            url = citation.get("url", "")
            title = citation.get("title") or _title_from_url(url)
        results.append({
            "title": title,
            "url": url,
            "snippet": f"Source cited in Perplexity answer [citation {i+1}]",
            "score": round(0.9 - i * 0.1, 3),  # descending relevance by citation order
        })

    return {
        "provider": "perplexity",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "metadata": {
            "model": model,
            "usage": data.get("usage", {}),
        }
    }
|
|
|
|
|
|
|
|
# =============================================================================
|
|
# You.com (LLM-Ready Web & News Search)
|
|
# =============================================================================
|
|
|
|
def search_you(
    query: str,
    api_key: str,
    max_results: int = 5,
    country: str = "US",
    language: str = "en",
    freshness: Optional[str] = None,
    safesearch: str = "moderate",
    include_news: bool = True,
    livecrawl: Optional[str] = None,
) -> dict:
    """Search using You.com (LLM-Ready Web & News Search).

    You.com excels at:
    - RAG applications with pre-extracted snippets
    - Combined web + news results in one call
    - Real-time information with automatic news classification
    - Clean, structured JSON optimized for AI consumption

    Args:
        query: Search query
        api_key: You.com API key
        max_results: Maximum results to return (default 5, max 100)
        country: ISO 3166-2 country code (e.g., US, GB, DE)
        language: BCP 47 language code (e.g., en, de, fr)
        freshness: Filter by recency: day, week, month, year, or YYYY-MM-DDtoYYYY-MM-DD
        safesearch: Content filter: off, moderate (default), strict
        include_news: Include news results when relevant (default True)
        livecrawl: Fetch full page content: "web", "news", or "all"

    Returns:
        Unified result dict with provider/query/results/news/images/answer/metadata.

    Raises:
        ProviderRequestError: On HTTP errors, network failures, or timeouts.
    """
    endpoint = "https://ydc-index.io/v1/search"

    # Build query parameters
    params = {
        "query": query,
        "count": max_results,
        "safesearch": safesearch,
    }

    if country:
        params["country"] = country.upper()
    if language:
        # NOTE(review): BCP 47 codes are conventionally lowercase ("en", "de");
        # upper-casing is kept for behavioral parity — confirm against the
        # You.com API before changing.
        params["language"] = language.upper()
    if freshness:
        params["freshness"] = freshness
    if livecrawl:
        params["livecrawl"] = livecrawl
        params["livecrawl_formats"] = "markdown"

    # Build URL with query params (URL-encode values)
    query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
    url = f"{endpoint}?{query_string}"

    headers = {
        "X-API-KEY": api_key,
        "Accept": "application/json",
        "User-Agent": "ClawdBot-WebSearchPlus/2.4",
    }

    # Make GET request (You.com uses GET, not POST).
    # Request/urlopen come from the module-level urllib imports; the previous
    # redundant function-local import has been removed.
    req = Request(url, headers=headers, method="GET")

    try:
        with urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        error_body = e.read().decode("utf-8") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
            error_detail = error_json.get("error") or error_json.get("message") or error_body
        except json.JSONDecodeError:
            error_detail = error_body[:500]

        error_messages = {
            401: "Invalid or expired API key. Get one at https://api.you.com",
            403: "Access forbidden. Check your API key permissions.",
            429: "Rate limit exceeded. Please wait and try again.",
            500: "You.com server error. Try again later.",
            503: "You.com service unavailable."
        }
        friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
        raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(f"Network error: {reason}. Check your internet connection.", transient=is_timeout)
    except TimeoutError:
        raise ProviderRequestError("You.com request timed out after 30s.", transient=True)

    # Parse results
    results_data = data.get("results", {})
    web_results = results_data.get("web", [])
    news_results = results_data.get("news", []) if include_news else []
    metadata = data.get("metadata", {})

    # Normalize web results
    results = []
    for i, item in enumerate(web_results[:max_results]):
        snippets = item.get("snippets", [])
        snippet = snippets[0] if snippets else item.get("description", "")

        result = {
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": snippet,
            "score": round(1.0 - i * 0.05, 3),  # Assign descending score
            "date": item.get("page_age"),
            "source": "web",
        }

        # Include additional snippets if available (great for RAG)
        if len(snippets) > 1:
            result["additional_snippets"] = snippets[1:3]

        # Include thumbnail and favicon for UI display
        if item.get("thumbnail_url"):
            result["thumbnail"] = item["thumbnail_url"]
        if item.get("favicon_url"):
            result["favicon"] = item["favicon_url"]

        # Include live-crawled content if available
        if item.get("contents"):
            result["raw_content"] = item["contents"].get("markdown") or item["contents"].get("html", "")

        results.append(result)

    # Add news results (if any)
    news = []
    for item in news_results[:5]:
        news.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("description", ""),
            "date": item.get("page_age"),
            "thumbnail": item.get("thumbnail_url"),
            "source": "news",
        })

    # Build answer from best snippets
    answer = ""
    if results:
        # Combine top snippets for LLM context
        top_snippets = []
        for r in results[:3]:
            if r.get("snippet"):
                top_snippets.append(r["snippet"])
        answer = " ".join(top_snippets)[:1000]

    return {
        "provider": "you",
        "query": query,
        "results": results,
        "news": news,
        "images": [],
        "answer": answer,
        "metadata": {
            "search_uuid": metadata.get("search_uuid"),
            "latency": metadata.get("latency"),
        }
    }
|
|
|
|
|
|
# =============================================================================
|
|
# SearXNG (Privacy-First Meta-Search)
|
|
# =============================================================================
|
|
|
|
def search_searxng(
    query: str,
    instance_url: str,
    max_results: int = 5,
    categories: Optional[List[str]] = None,
    engines: Optional[List[str]] = None,
    language: str = "en",
    time_range: Optional[str] = None,
    safesearch: int = 0,
) -> dict:
    """Search using SearXNG (self-hosted privacy-first meta-search).

    SearXNG excels at:
    - Privacy-preserving search (no tracking, no profiling)
    - Multi-source aggregation (70+ upstream engines)
    - $0 API cost (self-hosted)
    - Diverse perspectives from multiple search engines

    Args:
        query: Search query
        instance_url: URL of your SearXNG instance (required)
        max_results: Maximum results to return (default 5)
        categories: Search categories (general, images, news, videos, etc.)
        engines: Specific engines to use (google, bing, duckduckgo, etc.)
        language: Language code (e.g., en, de, fr)
        time_range: Filter by recency: day, week, month, year
        safesearch: Content filter: 0=off, 1=moderate, 2=strict

    Returns:
        Unified result dict with provider/query/results/images/answer plus
        SearXNG-specific suggestions/corrections and instance metadata.

    Raises:
        ProviderRequestError: On HTTP errors, network failures, or timeouts.

    Note:
        Requires a self-hosted SearXNG instance with JSON format enabled.
        See: https://docs.searxng.org/admin/installation.html
    """
    # Build URL with query parameters
    params = {
        "q": query,
        "format": "json",
        "language": language,
        "safesearch": str(safesearch),
    }

    if categories:
        params["categories"] = ",".join(categories)
    if engines:
        params["engines"] = ",".join(engines)
    if time_range:
        params["time_range"] = time_range

    # Build URL — instance_url comes from operator-controlled config/env only
    # (validated by _validate_searxng_url), not from agent/LLM input
    base_url = instance_url.rstrip("/")
    query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
    url = f"{base_url}/search?{query_string}"

    headers = {
        "User-Agent": "ClawdBot-WebSearchPlus/2.5",
        "Accept": "application/json",
    }

    # Make GET request
    req = Request(url, headers=headers, method="GET")

    try:
        with urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        error_body = e.read().decode("utf-8") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
            error_detail = error_json.get("error") or error_json.get("message") or error_body
        except json.JSONDecodeError:
            error_detail = error_body[:500]

        error_messages = {
            403: "JSON API disabled on this SearXNG instance. Enable 'json' in search.formats in settings.yml",
            404: "SearXNG instance not found. Check your instance URL.",
            500: "SearXNG server error. Check instance health.",
            503: "SearXNG service unavailable."
        }
        friendly_msg = error_messages.get(e.code, f"SearXNG error: {error_detail}")
        raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(f"Cannot reach SearXNG instance at {instance_url}. Error: {reason}", transient=is_timeout)
    except TimeoutError:
        # Plain string (previously an f-string with no placeholders — lint F541)
        raise ProviderRequestError("SearXNG request timed out after 30s. Check instance health.", transient=True)

    # Parse results
    raw_results = data.get("results", [])

    # Normalize results to unified format
    results = []
    engines_used = set()
    for i, item in enumerate(raw_results[:max_results]):
        engine = item.get("engine", "unknown")
        engines_used.add(engine)

        results.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("content", ""),
            # Fall back to a descending positional score when the instance
            # doesn't provide one.
            "score": round(item.get("score", 1.0 - i * 0.05), 3),
            "engine": engine,
            "category": item.get("category", "general"),
            "date": item.get("publishedDate"),
        })

    # Build answer from answers, infoboxes, or first result
    answer = ""
    if data.get("answers"):
        answer = data["answers"][0] if isinstance(data["answers"][0], str) else str(data["answers"][0])
    elif data.get("infoboxes"):
        infobox = data["infoboxes"][0]
        answer = infobox.get("content", "") or infobox.get("infobox", "")
    elif results:
        answer = results[0]["snippet"]

    return {
        "provider": "searxng",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "suggestions": data.get("suggestions", []),
        "corrections": data.get("corrections", []),
        "metadata": {
            "number_of_results": data.get("number_of_results"),
            "engines_used": list(engines_used),
            "instance_url": instance_url,
        }
    }
|
|
|
|
|
|
# =============================================================================
|
|
# CLI
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point: parse arguments, pick a provider, search, print JSON.

    Flow: load config -> build argparse CLI -> handle cache admin commands ->
    validate query -> choose provider (explicit or auto-routed) -> build
    fallback chain (priority order, skipping cooldowns) -> serve from cache
    when possible -> execute with per-provider retry and cross-provider
    fallback -> merge/dedup multi-provider results -> print JSON to stdout
    (errors to stderr, exit code 1 when every provider fails).
    """
    config = load_config()

    parser = argparse.ArgumentParser(
        description="Web Search Plus — Intelligent multi-provider search with smart auto-routing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Intelligent Auto-Routing:
The query is analyzed using multi-signal detection to find the optimal provider:

Shopping Intent → Serper (Google)
"how much", "price of", "buy", product+brand combos, deals, specs

Research Intent → Tavily
"how does", "explain", "what is", analysis, pros/cons, tutorials

Multilingual + Real-Time AI Search → Querit
multilingual search, metadata-rich results, current information for AI workflows

Discovery Intent → Exa (Neural)
"similar to", "companies like", "alternatives", URLs, startups, papers

Direct Answer Intent → Perplexity (via Kilo Gateway)
"what is", "current status", local events, synthesized up-to-date answers

Examples:
python3 search.py -q "iPhone 16 Pro Max price"          # → Serper (shopping)
python3 search.py -q "how does HTTPS encryption work"    # → Tavily (research)
python3 search.py -q "startups similar to Notion"        # → Exa (discovery)
python3 search.py --explain-routing -q "your query"      # Debug routing

Full docs: See README.md and SKILL.md
""",
    )

    # Common arguments
    parser.add_argument(
        "--provider", "-p",
        choices=["serper", "tavily", "querit", "exa", "perplexity", "you", "searxng", "auto"],
        help="Search provider (auto=intelligent routing)"
    )
    parser.add_argument(
        "--query", "-q",
        help="Search query"
    )
    parser.add_argument(
        "--max-results", "-n",
        type=int,
        default=config.get("defaults", {}).get("max_results", 5),
        help="Maximum results (default: 5)"
    )
    parser.add_argument(
        "--images",
        action="store_true",
        help="Include images (Serper/Tavily)"
    )

    # Auto-routing options
    parser.add_argument(
        "--auto", "-a",
        action="store_true",
        help="Use intelligent auto-routing (default when no provider specified)"
    )
    parser.add_argument(
        "--explain-routing",
        action="store_true",
        help="Show detailed routing analysis (debug mode)"
    )

    # Serper-specific (country/language/time-range are reused by other providers)
    serper_config = config.get("serper", {})
    parser.add_argument("--country", default=serper_config.get("country", "us"))
    parser.add_argument("--language", default=serper_config.get("language", "en"))
    parser.add_argument(
        "--type",
        dest="search_type",
        default=serper_config.get("type", "search"),
        choices=["search", "news", "images", "videos", "places", "shopping"]
    )
    parser.add_argument(
        "--time-range",
        choices=["hour", "day", "week", "month", "year"]
    )

    # Tavily-specific
    tavily_config = config.get("tavily", {})
    parser.add_argument(
        "--depth",
        default=tavily_config.get("depth", "basic"),
        choices=["basic", "advanced"]
    )
    parser.add_argument(
        "--topic",
        default=tavily_config.get("topic", "general"),
        choices=["general", "news"]
    )
    parser.add_argument("--raw-content", action="store_true")

    # Querit-specific
    querit_config = config.get("querit", {})
    parser.add_argument(
        "--querit-base-url",
        default=querit_config.get("base_url", "https://api.querit.ai"),
        help="Querit API base URL"
    )
    parser.add_argument(
        "--querit-base-path",
        default=querit_config.get("base_path", "/v1/search"),
        help="Querit API path"
    )

    # Exa-specific
    exa_config = config.get("exa", {})
    parser.add_argument(
        "--exa-type",
        default=exa_config.get("type", "neural"),
        choices=["neural", "fast", "auto", "keyword", "instant"],
        help="Exa search type (for standard search, ignored when --exa-depth is set)"
    )
    parser.add_argument(
        "--exa-depth",
        default=exa_config.get("depth", "normal"),
        choices=["normal", "deep", "deep-reasoning"],
        help="Exa search depth: deep (synthesized, 4-12s), deep-reasoning (cross-reference, 12-50s)"
    )
    parser.add_argument(
        "--exa-verbosity",
        default=exa_config.get("verbosity", "standard"),
        choices=["compact", "standard", "full"],
        help="Exa text verbosity for content extraction"
    )
    parser.add_argument(
        "--category",
        choices=[
            "company", "research paper", "news", "pdf", "github",
            "tweet", "personal site", "linkedin profile"
        ]
    )
    parser.add_argument("--start-date")
    parser.add_argument("--end-date")
    parser.add_argument("--similar-url")

    # You.com-specific
    you_config = config.get("you", {})
    parser.add_argument(
        "--you-safesearch",
        default=you_config.get("safesearch", "moderate"),
        choices=["off", "moderate", "strict"],
        help="You.com SafeSearch filter"
    )
    parser.add_argument(
        "--freshness",
        choices=["day", "week", "month", "year"],
        help="Filter results by recency (You.com/Serper)"
    )
    parser.add_argument(
        "--livecrawl",
        choices=["web", "news", "all"],
        help="You.com: fetch full page content"
    )
    parser.add_argument(
        "--no-news",
        action="store_true",
        help="You.com: exclude news results (included by default)"
    )

    # SearXNG-specific
    searxng_config = config.get("searxng", {})
    parser.add_argument(
        "--searxng-url",
        default=searxng_config.get("instance_url"),
        help="SearXNG instance URL (e.g., https://searx.example.com)"
    )
    parser.add_argument(
        "--searxng-safesearch",
        type=int,
        default=searxng_config.get("safesearch", 0),
        choices=[0, 1, 2],
        help="SearXNG SafeSearch: 0=off, 1=moderate, 2=strict"
    )
    parser.add_argument(
        "--engines",
        nargs="+",
        default=searxng_config.get("engines"),
        help="SearXNG: specific engines to use (e.g., google bing duckduckgo)"
    )
    parser.add_argument(
        "--categories",
        nargs="+",
        help="SearXNG: search categories (general, images, news, videos, etc.)"
    )

    # Domain filters
    parser.add_argument("--include-domains", nargs="+")
    parser.add_argument("--exclude-domains", nargs="+")

    # Output
    parser.add_argument("--compact", action="store_true")

    # Caching options
    parser.add_argument(
        "--cache-ttl",
        type=int,
        default=DEFAULT_CACHE_TTL,
        help=f"Cache TTL in seconds (default: {DEFAULT_CACHE_TTL} = 1 hour)"
    )
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Bypass cache (always fetch fresh results)"
    )
    parser.add_argument(
        "--clear-cache",
        action="store_true",
        help="Clear all cached results and exit"
    )
    parser.add_argument(
        "--cache-stats",
        action="store_true",
        help="Show cache statistics and exit"
    )

    args = parser.parse_args()

    # Handle cache management commands first (before query validation)
    if args.clear_cache:
        result = cache_clear()
        indent = None if args.compact else 2
        print(json.dumps(result, indent=indent, ensure_ascii=False))
        return

    if args.cache_stats:
        result = cache_stats()
        indent = None if args.compact else 2
        print(json.dumps(result, indent=indent, ensure_ascii=False))
        return

    if not args.query and not args.similar_url:
        parser.error("--query is required (unless using --similar-url with Exa)")

    # Handle --explain-routing: show the routing decision and exit without searching
    if args.explain_routing:
        if not args.query:
            parser.error("--query is required for --explain-routing")
        explanation = explain_routing(args.query, config)
        indent = None if args.compact else 2
        print(json.dumps(explanation, indent=indent, ensure_ascii=False))
        return

    # Determine provider: auto-route unless one was explicitly requested
    # (--similar-url with no provider implies Exa)
    if args.provider == "auto" or (args.provider is None and not args.similar_url):
        if args.query:
            routing = auto_route_provider(args.query, config)
            provider = routing["provider"]
            routing_info = {
                "auto_routed": True,
                "provider": provider,
                "confidence": routing["confidence"],
                "confidence_level": routing["confidence_level"],
                "reason": routing["reason"],
                "top_signals": routing["top_signals"],
                "scores": routing["scores"],
            }
        else:
            provider = "exa"
            routing_info = {
                "auto_routed": True,
                "provider": "exa",
                "confidence": 1.0,
                "confidence_level": "high",
                "reason": "similar_url_specified",
            }
    else:
        provider = args.provider or "serper"
        routing_info = {"auto_routed": False, "provider": provider}

    # Build provider fallback list
    auto_config = config.get("auto_routing", {})
    provider_priority = auto_config.get("provider_priority", ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"])
    disabled_providers = auto_config.get("disabled_providers", [])

    # Start with the selected provider, then try others in priority order
    # Only include providers that have a configured API key (except the primary,
    # which gets a clear error if unconfigured and no fallback succeeds)
    providers_to_try = [provider]
    for p in provider_priority:
        if p not in providers_to_try and p not in disabled_providers and get_api_key(p, config):
            providers_to_try.append(p)

    # Skip providers currently in cooldown
    eligible_providers = []
    cooldown_skips = []
    for p in providers_to_try:
        in_cd, remaining = provider_in_cooldown(p)
        if in_cd:
            cooldown_skips.append({"provider": p, "cooldown_remaining_seconds": remaining})
        else:
            eligible_providers.append(p)

    # If everything is cooling down, still attempt the primary provider rather
    # than failing with nothing to try.
    if not eligible_providers:
        eligible_providers = providers_to_try[:1]

    # Helper function to execute search for a provider.
    # Dispatches to the matching search_* function with the CLI arguments
    # that provider understands; validate_api_key raises if unconfigured.
    def execute_search(prov: str) -> Dict[str, Any]:
        key = validate_api_key(prov, config)
        if prov == "serper":
            return search_serper(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                country=args.country,
                language=args.language,
                search_type=args.search_type,
                time_range=args.time_range,
                include_images=args.images,
            )
        elif prov == "tavily":
            return search_tavily(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                depth=args.depth,
                topic=args.topic,
                include_domains=args.include_domains,
                exclude_domains=args.exclude_domains,
                include_images=args.images,
                include_raw_content=args.raw_content,
            )
        elif prov == "querit":
            return search_querit(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                language=args.language,
                country=args.country,
                time_range=args.time_range or args.freshness,
                include_domains=args.include_domains,
                exclude_domains=args.exclude_domains,
                base_url=args.querit_base_url,
                base_path=args.querit_base_path,
                timeout=int(querit_config.get("timeout", 30)),
            )
        elif prov == "exa":
            # CLI --exa-depth overrides; fallback to auto-routing suggestion
            exa_depth = args.exa_depth
            if exa_depth == "normal" and routing_info.get("exa_depth") in ("deep", "deep-reasoning"):
                exa_depth = routing_info["exa_depth"]
            return search_exa(
                query=args.query or "",
                api_key=key,
                max_results=args.max_results,
                search_type=args.exa_type,
                exa_depth=exa_depth,
                category=args.category,
                start_date=args.start_date,
                end_date=args.end_date,
                similar_url=args.similar_url,
                include_domains=args.include_domains,
                exclude_domains=args.exclude_domains,
                text_verbosity=args.exa_verbosity,
            )
        elif prov == "perplexity":
            perplexity_config = config.get("perplexity", {})
            return search_perplexity(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                model=perplexity_config.get("model", "perplexity/sonar-pro"),
                api_url=perplexity_config.get("api_url", "https://api.kilo.ai/api/gateway/chat/completions"),
                freshness=getattr(args, "freshness", None),
            )
        elif prov == "you":
            return search_you(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                country=args.country,
                language=args.language,
                freshness=args.freshness,
                safesearch=args.you_safesearch,
                include_news=not args.no_news,
                livecrawl=args.livecrawl,
            )
        elif prov == "searxng":
            # For SearXNG, 'key' is actually the instance URL
            instance_url = args.searxng_url or key
            if instance_url:
                instance_url = _validate_searxng_url(instance_url)
            return search_searxng(
                query=args.query,
                instance_url=instance_url,
                max_results=args.max_results,
                categories=args.categories,
                engines=args.engines,
                language=args.language,
                time_range=args.time_range,
                safesearch=args.searxng_safesearch,
            )
        else:
            raise ValueError(f"Unknown provider: {prov}")

    # Retry a single provider up to 3 attempts with backoff; auth errors
    # (401/403) and non-transient failures break out immediately.
    def execute_with_retry(prov: str) -> Dict[str, Any]:
        last_error = None
        for attempt in range(0, 3):
            try:
                return execute_search(prov)
            except ProviderRequestError as e:
                last_error = e
                if e.status_code in {401, 403}:
                    break
                if not e.transient:
                    break
                if attempt < 2:
                    time.sleep(RETRY_BACKOFF_SECONDS[attempt])
                    continue
                break
            except Exception as e:
                last_error = e
                break
        raise last_error if last_error else Exception("Unknown provider execution error")

    # Parameters that affect result content and therefore the cache key.
    cache_context = {
        "locale": f"{args.country}:{args.language}",
        "freshness": args.freshness,
        "time_range": args.time_range,
        "include_domains": sorted(args.include_domains) if args.include_domains else None,
        "exclude_domains": sorted(args.exclude_domains) if args.exclude_domains else None,
        "topic": args.topic,
        "search_engines": sorted(args.engines) if args.engines else None,
        "include_news": not args.no_news,
        "search_type": args.search_type,
        "exa_type": args.exa_type,
        "exa_depth": args.exa_depth,
        "exa_verbosity": args.exa_verbosity,
        "category": args.category,
        "similar_url": args.similar_url,
    }

    # Check cache first (unless --no-cache is set)
    cached_result = None
    cache_hit = False
    if not args.no_cache and args.query:
        cached_result = cache_get(
            query=args.query,
            provider=provider,
            max_results=args.max_results,
            ttl=args.cache_ttl,
            params=cache_context,
        )
        if cached_result:
            cache_hit = True
            # Strip internal _cache_* bookkeeping keys before returning to the user
            result = {k: v for k, v in cached_result.items() if not k.startswith("_cache_")}
            result["cached"] = True
            result["cache_age_seconds"] = int(time.time() - cached_result.get("_cache_timestamp", 0))

    errors = []
    successful_provider = None
    successful_results: List[Tuple[str, Dict[str, Any]]] = []
    result = None if not cache_hit else result

    for idx, current_provider in enumerate(eligible_providers):
        if cache_hit:
            # Cached result already populated above — no network calls needed
            successful_provider = provider
            break
        try:
            provider_result = execute_with_retry(current_provider)
            reset_provider_health(current_provider)
            successful_results.append((current_provider, provider_result))
            successful_provider = current_provider

            # If we have enough results, stop.
            if len(provider_result.get("results", [])) >= args.max_results:
                break

            # Only continue collecting from lower-priority providers when fallback was needed.
            if not errors:
                break
        except Exception as e:
            error_msg = str(e)
            cooldown_info = mark_provider_failure(current_provider, error_msg)
            errors.append({
                "provider": current_provider,
                "error": error_msg,
                "cooldown_seconds": cooldown_info.get("cooldown_seconds"),
            })
            # Announce the fallback on stderr so stdout stays pure JSON
            if len(eligible_providers) > 1:
                remaining = eligible_providers[idx + 1:]
                if remaining:
                    print(json.dumps({
                        "fallback": True,
                        "failed_provider": current_provider,
                        "error": error_msg,
                        "trying_next": remaining[0],
                    }), file=sys.stderr)
            continue

    if successful_results:
        if len(successful_results) == 1:
            result = successful_results[0][1]
        else:
            # Multiple providers contributed (fallback path): merge into the
            # first successful provider's payload and dedupe across providers.
            primary = successful_results[0][1].copy()
            deduped_results, dedup_count = deduplicate_results_across_providers(successful_results, args.max_results)
            primary["results"] = deduped_results
            primary["deduplicated"] = dedup_count > 0
            primary.setdefault("metadata", {})
            primary["metadata"]["dedup_count"] = dedup_count
            primary["metadata"]["providers_merged"] = [p for p, _ in successful_results]
            result = primary

    if result is not None:
        if successful_provider != provider:
            routing_info["fallback_used"] = True
            routing_info["original_provider"] = provider
            routing_info["provider"] = successful_provider
            routing_info["fallback_errors"] = errors

        if cooldown_skips:
            routing_info["cooldown_skips"] = cooldown_skips

        result["routing"] = routing_info

        # Store fresh (non-cached) results for future runs
        if not cache_hit and not args.no_cache and args.query:
            cache_put(
                query=args.query,
                provider=successful_provider or provider,
                max_results=args.max_results,
                result=result,
                params=cache_context,
            )

        result["cached"] = bool(cache_hit)
        if "deduplicated" not in result:
            result["deduplicated"] = False
        result.setdefault("metadata", {})
        result["metadata"].setdefault("dedup_count", 0)

        indent = None if args.compact else 2
        print(json.dumps(result, indent=indent, ensure_ascii=False))
    else:
        # Every eligible provider failed: emit a structured error and exit 1
        error_result = {
            "error": "All providers failed",
            "provider": provider,
            "query": args.query,
            "routing": routing_info,
            "provider_errors": errors,
            "cooldown_skips": cooldown_skips,
        }
        print(json.dumps(error_result, indent=2), file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Script entry point — run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|