Files
ide-rea_baidu-baike-data/scripts/baidu_baike.py

114 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Baidu Baike Query Script
Query encyclopedia entries from Baidu Baike.
"""
import os
import sys
import requests
import json
import argparse
from typing import Dict, Any, List
class BaiduBaikeClient:
"""Baidu Baike API Client"""
BASE_URL = "https://appbuilder.baidu.com/v2/baike"
def __init__(self, api_key: str):
self.api_key = api_key
self.headers = {
"Authorization": f"Bearer {api_key}",
"X-Appbuilder-From": "openclaw",
}
def get_lemma_content(self, search_type: str, search_key: str) -> Dict[str, Any]:
"""Get detailed entry content by title or ID."""
url = f"{self.BASE_URL}/lemma/get_content"
params = {"search_type": search_type, "search_key": search_key}
response = requests.get(url, params=params, headers=self.headers, timeout=30)
response.raise_for_status()
result = response.json()
self._check_error(result)
if "result" in result:
# Remove large fields to reduce output size
exclude_keys = {"summary", "abstract_html", "abstract_structured",
"square_pic_url_wap", "videos", "relations", "star_map"}
return {k: v for k, v in result["result"].items()
if k not in exclude_keys and v is not None}
return {}
def get_lemma_list(self, lemma_title: str, top_k: int = 5) -> List[Dict[str, Any]]:
"""List entries with same title (for homonym resolution)."""
url = f"{self.BASE_URL}/lemma/get_list_by_title"
params = {"lemma_title": lemma_title, "top_k": top_k}
response = requests.get(url, params=params, headers=self.headers, timeout=30)
response.raise_for_status()
result = response.json()
self._check_error(result)
return result.get("result", [])
def _check_error(self, result: Dict[str, Any]) -> None:
if "errno" in result and result["errno"] != 0:
errmsg = result.get("errmsg", "Unknown error")
raise RuntimeError(f"API error: {errmsg} (code: {result['errno']})")
def main():
parser = argparse.ArgumentParser(description="Query Baidu Baike entries")
parser.add_argument(
"--search_type", "-st",
required=True,
choices=["lemmaTitle", "lemmaId", "lemmaList"],
help="Search type: lemmaTitle, lemmaId, or lemmaList"
)
parser.add_argument(
"--search_key", "-sk",
required=True,
help="Search keyword (entry title or ID)"
)
parser.add_argument(
"--top_k", "-tk",
type=int,
default=5,
help="Max results for lemmaList (default: 5)"
)
args = parser.parse_args()
api_key = os.getenv("BAIDU_API_KEY")
if not api_key:
print("Error: BAIDU_API_KEY environment variable not set", file=sys.stderr)
sys.exit(1)
try:
client = BaiduBaikeClient(api_key)
if args.search_type == "lemmaList":
results = client.get_lemma_list(args.search_key, args.top_k)
else:
results = client.get_lemma_content(args.search_type, args.search_key)
print(json.dumps(results, ensure_ascii=False, indent=2))
except requests.exceptions.RequestException as e:
print(f"Network error: {e}", file=sys.stderr)
sys.exit(1)
except RuntimeError as e:
print(f"API error: {e}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()