Files
robbyczgw-cla_topic-monitor/scripts/monitor.py

673 lines
24 KiB
Python

#!/usr/bin/env python3
"""
Proactive Research Monitor
Checks topics due for monitoring, scores findings, and sends alerts.
Run via cron for automated monitoring.
"""
import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin, urlsplit
from urllib.request import Request, urlopen
# Directory that contains this script.
SCRIPT_DIR = Path(__file__).parent
# Optional directory of bundled packages next to the script.
VENDOR_DIR = SCRIPT_DIR / "_vendor"
# Make the script directory importable before the project-local imports below.
sys.path.insert(0, str(SCRIPT_DIR))
if VENDOR_DIR.exists():
    # Inserted at index 0 after SCRIPT_DIR, so vendored packages are searched first.
    sys.path.insert(0, str(VENDOR_DIR))
# NOTE(review): `config` and `importance_scorer` presumably live in SCRIPT_DIR - confirm.
from config import (
    load_config, load_state, save_state, get_settings,
    save_finding, queue_alert
)
from importance_scorer import score_result
# feedparser is optional: when it is missing, the name is bound to None and
# fetch_feed_results() skips feed fetching.
try:
    import feedparser
except ImportError: # pragma: no cover - handled at runtime
    feedparser = None
class FeedDiscoveryParser(HTMLParser):
    """Collect the ``href`` of every ``<link rel="alternate">`` feed tag.

    After calling ``feed(html)``, ``feed_links`` holds the raw (possibly
    relative) href values in document order.
    """

    # MIME types that mark an alternate link as an RSS/Atom feed.
    _FEED_MIME_TYPES = frozenset((
        "application/rss+xml",
        "application/atom+xml",
        "application/xml",
        "text/xml",
    ))

    def __init__(self):
        super().__init__()
        self.feed_links = []

    def handle_starttag(self, tag, attrs):
        """Record the href of a ``<link>`` tag that advertises a feed."""
        if tag.lower() == "link":
            lowered = {name.lower(): value for name, value in attrs}
            target = lowered.get("href")
            relation = (lowered.get("rel") or "").lower()
            mime = (lowered.get("type") or "").lower()
            advertises_feed = "alternate" in relation and mime in self._FEED_MIME_TYPES
            if target and advertises_feed:
                self.feed_links.append(target)
def hash_url(url: str) -> str:
    """Return the hexadecimal MD5 digest of ``url`` (used as a dedup key)."""
    digest = hashlib.md5(url.encode())
    return digest.hexdigest()
def normalize_text_list(values: Optional[List[str]]) -> List[str]:
    """Return the non-empty, whitespace-stripped string form of each item.

    ``None`` items and items that are empty after stripping are dropped;
    a falsy ``values`` yields an empty list.
    """
    if not values:
        return []
    stripped = (str(item).strip() for item in values if item is not None)
    return [text for text in stripped if text]
def normalize_feed_url(url: str) -> str:
    """Normalize a configured feed URL.

    The URL is stripped of surrounding whitespace. A GitHub repository
    home URL (``https://github.com/<owner>/<repo>``) is rewritten to that
    repository's releases Atom feed. Everything else is returned
    unchanged, including deeper GitHub paths such as
    ``/owner/repo/commits/main.atom`` (already a feed) and URLs on other
    hosts that merely contain the text ``github.com``.

    Args:
        url: Feed or repository URL; ``None``/empty is tolerated.

    Returns:
        The normalized URL, or ``""`` when no URL was given.
    """
    url = (url or "").strip()
    if not url:
        return ""
    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): leave it untouched.
        return url
    host = (parts.hostname or "").lower()
    if host not in ("github.com", "www.github.com"):
        return url
    # Only a bare "/owner/repo" path identifies a repository home page.
    segments = [segment for segment in parts.path.split("/") if segment]
    if len(segments) != 2:
        return url
    owner, repo = segments
    return f"https://github.com/{owner}/{repo}/releases.atom"
def github_release_feed_url(repo: str) -> str:
    """Build the releases Atom feed URL for an ``owner/repo`` slug.

    Surrounding whitespace and leading/trailing slashes are removed first.
    """
    slug = repo.strip().strip("/")
    return "https://github.com/" + slug + "/releases.atom"
def parse_http_date(value: Optional[str]) -> Optional[str]:
    """Convert an RFC 2822 / HTTP date header into a UTC ISO-8601 string.

    Args:
        value: Header value such as ``"Wed, 21 Oct 2015 07:28:00 GMT"``.

    Returns:
        The timestamp as an ISO string in UTC, or ``None`` when ``value``
        is empty or cannot be parsed.
    """
    if not value:
        return None
    try:
        parsed = parsedate_to_datetime(value)
        if parsed.tzinfo is None:
            # A "-0000" offset parses to a naive datetime whose fields are
            # already UTC; astimezone() would wrongly treat it as local time.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc).isoformat()
    except (TypeError, ValueError, IndexError, OverflowError, AttributeError):
        # Best-effort parser: any malformed value is reported as "unknown".
        return None
def discover_feed_urls(url: str, timeout: int = 15) -> List[str]:
    """Try to discover RSS/Atom feeds from a regular webpage URL.

    A URL that already looks like a feed (by suffix) is returned as-is
    without any network access. Otherwise the page is fetched over
    HTTP(S); if the response itself is a feed its final URL is returned,
    else the HTML is scanned for ``<link rel="alternate">`` feed tags and
    a few conventional feed paths are appended as unverified guesses.

    Args:
        url: Page or feed URL.
        timeout: Network timeout in seconds for the page fetch.

    Returns:
        Candidate feed URLs, de-duplicated, discovered links first.
        Empty when the URL is blank, is not http/https, or the fetch fails.
    """
    url = (url or "").strip()
    if not url:
        return []
    if url.endswith((".rss", ".xml", ".atom", "/feed")):
        return [url]

    # Only fetch web URLs: urlopen() would otherwise read file:// paths, and
    # Request() raises ValueError for scheme-less input.
    try:
        scheme = urlsplit(url).scheme.lower()
    except ValueError:
        return []
    if scheme not in ("http", "https"):
        return []

    try:
        request = Request(
            url,
            headers={
                "User-Agent": "topic-monitor/1.5 (+feed-discovery)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
        )
        with urlopen(request, timeout=timeout) as response:
            content_type = response.headers.get("Content-Type", "")
            final_url = response.geturl()
            # Feed <link> tags live in <head>; cap the download size.
            body = response.read(250_000)
    except Exception:
        # Deliberate best-effort: any network/HTTP failure means "nothing found".
        return []

    lowered_type = content_type.lower()
    looks_like_feed = any(marker in lowered_type for marker in ("rss", "atom", "xml"))
    # "application/xhtml+xml" contains "xml" but is an HTML page, not a feed.
    if looks_like_feed and "html" not in lowered_type:
        return [final_url]

    # errors="ignore" makes decoding total, so no exception handling is needed.
    html = body.decode("utf-8", errors="ignore")
    parser = FeedDiscoveryParser()
    parser.feed(html)
    discovered = [urljoin(final_url, href) for href in parser.feed_links]

    # Conventional feed locations, appended as unverified guesses.
    common_guesses = [
        urljoin(final_url, path)
        for path in ("/feed", "/rss", "/rss.xml", "/atom.xml", "/feeds/posts/default")
    ]

    seen = set()
    ordered = []
    for candidate in discovered + common_guesses:
        candidate = candidate.strip()
        if candidate and candidate not in seen:
            seen.add(candidate)
            ordered.append(candidate)
    return ordered
def is_duplicate(url: str, state: Dict, dedup_hours: int = 72) -> bool:
    """Report whether ``url`` was already seen within the dedup window.

    Args:
        url: Result URL to look up (by its hash).
        state: Monitor state; reads ``state["deduplication"]["url_hash_map"]``.
        dedup_hours: Window length in hours.

    Returns:
        ``True`` when the URL has a stored timestamp newer than the window.
        A missing, empty, or unparsable timestamp counts as "not seen", so
        one corrupt state entry cannot abort the run.
    """
    dedup_map = state.get("deduplication", {}).get("url_hash_map", {})
    stamp = dedup_map.get(hash_url(url))
    if not stamp:
        return False
    try:
        last_seen = datetime.fromisoformat(stamp)
    except (TypeError, ValueError):
        return False
    # Compare in the timestamp's own frame: naive values are local time (as
    # written by mark_as_seen); aware values need an aware "now" because
    # naive/aware subtraction raises TypeError.
    now = datetime.now(last_seen.tzinfo)
    return (now - last_seen) < timedelta(hours=dedup_hours)
def mark_as_seen(url: str, state: Dict):
    """Record the current local time as the last-seen time for ``url``.

    Writes ``state["deduplication"]["url_hash_map"][hash] = <iso timestamp>``,
    creating either level when absent, including the case where
    ``"deduplication"`` exists but has no ``"url_hash_map"`` key.
    """
    dedup = state.setdefault("deduplication", {})
    dedup.setdefault("url_hash_map", {})[hash_url(url)] = datetime.now().isoformat()
def get_topic_state(state: Dict, topic_id: str) -> Dict:
    """Return the mutable per-topic state dict, creating it when missing.

    Both ``state["topics"]`` and ``state["topics"][topic_id]`` are created
    in place if absent, so the returned dict is always stored in ``state``.
    """
    all_topics = state.setdefault("topics", {})
    return all_topics.setdefault(topic_id, {})
def get_feed_state(state: Dict, topic_id: str) -> Dict:
    """Return the topic's mutable ``"feeds"`` cache dict, creating it when missing."""
    return get_topic_state(state, topic_id).setdefault("feeds", {})
def feed_cache_headers(state: Dict, topic_id: str, feed_url: str) -> Dict[str, str]:
    """Build request headers for a conditional feed fetch.

    Always sets the User-Agent. ``If-None-Match`` and ``If-Modified-Since``
    are added when the cached entry for ``feed_url`` has a truthy
    ``"etag"`` or ``"modified"`` value respectively.
    """
    cached = get_feed_state(state, topic_id).get(feed_url, {})
    request_headers = {"User-Agent": "topic-monitor/1.5 (+feedparser)"}
    validators = (("etag", "If-None-Match"), ("modified", "If-Modified-Since"))
    for cache_key, header_name in validators:
        stored = cached.get(cache_key)
        if stored:
            request_headers[header_name] = stored
    return request_headers
def update_feed_cache(state: Dict, topic_id: str, feed_url: str, parsed_feed):
    """Store cache validators and metadata from a parsed feed response.

    Updates the per-feed cache entry in ``state`` with the ETag (the
    response-header value wins over ``parsed_feed.etag``), the
    ``last-modified`` header, the check time, the HTTP status, and the
    feed's link and title when present.
    """
    entry = get_feed_state(state, topic_id).setdefault(feed_url, {})

    parsed_etag = getattr(parsed_feed, "etag", None)
    if parsed_etag:
        entry["etag"] = parsed_etag

    headers = getattr(parsed_feed, "headers", {}) or {}
    header_etag = headers.get("etag")
    if header_etag:
        entry["etag"] = header_etag
    last_modified = headers.get("last-modified")
    if last_modified:
        entry["modified"] = last_modified

    entry["last_checked"] = datetime.now().isoformat()

    status = getattr(parsed_feed, "status", None)
    if status is not None:
        entry["last_status"] = status

    # Feed-level metadata is read only when a truthy `feed` attribute exists.
    feed_meta = getattr(parsed_feed, "feed", None)
    site_url = getattr(feed_meta, "link", None) if feed_meta else None
    if site_url:
        entry["site_url"] = site_url
    feed_name = getattr(feed_meta, "title", None) if feed_meta else None
    if feed_name:
        entry["feed_title"] = feed_name
def iso_from_struct_time(struct_time_value) -> Optional[str]:
    """Convert a time tuple to an ISO-8601 string tagged as UTC.

    The first six fields (year .. second) are used. Returns ``None`` for a
    falsy input or when the fields do not form a valid datetime.
    """
    if not struct_time_value:
        return None
    try:
        moment = datetime(*struct_time_value[:6], tzinfo=timezone.utc)
    except Exception:
        return None
    return moment.isoformat()
def entry_to_result(entry: Dict, feed_url: str, topic: Dict, feed_title: str = "") -> Dict:
    """Convert one feed entry into the result dict used by the monitor.

    Args:
        entry: Parsed feed entry (mapping with title/summary/link/...).
        feed_url: URL of the feed the entry came from; also the fallback link.
        topic: Topic config; its ``name`` is the fallback source label.
        feed_title: Title of the feed, if known.

    Returns:
        A result dict with ``source`` set to ``"feed"``, or to
        ``"github_release"`` (plus ``github_repo`` and a prefixed title)
        for GitHub ``releases.atom`` feeds.
    """
    raw_title = entry.get("title", "Untitled")
    body = entry.get("summary", "") or entry.get("description", "")

    # Prefer parsed timestamps, then the raw date strings, else empty.
    published = iso_from_struct_time(entry.get("published_parsed"))
    if not published:
        published = iso_from_struct_time(entry.get("updated_parsed"))
    if not published:
        published = entry.get("published") or entry.get("updated") or ""

    terms = [tag.get("term", "") for tag in entry.get("tags", []) if tag.get("term")]

    result = {
        "title": raw_title,
        "url": entry.get("link", feed_url),
        "snippet": re.sub(r"\s+", " ", body).strip(),
        "published_date": published,
        "source": "feed",
        "feed_url": feed_url,
        "feed_title": feed_title,
        "source_label": feed_title or topic.get("name", "Feed"),
        "tags": ", ".join(terms),
    }

    is_release_feed = "github.com" in feed_url and "/releases.atom" in feed_url
    if is_release_feed:
        after_host = feed_url.split("github.com/")[-1]
        repo = after_host.split("/releases.atom")[0]
        result.update(
            source="github_release",
            github_repo=repo,
            title=f"{repo} release: {raw_title}",
        )
        if not result["snippet"]:
            result["snippet"] = f"New GitHub release published for {repo}."
    return result
def search_topic(topic: Dict, dry_run: bool = False, verbose: bool = False) -> List[Dict]:
    """Run a web search for the topic's query via the web-search-plus script.

    The script path comes from ``WEB_SEARCH_PLUS_PATH`` or defaults to
    ``web-search-plus/scripts/search.py`` three directory levels above this
    file. It is run as a subprocess with a whitelisted environment, and its
    JSON output supplies the results.

    Returns:
        The search results (each given a default ``source`` of
        ``"web_search"``). When the search is unavailable or fails, returns
        a single mock result in dry-run mode and an empty list otherwise.
        A topic without a query always yields an empty list.
    """
    query = topic.get("query", "")
    if not query:
        return []

    three_up = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    fallback_script = os.path.join(three_up, "web-search-plus", "scripts", "search.py")
    web_search_plus = Path(os.environ.get("WEB_SEARCH_PLUS_PATH", fallback_script))

    if not web_search_plus.exists():
        if verbose:
            print(f" ⚠️ web-search-plus not found at {web_search_plus}", file=sys.stderr)
    else:
        import subprocess

        # Only these variables are forwarded to the child process.
        passthrough = (
            "PATH", "HOME", "LANG", "TERM",
            "SERPER_API_KEY", "TAVILY_API_KEY", "EXA_API_KEY",
            "YOU_API_KEY", "SEARXNG_INSTANCE_URL", "WSP_CACHE_DIR",
        )
        try:
            # Drop control characters and cap the length before passing on.
            safe_query = re.sub(r'[\x00-\x1f\x7f]', '', query)[:500]
            if verbose:
                print(f" 🔍 Searching via web-search-plus: {safe_query}")
            child_env = {name: val for name, val in os.environ.items() if name in passthrough}
            completed = subprocess.run(
                ["python3", str(web_search_plus), "--query", safe_query, "--max-results", "5"],
                capture_output=True,
                text=True,
                timeout=45,
                env=child_env,
            )
            if completed.returncode != 0:
                if verbose and completed.stderr:
                    print(f" ⚠️ web-search-plus error: {completed.stderr[:200]}", file=sys.stderr)
            else:
                output = completed.stdout.strip()
                # Skip any non-JSON text printed before the JSON object.
                brace_at = output.find('{')
                if brace_at >= 0:
                    payload = json.loads(output[brace_at:])
                    hits = payload.get("results", [])
                    if verbose:
                        print(f" ✅ Got {len(hits)} search results from {payload.get('provider', 'unknown')}")
                    for hit in hits:
                        hit.setdefault("source", "web_search")
                    return hits
        except subprocess.TimeoutExpired:
            print(f"⚠️ web-search-plus timed out for query: {query}", file=sys.stderr)
        except Exception as e:
            print(f"⚠️ web-search-plus failed: {e}", file=sys.stderr)

    if not dry_run:
        return []
    mock_id = hashlib.md5(query.encode()).hexdigest()[:8]
    return [{
        "title": f"[Mock] Result for: {query}",
        "url": f"https://example.com/mock-{mock_id}",
        "snippet": f"This is a mock/test result for query: {query}. Run without --dry-run to use real search.",
        "published_date": datetime.now().isoformat(),
        "source": "web_search",
    }]
def fetch_feed_results(topic: Dict, state: Dict, dry_run: bool = False, verbose: bool = False) -> List[Dict]:
    """Fetch the topic's RSS/Atom and GitHub release feeds as result dicts.

    Feeds come from the topic's ``feeds`` and ``github_repos`` lists. Each
    feed is fetched with conditional-request headers from the cache in
    ``state``, the cache is refreshed, and up to the first 10 entries are
    converted to results. Feeds answering 304, or failing to parse with no
    entries, are skipped. Returns an empty list when feedparser is missing.
    """
    if feedparser is None:
        if verbose:
            print(" ⚠️ feedparser is not installed; skipping feeds", file=sys.stderr)
        return []

    topic_id = topic.get("id")
    candidates = [normalize_feed_url(raw) for raw in normalize_text_list(topic.get("feeds", []))]
    candidates += [github_release_feed_url(slug) for slug in normalize_text_list(topic.get("github_repos", []))]
    # dict.fromkeys drops repeats while keeping first-seen order.
    feed_urls = [candidate for candidate in dict.fromkeys(candidates) if candidate]

    collected = []
    for feed_url in feed_urls:
        request_headers = feed_cache_headers(state, topic_id, feed_url)
        if verbose:
            print(f" 📰 Fetching feed: {feed_url}")
        parsed = feedparser.parse(feed_url, request_headers=request_headers)
        update_feed_cache(state, topic_id, feed_url, parsed)

        if getattr(parsed, "status", None) == 304:
            if verbose:
                print(" ⏭️ Feed not modified (304)")
            continue

        unusable = getattr(parsed, "bozo", False) and not getattr(parsed, "entries", [])
        if unusable:
            if verbose:
                print(f" ⚠️ feed parse issue for {feed_url}: {getattr(parsed, 'bozo_exception', 'unknown')}", file=sys.stderr)
            continue

        feed_title = parsed.feed.get("title", "") if getattr(parsed, "feed", None) else ""
        newest = getattr(parsed, "entries", [])[:10]
        if verbose:
            print(f" ✅ Got {len(newest)} feed entries")
        collected.extend(
            entry_to_result(entry, feed_url, topic, feed_title=feed_title)
            for entry in newest
        )
    return collected
def collect_results(topic: Dict, state: Dict, dry_run: bool = False, verbose: bool = False) -> List[Dict]:
    """Gather results from every source: web search first, then feeds."""
    from_search = search_topic(topic, dry_run=dry_run, verbose=verbose)
    from_feeds = fetch_feed_results(topic, state, dry_run=dry_run, verbose=verbose)
    return [*from_search, *from_feeds]
def passes_topic_filters(result: Dict, topic: Dict) -> Tuple[bool, str]:
    """Apply the topic's keyword filters to a result.

    Matching is a case-insensitive substring test against the result's
    title and snippet.

    Args:
        result: Result dict; reads ``title`` and ``snippet``.
        topic: Topic config; reads ``exclude_keywords`` and
            ``required_keywords``.

    Returns:
        ``(True, "")`` when the result passes, otherwise ``(False, reason)``
        where reason is ``excluded_by_<keyword>`` for the first matching
        exclude keyword, or ``missing_required_<k1,k2,...>`` listing every
        absent required keyword.
    """
    # `or ""` keeps an explicit None from turning into the text "none".
    title = result.get("title") or ""
    snippet = result.get("snippet") or ""
    content = f"{title}\n{snippet}".lower()

    exclude_keywords = [k.lower() for k in normalize_text_list(topic.get("exclude_keywords", []))]
    for keyword in exclude_keywords:
        if keyword in content:
            return False, f"excluded_by_{keyword}"

    required_keywords = [k.lower() for k in normalize_text_list(topic.get("required_keywords", []))]
    missing = [keyword for keyword in required_keywords if keyword not in content]
    if missing:
        return False, f"missing_required_{','.join(missing)}"
    return True, ""
def should_check_topic(topic: Dict, state: Dict, force: bool = False) -> bool:
    """Decide whether a topic is due for a check.

    Args:
        topic: Topic config; reads ``id`` and ``frequency`` (default
            ``"daily"``).
        state: Monitor state; reads ``state["topics"][id]["last_check"]``.
        force: When true, the topic is always due.

    Returns:
        ``True`` when forced, never checked, the stored ``last_check`` is
        unparsable, or the interval for the topic's frequency has elapsed.
        ``False`` when it is not yet due or the frequency is not one of
        ``hourly``/``daily``/``weekly``.
    """
    if force:
        return True
    topic_state = state.get("topics", {}).get(topic.get("id"), {})
    last_check_str = topic_state.get("last_check")
    if not last_check_str:
        return True
    try:
        last_check = datetime.fromisoformat(last_check_str)
    except (TypeError, ValueError):
        # Corrupt timestamp: re-check (which rewrites it) instead of crashing.
        return True

    intervals = {
        "hourly": timedelta(hours=1),
        "daily": timedelta(days=1),
        "weekly": timedelta(weeks=1),
    }
    interval = intervals.get(topic.get("frequency", "daily"))
    if interval is None:
        return False
    # Use a "now" in the same frame as the stored value; naive/aware
    # subtraction would raise TypeError.
    now = datetime.now(last_check.tzinfo)
    return (now - last_check) >= interval
def check_rate_limits(topic: Dict, state: Dict, settings: Dict) -> bool:
    """Report whether another alert may be sent for this topic today.

    An alert is allowed only while the topic's ``alerts_today`` is below
    ``max_alerts_per_topic_per_day`` (default 2) and the sum of
    ``alerts_today`` over all topics is below ``max_alerts_per_day``
    (default 5).
    """
    per_topic = state.get("topics", {})
    topic_cap = settings.get("max_alerts_per_topic_per_day", 2)
    overall_cap = settings.get("max_alerts_per_day", 5)

    sent_for_topic = per_topic.get(topic.get("id"), {}).get("alerts_today", 0)
    if sent_for_topic >= topic_cap:
        return False

    sent_overall = sum(entry.get("alerts_today", 0) for entry in per_topic.values())
    return sent_overall < overall_cap
def sentiment_shifted(topic: Dict, state: Dict, new_sentiment: str) -> bool:
    """Report whether ``new_sentiment`` differs from the last recorded one.

    Always ``False`` unless the topic enables ``alert_on_sentiment_shift``
    and its sentiment history has a non-empty most recent sentiment.
    """
    if topic.get("alert_on_sentiment_shift"):
        past = get_topic_state(state, topic.get("id")).get("sentiment_history", [])
        if past:
            prior = past[-1].get("sentiment")
            if prior:
                return bool(prior != new_sentiment)
    return False
def record_sentiment(topic: Dict, state: Dict, result: Dict, sentiment: str):
    """Append a sentiment observation to the topic's history.

    The history keeps only the 50 most recent entries, and the topic's
    ``last_sentiment`` is set to ``sentiment``.
    """
    bucket = get_topic_state(state, topic.get("id"))
    log = bucket.setdefault("sentiment_history", [])
    observation = {
        "timestamp": datetime.now().isoformat(),
        "sentiment": sentiment,
        "url": result.get("url", ""),
        "title": result.get("title", ""),
    }
    log.append(observation)

    overflow = len(log) - 50
    if overflow > 0:
        # Trim the oldest entries in place so the stored list stays the same object.
        del log[:overflow]
    bucket["last_sentiment"] = sentiment
def build_alert_message(topic: Dict, result: Dict, priority: str, score: float, reason: str, sentiment: str, sentiment_shift: bool) -> str:
    """Render the markdown text of an alert.

    The message has a header (priority emoji, topic name, topic emoji), the
    result title, an optional snippet (cut to 317 characters plus "..."
    when longer than 320), optional topic context, a source line, the link
    when present, the score/reason line, the sentiment line, and a shift
    marker when ``sentiment_shift`` is true. Lines are joined with newlines.
    """
    priority_icons = {"high": "🔥", "medium": "📌", "low": "📝"}
    origin_labels = {
        "web_search": "🌐 Web",
        "feed": "📰 Feed",
        "github_release": "🚀 GitHub Release",
    }
    origin = origin_labels.get(result.get("source", "web_search"), "🌐 Web")

    header = f"{priority_icons.get(priority, '📌')} **{topic.get('name', 'Research Alert')}** {topic.get('emoji', '🔍')}"
    parts = [header, "", f"**{result.get('title', 'Untitled')}**", ""]

    excerpt = result.get("snippet", "")
    if excerpt:
        shown = excerpt if len(excerpt) <= 320 else excerpt[:317] + "..."
        parts += [shown, ""]

    background = topic.get("context", "")
    if background:
        parts += [f"💡 _Context: {background}_", ""]

    # Prefer the feed title, then the source label, then the bare URL.
    attribution = result.get("feed_title") or result.get("source_label") or result.get("url", "")
    parts.append(f"{origin}: {attribution}")

    link = result.get("url")
    if link:
        parts.append(f"🔗 {link}")

    parts.append(f"📊 _Score: {score:.2f} | {reason}_")
    parts.append(f"🙂 _Sentiment: {sentiment}_")
    if sentiment_shift:
        parts.append("🔄 _Sentiment shift detected_")
    return "\n".join(parts)
def send_alert(topic: Dict, result: Dict, priority: str, score: float, reason: str, sentiment: str, sentiment_shift: bool = False, dry_run: bool = False):
    """Queue one alert per configured channel, or preview it in dry-run mode.

    Returns:
        ``None`` in dry-run mode (the alert is only printed), otherwise the
        list of ids returned by ``queue_alert`` for each channel in the
        topic's ``channels``.
    """
    channels = topic.get("channels", [])
    message = build_alert_message(topic, result, priority, score, reason, sentiment, sentiment_shift)

    if dry_run:
        rule = "=" * 60
        print(f"\n{rule}")
        print("DRY RUN - Would send alert:")
        print(f"Channels: {', '.join(channels)}")
        print(f"Priority: {priority.upper()}")
        print()
        print(message)
        print(f"{rule}\n")
        return None

    queued_ids = []
    for channel in channels:
        payload = {
            "timestamp": datetime.now().isoformat(),
            "priority": priority,
            "channel": channel,
            "topic_id": topic.get("id"),
            "topic_name": topic.get("name"),
            "title": result.get("title", ""),
            "snippet": result.get("snippet", ""),
            "url": result.get("url", ""),
            "score": score,
            "reason": reason,
            "message": message,
            "context": topic.get("context", ""),
            "sentiment": sentiment,
            "sentiment_shift": sentiment_shift,
            "source": result.get("source", "web_search"),
            "feed_url": result.get("feed_url", ""),
            "github_repo": result.get("github_repo", ""),
        }
        alert_id = queue_alert(payload)
        queued_ids.append(alert_id)
        # One JSON summary line is printed per queued alert.
        summary = {
            'id': alert_id,
            'channel': channel,
            'priority': priority,
            'topic': topic.get('name'),
            'sentiment': sentiment,
        }
        print(f"📢 ALERT_QUEUED: {json.dumps(summary)}")
    return queued_ids
def monitor_topic(topic: Dict, state: Dict, settings: Dict, dry_run: bool = False, verbose: bool = False):
    """Run one monitoring pass for a topic.

    Collects results, skips URLs seen within the dedup window, applies the
    topic's keyword filters, scores what remains, and raises the priority
    one level when a sentiment shift is detected. High-priority results are
    alerted (subject to rate limits); medium-priority results are saved
    for the digest. Outside dry-run mode the topic's state (sentiment
    history, alert counter, last check, counters) is updated in ``state``.
    """
    topic_id = topic.get("id")
    if verbose:
        print(f"\n🔍 Checking topic: {topic.get('name')} ({topic_id})")

    results = collect_results(topic, state, dry_run=dry_run, verbose=verbose)
    if verbose:
        print(f" Found {len(results)} total results across all sources")

    window_hours = settings.get("deduplication_window_hours", 72)
    # A sentiment shift raises the priority by one level.
    escalation = {"medium": "high", "low": "medium"}
    buckets = {"high": [], "medium": []}

    for item in results:
        link = item.get("url", "") or item.get("feed_url", "")
        if not link:
            continue

        if is_duplicate(link, state, window_hours):
            if verbose:
                print(f" ⏭️ Skipping duplicate: {link}")
            continue

        accepted, why_not = passes_topic_filters(item, topic)
        if not accepted:
            if verbose:
                print(f" 🚫 Filtered out: {why_not} - {item.get('title', '')[:60]}")
            # Filtered results are still marked as seen.
            mark_as_seen(link, state)
            continue

        priority, score, reason, sentiment = score_result(item, topic, settings)
        shifted = sentiment_shifted(topic, state, sentiment)
        if shifted and priority in escalation:
            priority = escalation[priority]
            reason = f"{reason} + sentiment_shift"

        if verbose:
            print(f" {priority.upper():6} ({score:.2f}) [{sentiment}] - {item.get('title', '')[:55]}...")

        # Priorities other than high/medium are not collected.
        if priority in buckets:
            buckets[priority].append((item, score, reason, sentiment, shifted))

        mark_as_seen(link, state)
        if not dry_run:
            record_sentiment(topic, state, item, sentiment)

    for item, score, reason, sentiment, shifted in buckets["high"]:
        if not check_rate_limits(topic, state, settings):
            if verbose:
                print(" ⚠️ Rate limit reached, skipping alert")
            continue
        send_alert(topic, item, "high", score, reason, sentiment, shifted, dry_run=dry_run)
        if not dry_run:
            counters = get_topic_state(state, topic_id)
            counters["alerts_today"] = counters.get("alerts_today", 0) + 1

    digest_date = datetime.now().strftime("%Y-%m-%d")
    for item, score, reason, sentiment, shifted in buckets["medium"]:
        if not dry_run:
            save_finding(topic_id, digest_date, {
                "result": item,
                "score": score,
                "reason": reason,
                "timestamp": datetime.now().isoformat(),
                "sentiment": sentiment,
                "sentiment_shift": shifted,
            })
        if verbose:
            print(f" 💾 Saved to digest: {item.get('title', '')[:50]}...")

    if dry_run:
        return
    summary = get_topic_state(state, topic_id)
    summary["last_check"] = datetime.now().isoformat()
    summary["last_results_count"] = len(results)
    summary["findings_count"] = summary.get("findings_count", 0) + len(buckets["medium"])
def main():
    """Command-line entry point.

    With ``--discover-feed URL`` it prints candidate feed URLs and returns.
    Otherwise it loads config and state, selects the topics matching the
    ``--topic`` / ``--frequency`` filters that are due (or all matching
    ones with ``--force``), monitors each, and saves state unless
    ``--dry-run`` is given. A failure in one topic is reported and does not
    stop the remaining topics.
    """
    parser = argparse.ArgumentParser(description="Monitor research topics")
    parser.add_argument("--dry-run", action="store_true", help="Don't send alerts or save state")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--topic", help="Check specific topic by ID")
    parser.add_argument("--force", action="store_true", help="Force check even if not due")
    parser.add_argument("--frequency", choices=["hourly", "daily", "weekly"], help="Only check topics with this frequency")
    parser.add_argument("--discover-feed", metavar="URL", help="Discover RSS/Atom feed links for a URL and exit")
    args = parser.parse_args()

    if args.discover_feed:
        for candidate in discover_feed_urls(args.discover_feed):
            print(candidate)
        return

    try:
        config = load_config()
    except FileNotFoundError as e:
        print(f"{e}", file=sys.stderr)
        sys.exit(1)

    state = load_state()
    settings = get_settings()

    topics = config.get("topics", [])
    if not topics:
        print("⚠️ No topics configured", file=sys.stderr)
        sys.exit(0)

    # CLI filters are applied before the due check, in config order.
    due_topics = [
        candidate
        for candidate in topics
        if (not args.topic or candidate.get("id") == args.topic)
        and (not args.frequency or candidate.get("frequency") == args.frequency)
        and should_check_topic(candidate, state, force=args.force)
    ]
    if not due_topics:
        if args.verbose:
            print("✅ No topics due for checking")
        sys.exit(0)

    print(f"🔍 Monitoring {len(due_topics)} topic(s)...")
    for current in due_topics:
        try:
            monitor_topic(current, state, settings, dry_run=args.dry_run, verbose=args.verbose)
        except Exception as e:
            # Keep going: one broken topic must not block the others.
            print(f"❌ Error monitoring {current.get('name')}: {e}", file=sys.stderr)
            if args.verbose:
                import traceback
                traceback.print_exc()

    if not args.dry_run:
        save_state(state)
        print("✅ State saved")
    print("✅ Monitoring complete")


if __name__ == "__main__":
    main()