Initial commit with translated description
This commit is contained in:
242
scripts/importance_scorer.py
Normal file
242
scripts/importance_scorer.py
Normal file
@@ -0,0 +1,242 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AI-powered importance scoring for research findings.
|
||||
|
||||
Scores findings as:
|
||||
- HIGH: Immediate alert
|
||||
- MEDIUM: Include in digest
|
||||
- LOW: Ignore
|
||||
|
||||
Also returns a coarse sentiment label for alert rendering and shift tracking.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
class ImportanceScorer:
    """Score research findings for importance.

    Each finding is scored with four additive heuristics (keyword match,
    freshness, source quality, alert conditions). The total is clamped to
    ``[0.0, 1.0]`` and mapped to a ``"high"``/``"medium"``/``"low"`` priority
    using cutoffs selected by the topic's ``importance_threshold``. A coarse
    sentiment label is derived separately from the title and snippet.
    """

    SENTIMENT_LABELS = ("positive", "negative", "neutral", "mixed")

    # Domains that earn the "trusted source" bonus when no topic-specific
    # boost/ignore entry matches the URL.
    _TRUSTED_SOURCES = (
        "github.com",
        "arxiv.org",
        "news.ycombinator.com",
        "techcrunch.com",
        "theverge.com",
        "arstechnica.com",
    )

    _POSITIVE_TERMS = (
        "launch", "released", "release", "improved", "improvement", "wins",
        "record", "growth", "surge", "upgrade", "success", "stable",
        "available", "general availability", "fast", "faster", "secure",
    )

    _NEGATIVE_TERMS = (
        "breach", "incident", "critical", "severe", "failure", "fails",
        "outage", "downtime", "vulnerability", "cve", "warning", "recall",
        "delay", "delayed", "lawsuit", "drop", "crash", "exploit", "bug",
    )

    _PRICE_MOVE_WORDS = ("surge", "plunge", "jump", "drop", "spike", "crash")

    def __init__(self, topic: Dict, settings: Dict):
        """Store the topic configuration and global settings.

        Args:
            topic: Topic configuration. The keys read by this class are
                ``keywords``, ``boost_sources``, ``ignore_sources``,
                ``alert_on`` and ``importance_threshold``.
            settings: Global settings; only ``learning_enabled`` is read.
        """
        self.topic = topic
        self.settings = settings
        # Stored for callers; no method of this class reads it.
        self.learning_enabled = settings.get("learning_enabled", False)

    def score(self, result: Dict) -> Tuple[str, float, str, str]:
        """Score a result.

        Args:
            result: Finding with optional ``title``, ``snippet``, ``url`` and
                ``published_date`` entries. Missing or ``None`` values are
                treated as empty strings.

        Returns:
            (priority, score, reason, sentiment)
        """
        # ``or ""`` also covers keys that are present but set to None, which
        # would otherwise be rendered as the text "None" or crash on .lower().
        title = result.get("title") or ""
        snippet = result.get("snippet") or ""
        url = result.get("url") or ""
        published = result.get("published_date") or ""
        content = f"{title} {snippet}".lower()

        signals = []
        for name, (value, why) in (
            ("keyword_match", self._score_keywords(content)),
            ("freshness", self._score_freshness(published)),
            ("source_quality", self._score_source(url)),
            ("alert_conditions", self._score_conditions(content, title)),
        ):
            signals.append((name, value, why))

        # Clamp into a sane range after penalties (an ignored source is -1.0).
        total_score = max(0.0, min(1.0, sum(s[1] for s in signals)))

        threshold = self.topic.get("importance_threshold", "medium")
        if threshold == "high":
            high_cutoff, medium_cutoff = 0.8, 0.5
        elif threshold == "medium":
            high_cutoff, medium_cutoff = 0.6, 0.3
        else:
            # Any other value (including "low") uses the most permissive cutoffs.
            high_cutoff, medium_cutoff = 0.4, 0.1

        if total_score >= high_cutoff:
            priority = "high"
        elif total_score >= medium_cutoff:
            priority = "medium"
        else:
            priority = "low"

        # The reason is built from the two highest-scoring signals that
        # actually carry a reason string.
        top_signals = sorted(signals, key=lambda s: s[1], reverse=True)[:2]
        reason_parts = [s[2] for s in top_signals if s[2]]
        reason = " + ".join(reason_parts) if reason_parts else "low_relevance"

        sentiment = self._score_sentiment(title, snippet)
        return priority, total_score, reason, sentiment

    @staticmethod
    def _contains_word(term: str, text: str) -> bool:
        """Return True if *term* occurs in *text* as a whole token.

        Lookarounds are used instead of ``\\b`` so that terms beginning or
        ending with punctuation (for example ``c++``) can still match; for
        purely alphanumeric terms the result is the same as ``\\bterm\\b``.
        """
        pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
        return re.search(pattern, text) is not None

    def _normalized_keywords(self) -> list:
        """Return the topic keywords lowercased and stripped, skipping blanks."""
        raw_keywords = self.topic.get("keywords") or []
        return [
            kw.lower().strip()
            for kw in raw_keywords
            if isinstance(kw, str) and kw.strip()
        ]

    def _score_keywords(self, content: str) -> Tuple[float, str]:
        """Score *content* (already lowercased) against the topic keywords.

        A keyword prefixed with ``-`` is an exclusion: if its remainder occurs
        anywhere in the content the keyword score is 0.0 immediately.

        Returns:
            (score, reason); ``(0.0, "")`` when the topic has no keywords.
        """
        if not self.topic.get("keywords", []):
            return 0.0, ""

        matches = 0
        exact_matches = 0

        for keyword in self._normalized_keywords():
            if keyword.startswith("-"):
                excluded = keyword[1:]
                if excluded and excluded in content:
                    return 0.0, f"contains_excluded_{excluded}"
                continue

            if self._contains_word(keyword, content):
                exact_matches += 1
                matches += 1
            elif keyword in content:
                # Substring hit inside a longer token counts as partial only.
                matches += 1

        if exact_matches >= 2:
            return 0.3, f"exact_match_{exact_matches}_keywords"
        if exact_matches == 1:
            return 0.2, "exact_match_1_keyword"
        if matches >= 2:
            return 0.15, f"partial_match_{matches}_keywords"
        if matches == 1:
            return 0.1, "partial_match_1_keyword"
        return 0.0, "no_keyword_match"

    def _score_freshness(self, published: str) -> Tuple[float, str]:
        """Score how recently the finding was published.

        Accepts an ISO-8601 timestamp containing ``T`` (a trailing ``Z`` is
        read as UTC) or a plain ``YYYY-MM-DD`` date. Anything unparseable
        scores ``(0.0, "")``.
        """
        if not published:
            return 0.0, ""

        try:
            if "T" in published:
                pub_date = datetime.fromisoformat(published.replace("Z", "+00:00"))
            else:
                pub_date = datetime.strptime(published, "%Y-%m-%d")
        except (TypeError, ValueError, AttributeError):
            # Best effort: an unreadable date simply earns no freshness bonus.
            return 0.0, ""

        # Take "now" in the timestamp's own timezone so aware values are
        # compared correctly; naive values (tzinfo None) use local time.
        age = datetime.now(pub_date.tzinfo) - pub_date

        if age < timedelta(hours=6):
            return 0.2, "very_fresh_<6h"
        if age < timedelta(days=1):
            return 0.15, "fresh_<24h"
        if age < timedelta(days=3):
            return 0.1, "recent_<3d"
        return 0.05, "older_>3d"

    def _score_source(self, url: str) -> Tuple[float, str]:
        """Score the source by substring-matching *url* against source lists.

        Topic ``boost_sources`` are checked first, then ``ignore_sources``
        (which yields -1.0), then the built-in trusted list.
        """
        for source in self.topic.get("boost_sources", []):
            if source and source in url:
                return 0.2, f"boosted_source_{source}"

        for source in self.topic.get("ignore_sources", []):
            if source and source in url:
                return -1.0, f"ignored_source_{source}"

        for source in self._TRUSTED_SOURCES:
            if source in url:
                return 0.15, f"trusted_source_{source}"

        return 0.05, "standard_source"

    def _score_conditions(self, content: str, title: str) -> Tuple[float, str]:
        """Score the topic's ``alert_on`` conditions against the finding.

        Conditions are evaluated in the order configured and the first one
        that fires determines the result.

        Returns:
            (score, reason); ``(0.0, "")`` when nothing fires.
        """
        title_lower = title.lower()

        for condition in self.topic.get("alert_on") or []:
            if condition == "price_change_10pct":
                if self._detect_price_change(content, threshold=0.10):
                    return 0.3, "price_change_>10%"
            elif condition == "keyword_exact_match":
                for keyword in self._normalized_keywords():
                    # Exclusion keywords ("-foo") must never count as a hit.
                    if keyword.startswith("-"):
                        continue
                    if self._contains_word(keyword, content):
                        return 0.2, "exact_keyword_in_condition"
            elif condition == "major_paper":
                if "arxiv" in content or "paper" in title_lower:
                    return 0.25, "academic_paper_detected"
            elif condition == "model_release":
                if re.search(r'(release|launch|announce).*\b(model|gpt|llm)\b', content, re.I):
                    return 0.3, "model_release_detected"
            elif condition == "patch_release":
                if re.search(r'(patch|update|version|release).*\d+\.\d+', content, re.I):
                    return 0.25, "patch_release_detected"
            elif condition == "major_bug_fix":
                if re.search(r'(fix|patch|solve).*(critical|major|bug)', content, re.I):
                    return 0.2, "major_bug_fix_detected"
            elif condition == "github_release":
                if "/releases/tag/" in content or "release" in title_lower:
                    return 0.25, "github_release_detected"

        return 0.0, ""

    def _detect_price_change(self, content: str, threshold: float = 0.10) -> bool:
        """Return True if *content* mentions a large percentage or a move word.

        Args:
            content: Lowercased text to scan.
            threshold: Minimum change as a fraction (0.10 means 10%).
        """
        for percent in re.findall(r'(\d+(?:\.\d+)?)\s*%', content):
            # Divide the percentage rather than multiplying the threshold:
            # e.g. 0.07 * 100 is slightly above 7 in floating point.
            if float(percent) / 100 >= threshold:
                return True

        return any(word in content for word in self._PRICE_MOVE_WORDS)

    @staticmethod
    def _count_terms(terms, text: str) -> int:
        """Count how many *terms* start at a word boundary somewhere in *text*.

        Only the start is anchored, so inflections such as "crashes" still
        match "crash" while "unstable" no longer matches "stable".
        """
        return sum(
            1 for term in terms
            if re.search(r"(?<!\w)" + re.escape(term), text)
        )

    def _score_sentiment(self, title: str, snippet: str) -> str:
        """Return a coarse sentiment label for the title and snippet.

        Returns:
            "mixed" when both positive and negative terms occur, otherwise
            "negative", "positive" or "neutral".
        """
        text = f"{title} {snippet}".lower()
        pos = self._count_terms(self._POSITIVE_TERMS, text)
        neg = self._count_terms(self._NEGATIVE_TERMS, text)

        if pos and neg:
            return "mixed"
        if neg:
            return "negative"
        if pos:
            return "positive"
        return "neutral"
|
||||
|
||||
|
||||
def score_result(result: Dict, topic: Dict, settings: Dict) -> Tuple[str, float, str, str]:
    """Score one result with a throwaway ``ImportanceScorer``.

    Args:
        result: The finding to score.
        topic: Topic configuration passed to the scorer.
        settings: Global settings passed to the scorer.

    Returns:
        Whatever ``ImportanceScorer.score`` returns for ``result``:
        (priority, score, reason, sentiment).
    """
    return ImportanceScorer(topic, settings).score(result)
|
||||
Reference in New Issue
Block a user