scripts/ranking.py

#!/usr/bin/env python3
"""
Deterministic Headline Ranking - Impact-based ranking policy.

Implements #53: Deterministic impact-based ranking for headline selection.

Scoring Rubric (weights):
- Market Impact (40%): CB decisions, earnings, sanctions, oil spikes
- Novelty (20%): New vs recycled news
- Breadth (20%): Sector-wide vs single-stock
- Credibility (10%): Source reliability
- Diversity Bonus (10%): Underrepresented categories

Output:
- MUST_READ: Top 5 stories
- SCAN: 3-5 additional stories (if quality threshold met)
"""

import re
from datetime import datetime
from difflib import SequenceMatcher


# Category keywords for classification.
# Maps each category name to the keywords that place a headline in that
# category. classify_category() checks them against the lowercased
# "title description" text, so every keyword here is written in lowercase.
# A headline may land in several categories; the order of the entries below
# is the order in which matching categories are reported.
CATEGORY_KEYWORDS = {
    "macro": ["fed", "ecb", "boj", "central bank", "rate", "inflation", "gdp", "unemployment", "treasury", "yield", "bond"],
    "equities": ["earnings", "revenue", "profit", "eps", "guidance", "beat", "miss", "upgrade", "downgrade", "target"],
    "geopolitics": ["sanction", "tariff", "war", "conflict", "embargo", "trump", "china", "russia", "ukraine", "iran", "trade war"],
    "energy": ["oil", "opec", "crude", "gas", "energy", "brent", "wti"],
    "tech": ["ai", "chip", "semiconductor", "nvidia", "apple", "google", "microsoft", "meta", "amazon"],
}

# Source credibility scores (0-1).
# Keys are outlet names as they appear in an article's "source" field;
# score_credibility() looks the name up verbatim and uses 0.5 for any outlet
# that is not listed here.
SOURCE_CREDIBILITY = {
    "Wall Street Journal": 0.95,
    "WSJ": 0.95,
    "Bloomberg": 0.95,
    "Reuters": 0.90,
    "Financial Times": 0.90,
    "CNBC": 0.80,
    "Yahoo Finance": 0.70,
    "MarketWatch": 0.75,
    "Barron's": 0.85,
    "Seeking Alpha": 0.60,
    "Tagesschau": 0.85,
    "Handelsblatt": 0.80,
}

# Default config.
# rank_headlines() overlays caller-supplied config keys on top of these
# defaults.
DEFAULT_CONFIG = {
    # Title similarity (0-1) above which a headline counts as a duplicate.
    "dedupe_threshold": 0.7,
    # Maximum number of items returned under "must_read".
    "must_read_count": 5,
    # Maximum number of items returned under "scan".
    "scan_count": 5,
    # Minimum _score for an article to be a must_read candidate.
    "must_read_min_score": 0.4,
    # Minimum _score for an article to be a scan candidate.
    "scan_min_score": 0.25,
    # Maximum number of articles kept per source after ranking.
    "source_cap": 2,
    # Per-component weights used by calculate_score(); they sum to 1.0.
    "weights": {
        "market_impact": 0.40,
        "novelty": 0.20,
        "breadth": 0.20,
        "credibility": 0.10,
        "diversity": 0.10,
    },
}


def normalize_title(title: str) -> str:
    """Return a canonical form of *title* for similarity comparison.

    The title is lowercased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space, and whitespace runs are collapsed to
    single spaces with nothing leading or trailing. A falsy title (``None``
    or ``""``) yields ``""``.
    """
    lowered = title.lower() if title else ""
    # split()/join collapses the padding introduced by the substitution.
    return " ".join(re.sub(r"[^a-z0-9\s]", " ", lowered).split())


def title_similarity(a: str, b: str) -> float:
    """Return the SequenceMatcher ratio (0-1) of two normalized titles.

    Both titles are passed through ``normalize_title`` before comparison.
    If either title is falsy the result is ``0.0``.
    """
    if a and b:
        left, right = normalize_title(a), normalize_title(b)
        return SequenceMatcher(None, left, right).ratio()
    return 0.0


def deduplicate_headlines(headlines: list[dict], threshold: float = 0.7) -> list[dict]:
    """Drop headlines whose title is too similar to one already kept.

    Articles are visited in input order; an article is kept unless its title
    has a ``title_similarity`` strictly greater than *threshold* with the
    title of an article kept earlier. So the first occurrence of each story
    wins.

    Args:
        headlines: Article dicts; the ``"title"`` key is compared.
        threshold: Similarity above which two titles count as duplicates.

    Returns:
        A new list holding the kept article dicts (same objects as the input).
    """
    kept: list[dict] = []
    for candidate in headlines or []:
        candidate_title = candidate.get("title", "")
        already_seen = any(
            title_similarity(candidate_title, prior.get("title", "")) > threshold
            for prior in kept
        )
        if not already_seen:
            kept.append(candidate)
    return kept


def classify_category(title: str, description: str = "") -> list[str]:
    """Classify headline into categories based on keywords."""
    text = f"{title} {description}".lower()
    categories = []
    
    for category, keywords in CATEGORY_KEYWORDS.items():
        for keyword in keywords:
            if keyword in text:
                categories.append(category)
                break
    
    return categories if categories else ["general"]


def score_market_impact(title: str, description: str = "") -> float:
    """Score market impact (0-1) from impact terms in the headline text.

    Starts from a base of 0.3, adds 0.15 for each distinct high-impact term
    and 0.1 for each distinct medium-impact term found in the lowercased
    "title description" text, and caps the result at 1.0.

    Terms only match at the start of a word, so "war" does not fire on
    "award" or "software" and "oil" does not fire on "turmoil". Suffixes
    still match ("tariff" matches "tariffs", "fed" matches "federal").

    Args:
        title: Headline text.
        description: Optional summary text searched together with the title.

    Returns:
        The impact score, between 0.3 and 1.0.
    """
    text = f"{title} {description}".lower()

    def mentions(term: str) -> bool:
        # \b anchors the term at a word start to avoid mid-word hits.
        return re.search(r"\b" + re.escape(term), text) is not None

    score = 0.3  # Base score

    # High impact indicators
    high_impact = ["fed", "rate cut", "rate hike", "earnings", "guidance", "sanctions", "war", "oil", "recession"]
    for term in high_impact:
        if mentions(term):
            score += 0.15

    # Medium impact
    medium_impact = ["profit", "revenue", "gdp", "inflation", "tariff", "merger", "acquisition"]
    for term in medium_impact:
        if mentions(term):
            score += 0.1

    return min(score, 1.0)


def score_novelty(article: dict) -> float:
    """Score novelty (0-1) from how recently the article was published.

    ``article["published_at"]`` may be an ISO-8601 string (a trailing "Z" is
    accepted) or a datetime-like object. Age is measured against
    ``datetime.now`` in the timestamp's own tzinfo. Younger articles score
    higher: under 2h -> 1.0, under 6h -> 0.8, under 12h -> 0.6,
    under 24h -> 0.4, otherwise 0.2.

    Returns 0.5 when the timestamp is missing or cannot be interpreted.
    """
    stamp = article.get("published_at")
    if not stamp:
        return 0.5  # Unknown = medium

    try:
        moment = (
            datetime.fromisoformat(stamp.replace("Z", "+00:00"))
            if isinstance(stamp, str)
            else stamp
        )
        age_hours = (datetime.now(moment.tzinfo) - moment).total_seconds() / 3600

        # (upper bound in hours, score) pairs, checked from freshest to oldest.
        for limit, value in ((2, 1.0), (6, 0.8), (12, 0.6), (24, 0.4)):
            if age_hours < limit:
                return value
        return 0.2
    except Exception:
        return 0.5


def score_breadth(categories: list[str]) -> float:
    """Score breadth of impact (0-1) from a headline's categories.

    Macro or geopolitics headlines score 0.9, energy headlines 0.7, any
    other headline spanning more than one category 0.6, and everything else
    0.4.
    """
    if any(wide in categories for wide in ("macro", "geopolitics")):
        return 0.9
    if "energy" in categories:
        return 0.7
    return 0.6 if len(categories) > 1 else 0.4


def score_credibility(source: str) -> float:
    """Return the credibility score (0-1) for *source*.

    The name is looked up verbatim in ``SOURCE_CREDIBILITY``; sources that
    are not listed there score 0.5.
    """
    if source in SOURCE_CREDIBILITY:
        return SOURCE_CREDIBILITY[source]
    return 0.5


def calculate_score(article: dict, weights: dict, category_counts: dict) -> float:
    """Compute the weighted ranking score for one headline.

    Combines market impact, novelty, breadth, source credibility and a
    diversity bonus using *weights* (missing weights fall back to
    0.4 / 0.2 / 0.2 / 0.1 / 0.1).

    Side effects: writes ``_categories``, ``_score``, ``_impact`` and
    ``_novelty`` onto *article*; the stored score values are rounded to
    three decimals.

    Args:
        article: Headline dict; ``title``, ``description``, ``source`` and
            ``published_at`` are read.
        weights: Mapping of component name to weight.
        category_counts: How many already-scored headlines fell into each
            category; read only, never modified here.

    Returns:
        The unrounded weighted score.
    """
    headline = article.get("title", "")
    summary = article.get("description", "")
    categories = classify_category(headline, summary)
    article["_categories"] = categories  # Kept for the selection steps

    impact = score_market_impact(headline, summary)
    novelty = score_novelty(article)
    breadth = score_breadth(categories)
    credibility = score_credibility(article.get("source", ""))

    # Diversity bonus is driven by the headline's least-represented category:
    # unseen so far -> 0.5, seen once -> 0.3, otherwise no bonus.
    rarest = min((category_counts.get(cat, 0) for cat in categories), default=2)
    if rarest < 1:
        diversity = 0.5
    elif rarest < 2:
        diversity = 0.3
    else:
        diversity = 0.0

    weighted_impact = impact * weights.get("market_impact", 0.4)
    weighted_novelty = novelty * weights.get("novelty", 0.2)
    weighted_breadth = breadth * weights.get("breadth", 0.2)
    weighted_credibility = credibility * weights.get("credibility", 0.1)
    weighted_diversity = diversity * weights.get("diversity", 0.1)
    score = (
        weighted_impact
        + weighted_novelty
        + weighted_breadth
        + weighted_credibility
        + weighted_diversity
    )

    article["_score"] = round(score, 3)
    article["_impact"] = round(impact, 3)
    article["_novelty"] = round(novelty, 3)

    return score


def apply_source_cap(ranked: list[dict], cap: int = 2) -> list[dict]:
    """Keep at most *cap* articles per source, preserving input order.

    Articles are visited in the given order, so when *ranked* is sorted
    best-first the best articles of each outlet survive. Articles without a
    ``"source"`` key are counted together under ``"Unknown"``.

    Returns:
        A new list with the surviving article dicts.
    """
    taken: dict = {}
    kept = []

    for item in ranked:
        outlet = item.get("source", "Unknown")
        seen = taken.get(outlet, 0)
        if seen >= cap:
            continue  # this outlet has already used its quota
        taken[outlet] = seen + 1
        kept.append(item)

    return kept


def ensure_diversity(selected: list[dict], candidates: list[dict], required: list[str]) -> list[dict]:
    """Add one article per required category that *selected* does not cover.

    For each category in *required* (in order) that no chosen article
    carries in its ``_categories``, the first article in *candidates* that
    has the category and is not already chosen is appended. Categories with
    no available candidate are skipped.

    Returns:
        A new list: the original selection followed by any added articles.
        *selected* itself is not modified.
    """
    chosen = list(selected)
    covered = {cat for item in chosen for cat in item.get("_categories", [])}

    for wanted in required:
        if wanted in covered:
            continue
        pick = next(
            (
                option
                for option in candidates
                if option not in chosen and wanted in option.get("_categories", [])
            ),
            None,
        )
        if pick is not None:
            chosen.append(pick)
            # Only the requested category is marked as covered.
            covered.add(wanted)

    return chosen


def rank_headlines(headlines: list[dict], config: dict | None = None) -> dict:
    """
    Rank headlines deterministically.
    
    Args:
        headlines: List of headline dicts with title, source, description, etc.
        config: Optional config overrides
    
    Returns:
        {"must_read": [...], "scan": [...]}
    """
    cfg = {**DEFAULT_CONFIG, **(config or {})}
    weights = cfg.get("weights", DEFAULT_CONFIG["weights"])
    
    if not headlines:
        return {"must_read": [], "scan": []}
    
    # Step 1: Deduplicate
    unique = deduplicate_headlines(headlines, cfg["dedupe_threshold"])
    
    # Step 2: Score all headlines
    category_counts = {}
    for article in unique:
        calculate_score(article, weights, category_counts)
        for cat in article.get("_categories", []):
            category_counts[cat] = category_counts.get(cat, 0) + 1
    
    # Step 3: Sort by score
    ranked = sorted(unique, key=lambda x: x.get("_score", 0), reverse=True)
    
    # Step 4: Apply source cap
    capped = apply_source_cap(ranked, cfg["source_cap"])
    
    # Step 5: Select must_read with diversity quota
    # Leave room for diversity additions by taking count-1 initially
    must_read_candidates = [a for a in capped if a.get("_score", 0) >= cfg["must_read_min_score"]]
    must_read_count = cfg["must_read_count"]
    must_read = must_read_candidates[:max(1, must_read_count - 2)]  # Reserve 2 slots for diversity
    must_read = ensure_diversity(must_read, capped, ["macro", "equities", "geopolitics"])
    must_read = must_read[:must_read_count]  # Final trim to exact count
    
    # Step 6: Select scan (additional items)
    scan_candidates = [a for a in capped if a not in must_read and a.get("_score", 0) >= cfg["scan_min_score"]]
    scan = scan_candidates[:cfg["scan_count"]]
    
    return {
        "must_read": must_read,
        "scan": scan,
        "total_processed": len(headlines),
        "after_dedupe": len(unique),
    }


if __name__ == "__main__":
    # Manual smoke run: rank a handful of sample headlines and print both lists.
    sample_headlines = [
        {"title": "Fed signals rate cut in March", "source": "WSJ", "description": "Federal Reserve hints at policy shift"},
        {"title": "Apple earnings beat expectations", "source": "CNBC", "description": "Revenue up 15%"},
        {"title": "Oil prices surge on OPEC cuts", "source": "Reuters", "description": "Brent crude hits $90"},
        {"title": "China-US trade tensions escalate", "source": "Bloomberg", "description": "New tariffs announced"},
        {"title": "Tech stocks rally on AI optimism", "source": "Yahoo Finance", "description": "Nvidia leads gains"},
        # Meant as a re-report of the WSJ story above.
        {"title": "Fed hints at rate reduction", "source": "MarketWatch", "description": "Same story as WSJ"},
    ]

    ranking = rank_headlines(sample_headlines)
    for heading, bucket in (("MUST_READ:", "must_read"), ("\nSCAN:", "scan")):
        print(heading)
        for item in ranking[bucket]:
            print(f"  [{item['_score']:.2f}] {item['title']} ({item['source']})")
Initial commit with translated description 2026-03-29 10:21:46 +08:00			`#!/usr/bin/env python3`
			`"""`
			`Deterministic Headline Ranking - Impact-based ranking policy.`

			`Implements #53: Deterministic impact-based ranking for headline selection.`

			`Scoring Rubric (weights):`
			`- Market Impact (40%): CB decisions, earnings, sanctions, oil spikes`
			`- Novelty (20%): New vs recycled news`
			`- Breadth (20%): Sector-wide vs single-stock`
			`- Credibility (10%): Source reliability`
			`- Diversity Bonus (10%): Underrepresented categories`

			`Output:`
			`- MUST_READ: Top 5 stories`
			`- SCAN: 3-5 additional stories (if quality threshold met)`
			`"""`

			`import re`
			`from datetime import datetime`
			`from difflib import SequenceMatcher`


			`# Category keywords for classification`
			`CATEGORY_KEYWORDS = {`
			`"macro": ["fed", "ecb", "boj", "central bank", "rate", "inflation", "gdp", "unemployment", "treasury", "yield", "bond"],`
			`"equities": ["earnings", "revenue", "profit", "eps", "guidance", "beat", "miss", "upgrade", "downgrade", "target"],`
			`"geopolitics": ["sanction", "tariff", "war", "conflict", "embargo", "trump", "china", "russia", "ukraine", "iran", "trade war"],`
			`"energy": ["oil", "opec", "crude", "gas", "energy", "brent", "wti"],`
			`"tech": ["ai", "chip", "semiconductor", "nvidia", "apple", "google", "microsoft", "meta", "amazon"],`
			`}`

			`# Source credibility scores (0-1)`
			`SOURCE_CREDIBILITY = {`
			`"Wall Street Journal": 0.95,`
			`"WSJ": 0.95,`
			`"Bloomberg": 0.95,`
			`"Reuters": 0.90,`
			`"Financial Times": 0.90,`
			`"CNBC": 0.80,`
			`"Yahoo Finance": 0.70,`
			`"MarketWatch": 0.75,`
			`"Barron's": 0.85,`
			`"Seeking Alpha": 0.60,`
			`"Tagesschau": 0.85,`
			`"Handelsblatt": 0.80,`
			`}`

			`# Default config`
			`DEFAULT_CONFIG = {`
			`"dedupe_threshold": 0.7,`
			`"must_read_count": 5,`
			`"scan_count": 5,`
			`"must_read_min_score": 0.4,`
			`"scan_min_score": 0.25,`
			`"source_cap": 2,`
			`"weights": {`
			`"market_impact": 0.40,`
			`"novelty": 0.20,`
			`"breadth": 0.20,`
			`"credibility": 0.10,`
			`"diversity": 0.10,`
			`},`
			`}`


			`def normalize_title(title: str) -> str:`
			`"""Normalize title for comparison."""`
			`if not title:`
			`return ""`
			`cleaned = re.sub(r"[^a-z0-9\s]", " ", title.lower())`
			`tokens = cleaned.split()`
			`return " ".join(tokens)`


			`def title_similarity(a: str, b: str) -> float:`
			`"""Calculate title similarity using SequenceMatcher."""`
			`if not a or not b:`
			`return 0.0`
			`return SequenceMatcher(None, normalize_title(a), normalize_title(b)).ratio()`


			`def deduplicate_headlines(headlines: list[dict], threshold: float = 0.7) -> list[dict]:`
			`"""Remove duplicate headlines by title similarity."""`
			`if not headlines:`
			`return []`

			`unique = []`
			`for article in headlines:`
			`title = article.get("title", "")`
			`is_dupe = False`
			`for existing in unique:`
			`if title_similarity(title, existing.get("title", "")) > threshold:`
			`is_dupe = True`
			`break`
			`if not is_dupe:`
			`unique.append(article)`

			`return unique`


			`def classify_category(title: str, description: str = "") -> list[str]:`
			`"""Classify headline into categories based on keywords."""`
			`text = f"{title} {description}".lower()`
			`categories = []`

			`for category, keywords in CATEGORY_KEYWORDS.items():`
			`for keyword in keywords:`
			`if keyword in text:`
			`categories.append(category)`
			`break`

			`return categories if categories else ["general"]`


			`def score_market_impact(title: str, description: str = "") -> float:`
			`"""Score market impact (0-1)."""`
			`text = f"{title} {description}".lower()`
			`score = 0.3 # Base score`

			`# High impact indicators`
			`high_impact = ["fed", "rate cut", "rate hike", "earnings", "guidance", "sanctions", "war", "oil", "recession"]`
			`for term in high_impact:`
			`if term in text:`
			`score += 0.15`

			`# Medium impact`
			`medium_impact = ["profit", "revenue", "gdp", "inflation", "tariff", "merger", "acquisition"]`
			`for term in medium_impact:`
			`if term in text:`
			`score += 0.1`

			`return min(score, 1.0)`


			`def score_novelty(article: dict) -> float:`
			`"""Score novelty based on recency (0-1)."""`
			`published_at = article.get("published_at")`
			`if not published_at:`
			`return 0.5 # Unknown = medium`

			`try:`
			`if isinstance(published_at, str):`
			`pub_time = datetime.fromisoformat(published_at.replace("Z", "+00:00"))`
			`else:`
			`pub_time = published_at`

			`hours_old = (datetime.now(pub_time.tzinfo) - pub_time).total_seconds() / 3600`

			`if hours_old < 2:`
			`return 1.0`
			`elif hours_old < 6:`
			`return 0.8`
			`elif hours_old < 12:`
			`return 0.6`
			`elif hours_old < 24:`
			`return 0.4`
			`else:`
			`return 0.2`
			`except Exception:`
			`return 0.5`


			`def score_breadth(categories: list[str]) -> float:`
			`"""Score breadth - sector-wide vs single-stock (0-1)."""`
			`# More categories = broader impact`
			`if "macro" in categories or "geopolitics" in categories:`
			`return 0.9`
			`if "energy" in categories:`
			`return 0.7`
			`if len(categories) > 1:`
			`return 0.6`
			`return 0.4`


			`def score_credibility(source: str) -> float:`
			`"""Score source credibility (0-1)."""`
			`return SOURCE_CREDIBILITY.get(source, 0.5)`


			`def calculate_score(article: dict, weights: dict, category_counts: dict) -> float:`
			`"""Calculate overall score for a headline."""`
			`title = article.get("title", "")`
			`description = article.get("description", "")`
			`source = article.get("source", "")`
			`categories = classify_category(title, description)`
			`article["_categories"] = categories # Store for later use`

			`# Component scores`
			`impact = score_market_impact(title, description)`
			`novelty = score_novelty(article)`
			`breadth = score_breadth(categories)`
			`credibility = score_credibility(source)`

			`# Diversity bonus - boost underrepresented categories`
			`diversity = 0.0`
			`for cat in categories:`
			`if category_counts.get(cat, 0) < 1:`
			`diversity = 0.5`
			`break`
			`elif category_counts.get(cat, 0) < 2:`
			`diversity = 0.3`

			`# Weighted sum`
			`score = (`
			`impact * weights.get("market_impact", 0.4) +`
			`novelty * weights.get("novelty", 0.2) +`
			`breadth * weights.get("breadth", 0.2) +`
			`credibility * weights.get("credibility", 0.1) +`
			`diversity * weights.get("diversity", 0.1)`
			`)`

			`article["_score"] = round(score, 3)`
			`article["_impact"] = round(impact, 3)`
			`article["_novelty"] = round(novelty, 3)`

			`return score`


			`def apply_source_cap(ranked: list[dict], cap: int = 2) -> list[dict]:`
			`"""Apply source cap - max N items per outlet."""`
			`source_counts = {}`
			`result = []`

			`for article in ranked:`
			`source = article.get("source", "Unknown")`
			`if source_counts.get(source, 0) < cap:`
			`result.append(article)`
			`source_counts[source] = source_counts.get(source, 0) + 1`

			`return result`


			`def ensure_diversity(selected: list[dict], candidates: list[dict], required: list[str]) -> list[dict]:`
			`"""Ensure at least one headline from required categories if available."""`
			`result = list(selected)`
			`covered = set()`

			`for article in result:`
			`for cat in article.get("_categories", []):`
			`covered.add(cat)`

			`for req_cat in required:`
			`if req_cat not in covered:`
			`# Find candidate from this category`
			`for candidate in candidates:`
			`if candidate not in result and req_cat in candidate.get("_categories", []):`
			`result.append(candidate)`
			`covered.add(req_cat)`
			`break`

			`return result`


			`def rank_headlines(headlines: list[dict], config: dict \| None = None) -> dict:`
			`"""`
			`Rank headlines deterministically.`

			`Args:`
			`headlines: List of headline dicts with title, source, description, etc.`
			`config: Optional config overrides`

			`Returns:`
			`{"must_read": [...], "scan": [...]}`
			`"""`
			`cfg = {**DEFAULT_CONFIG, **(config or {})}`
			`weights = cfg.get("weights", DEFAULT_CONFIG["weights"])`

			`if not headlines:`
			`return {"must_read": [], "scan": []}`

			`# Step 1: Deduplicate`
			`unique = deduplicate_headlines(headlines, cfg["dedupe_threshold"])`

			`# Step 2: Score all headlines`
			`category_counts = {}`
			`for article in unique:`
			`calculate_score(article, weights, category_counts)`
			`for cat in article.get("_categories", []):`
			`category_counts[cat] = category_counts.get(cat, 0) + 1`

			`# Step 3: Sort by score`
			`ranked = sorted(unique, key=lambda x: x.get("_score", 0), reverse=True)`

			`# Step 4: Apply source cap`
			`capped = apply_source_cap(ranked, cfg["source_cap"])`

			`# Step 5: Select must_read with diversity quota`
			`# Leave room for diversity additions by taking count-1 initially`
			`must_read_candidates = [a for a in capped if a.get("_score", 0) >= cfg["must_read_min_score"]]`
			`must_read_count = cfg["must_read_count"]`
			`must_read = must_read_candidates[:max(1, must_read_count - 2)] # Reserve 2 slots for diversity`
			`must_read = ensure_diversity(must_read, capped, ["macro", "equities", "geopolitics"])`
			`must_read = must_read[:must_read_count] # Final trim to exact count`

			`# Step 6: Select scan (additional items)`
			`scan_candidates = [a for a in capped if a not in must_read and a.get("_score", 0) >= cfg["scan_min_score"]]`
			`scan = scan_candidates[:cfg["scan_count"]]`

			`return {`
			`"must_read": must_read,`
			`"scan": scan,`
			`"total_processed": len(headlines),`
			`"after_dedupe": len(unique),`
			`}`


			`if __name__ == "__main__":`
			`# Test with sample data`
			`test_headlines = [`
			`{"title": "Fed signals rate cut in March", "source": "WSJ", "description": "Federal Reserve hints at policy shift"},`
			`{"title": "Apple earnings beat expectations", "source": "CNBC", "description": "Revenue up 15%"},`
			`{"title": "Oil prices surge on OPEC cuts", "source": "Reuters", "description": "Brent crude hits $90"},`
			`{"title": "China-US trade tensions escalate", "source": "Bloomberg", "description": "New tariffs announced"},`
			`{"title": "Tech stocks rally on AI optimism", "source": "Yahoo Finance", "description": "Nvidia leads gains"},`
			`{"title": "Fed hints at rate reduction", "source": "MarketWatch", "description": "Same story as WSJ"}, # Dupe`
			`]`

			`result = rank_headlines(test_headlines)`
			`print("MUST_READ:")`
			`for h in result["must_read"]:`
			`print(f" [{h['_score']:.2f}] {h['title']} ({h['source']})")`
			`print("\nSCAN:")`
			`for h in result["scan"]:`
			`print(f" [{h['_score']:.2f}] {h['title']} ({h['source']})")`