kesslerio_finance-news/tests/test_ranking.py

import sys
from pathlib import Path
import pytest
from datetime import datetime, timedelta

# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))

from ranking import calculate_score, rank_headlines, classify_category

def test_classify_category():
    assert "macro" in classify_category("Fed signals rate cut")
    assert "equities" in classify_category("Apple earnings beat")
    assert "energy" in classify_category("Oil prices surge")
    assert "tech" in classify_category("AI chip demand remains high")
    assert "geopolitics" in classify_category("US imposes new sanctions on Russia")
    assert classify_category("Weather is nice") == ["general"]

def test_calculate_score_impact():
    weights = {"market_impact": 0.4, "novelty": 0.2, "breadth": 0.2, "credibility": 0.1, "diversity": 0.1}
    category_counts = {}

    high_impact = {"title": "Fed announces emergency rate cut", "source": "Reuters", "published_at": datetime.now().isoformat()}
    low_impact = {"title": "Local coffee shop opens", "source": "Blog", "published_at": datetime.now().isoformat()}

    score_high = calculate_score(high_impact, weights, category_counts)
    score_low = calculate_score(low_impact, weights, category_counts)

    assert score_high > score_low

def test_rank_headlines_deduplication():
    headlines = [
        {"title": "Fed signals rate cut in March", "source": "WSJ"},
        {"title": "FED SIGNALS RATE CUT IN MARCH!!!", "source": "Reuters"}, # Dupe
        {"title": "Apple earnings are out", "source": "CNBC"}
    ]

    result = rank_headlines(headlines)

    # After dedupe, we should have 2 unique headlines
    assert result["after_dedupe"] == 2
    # must_read should contain the best ones
    assert len(result["must_read"]) <= 2

def test_rank_headlines_sorting():
    headlines = [
        {"title": "Local news", "source": "SmallBlog", "description": "Nothing much"},
        {"title": "FED EMERGENCY RATE CUT", "source": "Bloomberg", "description": "Huge market impact"},
        {"title": "Nvidia Earnings Surprise", "source": "Reuters", "description": "AI demand surges"}
    ]

    result = rank_headlines(headlines)

    # FED should be first due to macro impact + credibility
    assert "FED" in result["must_read"][0]["title"]
    assert "Nvidia" in result["must_read"][1]["title"]

def test_source_cap():
    # Test that we don't have too many items from the same source
    headlines = [
        {"title": f"Story {i}", "source": "Reuters"} for i in range(10)
    ]

    # Default source cap is 2
    result = rank_headlines(headlines)

    reuters_in_must_read = [h for h in result["must_read"] if h["source"] == "Reuters"]
    reuters_in_scan = [h for h in result["scan"] if h["source"] == "Reuters"]

    assert len(reuters_in_must_read) + len(reuters_in_scan) <= 2