tests/test_ranking.py

import sys
from pathlib import Path
import pytest
from datetime import datetime, timedelta

# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))

from ranking import calculate_score, rank_headlines, classify_category

def test_classify_category():
    assert "macro" in classify_category("Fed signals rate cut")
    assert "equities" in classify_category("Apple earnings beat")
    assert "energy" in classify_category("Oil prices surge")
    assert "tech" in classify_category("AI chip demand remains high")
    assert "geopolitics" in classify_category("US imposes new sanctions on Russia")
    assert classify_category("Weather is nice") == ["general"]

def test_calculate_score_impact():
    weights = {"market_impact": 0.4, "novelty": 0.2, "breadth": 0.2, "credibility": 0.1, "diversity": 0.1}
    category_counts = {}
    
    high_impact = {"title": "Fed announces emergency rate cut", "source": "Reuters", "published_at": datetime.now().isoformat()}
    low_impact = {"title": "Local coffee shop opens", "source": "Blog", "published_at": datetime.now().isoformat()}
    
    score_high = calculate_score(high_impact, weights, category_counts)
    score_low = calculate_score(low_impact, weights, category_counts)
    
    assert score_high > score_low

def test_rank_headlines_deduplication():
    headlines = [
        {"title": "Fed signals rate cut in March", "source": "WSJ"},
        {"title": "FED SIGNALS RATE CUT IN MARCH!!!", "source": "Reuters"}, # Dupe
        {"title": "Apple earnings are out", "source": "CNBC"}
    ]
    
    result = rank_headlines(headlines)
    
    # After dedupe, we should have 2 unique headlines
    assert result["after_dedupe"] == 2
    # must_read should contain the best ones
    assert len(result["must_read"]) <= 2

def test_rank_headlines_sorting():
    headlines = [
        {"title": "Local news", "source": "SmallBlog", "description": "Nothing much"},
        {"title": "FED EMERGENCY RATE CUT", "source": "Bloomberg", "description": "Huge market impact"},
        {"title": "Nvidia Earnings Surprise", "source": "Reuters", "description": "AI demand surges"}
    ]
    
    result = rank_headlines(headlines)
    
    # FED should be first due to macro impact + credibility
    assert "FED" in result["must_read"][0]["title"]
    assert "Nvidia" in result["must_read"][1]["title"]

def test_source_cap():
    # Test that we don't have too many items from the same source
    headlines = [
        {"title": f"Story {i}", "source": "Reuters"} for i in range(10)
    ]
    
    # Default source cap is 2
    result = rank_headlines(headlines)
    
    reuters_in_must_read = [h for h in result["must_read"] if h["source"] == "Reuters"]
    reuters_in_scan = [h for h in result["scan"] if h["source"] == "Reuters"]
    
    assert len(reuters_in_must_read) + len(reuters_in_scan) <= 2
Initial commit with translated description 2026-03-29 10:21:46 +08:00			`import sys`
			`from pathlib import Path`
			`import pytest`
			`from datetime import datetime, timedelta`

			`# Add scripts to path`
			`sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))`

			`from ranking import calculate_score, rank_headlines, classify_category`

			`def test_classify_category():`
			`assert "macro" in classify_category("Fed signals rate cut")`
			`assert "equities" in classify_category("Apple earnings beat")`
			`assert "energy" in classify_category("Oil prices surge")`
			`assert "tech" in classify_category("AI chip demand remains high")`
			`assert "geopolitics" in classify_category("US imposes new sanctions on Russia")`
			`assert classify_category("Weather is nice") == ["general"]`

			`def test_calculate_score_impact():`
			`weights = {"market_impact": 0.4, "novelty": 0.2, "breadth": 0.2, "credibility": 0.1, "diversity": 0.1}`
			`category_counts = {}`

			`high_impact = {"title": "Fed announces emergency rate cut", "source": "Reuters", "published_at": datetime.now().isoformat()}`
			`low_impact = {"title": "Local coffee shop opens", "source": "Blog", "published_at": datetime.now().isoformat()}`

			`score_high = calculate_score(high_impact, weights, category_counts)`
			`score_low = calculate_score(low_impact, weights, category_counts)`

			`assert score_high > score_low`

			`def test_rank_headlines_deduplication():`
			`headlines = [`
			`{"title": "Fed signals rate cut in March", "source": "WSJ"},`
			`{"title": "FED SIGNALS RATE CUT IN MARCH!!!", "source": "Reuters"}, # Dupe`
			`{"title": "Apple earnings are out", "source": "CNBC"}`
			`]`

			`result = rank_headlines(headlines)`

			`# After dedupe, we should have 2 unique headlines`
			`assert result["after_dedupe"] == 2`
			`# must_read should contain the best ones`
			`assert len(result["must_read"]) <= 2`

			`def test_rank_headlines_sorting():`
			`headlines = [`
			`{"title": "Local news", "source": "SmallBlog", "description": "Nothing much"},`
			`{"title": "FED EMERGENCY RATE CUT", "source": "Bloomberg", "description": "Huge market impact"},`
			`{"title": "Nvidia Earnings Surprise", "source": "Reuters", "description": "AI demand surges"}`
			`]`

			`result = rank_headlines(headlines)`

			`# FED should be first due to macro impact + credibility`
			`assert "FED" in result["must_read"][0]["title"]`
			`assert "Nvidia" in result["must_read"][1]["title"]`

			`def test_source_cap():`
			`# Test that we don't have too many items from the same source`
			`headlines = [`
			`{"title": f"Story {i}", "source": "Reuters"} for i in range(10)`
			`]`

			`# Default source cap is 2`
			`result = rank_headlines(headlines)`

			`reuters_in_must_read = [h for h in result["must_read"] if h["source"] == "Reuters"]`
			`reuters_in_scan = [h for h in result["scan"] if h["source"] == "Reuters"]`

			`assert len(reuters_in_must_read) + len(reuters_in_scan) <= 2`