Files
kesslerio_finance-news/tests/test_ranking.py

71 lines
2.8 KiB
Python
Raw Normal View History

import sys
from pathlib import Path
import pytest
from datetime import datetime, timedelta
# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
from ranking import calculate_score, rank_headlines, classify_category
def test_classify_category():
    """Check that sample headlines are tagged with the expected categories."""
    # Each pair is (headline text, label that must appear in the result).
    expected_labels = [
        ("Fed signals rate cut", "macro"),
        ("Apple earnings beat", "equities"),
        ("Oil prices surge", "energy"),
        ("AI chip demand remains high", "tech"),
        ("US imposes new sanctions on Russia", "geopolitics"),
    ]
    for headline, label in expected_labels:
        assert label in classify_category(headline)

    # This unrelated headline is expected to yield exactly ["general"].
    unrelated = classify_category("Weather is nice")
    assert unrelated == ["general"]
def test_calculate_score_impact():
    """Verify that a rate-cut headline scores above a local-interest one."""

    def make_headline(title, source):
        # Stamp each headline with the current time when it is built.
        return {
            "title": title,
            "source": source,
            "published_at": datetime.now().isoformat(),
        }

    scoring_weights = {
        "market_impact": 0.4,
        "novelty": 0.2,
        "breadth": 0.2,
        "credibility": 0.1,
        "diversity": 0.1,
    }
    seen_categories = {}

    major = make_headline("Fed announces emergency rate cut", "Reuters")
    minor = make_headline("Local coffee shop opens", "Blog")

    # NOTE(review): the same category-count dict is handed to both calls;
    # if calculate_score mutates it, the second score depends on the first
    # call -- confirm against the ranking module.
    major_score = calculate_score(major, scoring_weights, seen_categories)
    minor_score = calculate_score(minor, scoring_weights, seen_categories)

    assert major_score > minor_score
def test_rank_headlines_deduplication():
    """Confirm that a re-worded copy of a headline is counted only once."""
    first_report = {"title": "Fed signals rate cut in March", "source": "WSJ"}
    # Same words as the first report; only letter case and trailing
    # punctuation differ, so it should be treated as a duplicate.
    shouted_copy = {
        "title": "FED SIGNALS RATE CUT IN MARCH!!!",
        "source": "Reuters",
    }
    other_story = {"title": "Apple earnings are out", "source": "CNBC"}

    ranked = rank_headlines([first_report, shouted_copy, other_story])

    # Three inputs with one duplicate pair leave two unique headlines.
    unique_total = ranked["after_dedupe"]
    assert unique_total == 2

    # The must_read section can hold at most those two survivors.
    must_read_size = len(ranked["must_read"])
    assert must_read_size <= 2
def test_rank_headlines_sorting():
    """Check the order of the first two must_read entries.

    Three headlines are ranked; the Fed headline is expected in first
    place and the Nvidia headline in second place of ``must_read``.
    """
    headlines = [
        {"title": "Local news", "source": "SmallBlog", "description": "Nothing much"},
        {"title": "FED EMERGENCY RATE CUT", "source": "Bloomberg", "description": "Huge market impact"},
        {"title": "Nvidia Earnings Surprise", "source": "Reuters", "description": "AI demand surges"},
    ]

    result = rank_headlines(headlines)
    must_read = result["must_read"]

    # Guard the positional checks below: without this, a must_read list
    # shorter than two items fails with an opaque IndexError instead of a
    # readable assertion message.
    assert len(must_read) >= 2, (
        f"expected at least 2 must_read items, got {len(must_read)}"
    )

    # FED should be first due to macro impact + credibility
    assert "FED" in must_read[0]["title"]
    assert "Nvidia" in must_read[1]["title"]
def test_source_cap():
    """Ensure one source cannot fill the ranked output on its own."""
    # Ten stories with distinct titles, all attributed to the same outlet.
    single_source_feed = []
    for idx in range(10):
        single_source_feed.append({"title": f"Story {idx}", "source": "Reuters"})

    ranked = rank_headlines(single_source_feed)

    # Count Reuters items in each section, reading must_read before scan.
    reuters_total = 0
    for section in ("must_read", "scan"):
        reuters_total += sum(
            1 for item in ranked[section] if item["source"] == "Reuters"
        )

    # Default source cap is 2, applied across both sections combined.
    assert reuters_total <= 2