Initial commit with translated description

2026-03-29 13:06:50 +08:00
commit 673a554c43
6 changed files with 724 additions and 0 deletions
--- a/scripts/detect.py
+++ b/scripts/detect.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""Detect AI patterns in text based on Wikipedia's Signs of AI Writing."""
+import argparse, json, re, sys
+from pathlib import Path
+from dataclasses import dataclass, field
+
+SCRIPT_DIR = Path(__file__).parent
+PATTERNS = json.loads((SCRIPT_DIR / "patterns.json").read_text())
+
+@dataclass
+class DetectionResult:
+    significance_inflation: list = field(default_factory=list)
+    notability_emphasis: list = field(default_factory=list)
+    superficial_analysis: list = field(default_factory=list)
+    promotional_language: list = field(default_factory=list)
+    vague_attributions: list = field(default_factory=list)
+    challenges_formula: list = field(default_factory=list)
+    ai_vocabulary: list = field(default_factory=list)
+    copula_avoidance: list = field(default_factory=list)
+    filler_phrases: list = field(default_factory=list)
+    chatbot_artifacts: list = field(default_factory=list)
+    hedging_phrases: list = field(default_factory=list)
+    negative_parallelisms: list = field(default_factory=list)
+    rule_of_three: list = field(default_factory=list)
+    markdown_artifacts: list = field(default_factory=list)
+    citation_bugs: list = field(default_factory=list)
+    knowledge_cutoff: list = field(default_factory=list)
+    curly_quotes: int = 0
+    em_dashes: int = 0
+    total_issues: int = 0
+    ai_probability: str = "low"
+    word_count: int = 0
+
+def find_matches(text: str, patterns: list) -> list:
+    matches, lower = [], text.lower()
+    for p in patterns:
+        count = lower.count(p.lower())
+        if count > 0:
+            matches.append((p, count))
+    return sorted(matches, key=lambda x: -x[1])
+
+def detect(text: str) -> DetectionResult:
+    r = DetectionResult()
+    r.word_count = len(text.split())
+    r.significance_inflation = find_matches(text, PATTERNS["significance_inflation"])
+    r.notability_emphasis = find_matches(text, PATTERNS["notability_emphasis"])
+    r.superficial_analysis = find_matches(text, PATTERNS["superficial_analysis"])
+    r.promotional_language = find_matches(text, PATTERNS["promotional_language"])
+    r.vague_attributions = find_matches(text, PATTERNS["vague_attributions"])
+    r.challenges_formula = find_matches(text, PATTERNS["challenges_formula"])
+    r.ai_vocabulary = find_matches(text, PATTERNS["ai_vocabulary"])
+    r.copula_avoidance = find_matches(text, list(PATTERNS["copula_avoidance"].keys()))
+    r.filler_phrases = find_matches(text, list(PATTERNS["filler_replacements"].keys()))
+    r.chatbot_artifacts = find_matches(text, PATTERNS["chatbot_artifacts"])
+    r.hedging_phrases = find_matches(text, PATTERNS["hedging_phrases"])
+    r.negative_parallelisms = find_matches(text, PATTERNS["negative_parallelisms"])
+    r.rule_of_three = find_matches(text, PATTERNS["rule_of_three_patterns"])
+    r.markdown_artifacts = find_matches(text, PATTERNS["markdown_artifacts"])
+    r.citation_bugs = find_matches(text, PATTERNS["citation_bugs"])
+    r.knowledge_cutoff = find_matches(text, PATTERNS["knowledge_cutoff"])
+    r.curly_quotes = len(re.findall(r'[""'']', text))
+    r.em_dashes = text.count("—") + text.count(" -- ")
+    
+    r.total_issues = (
+        sum(c for _, c in r.significance_inflation) + sum(c for _, c in r.notability_emphasis) +
+        sum(c for _, c in r.superficial_analysis) + sum(c for _, c in r.promotional_language) +
+        sum(c for _, c in r.vague_attributions) + sum(c for _, c in r.challenges_formula) +
+        sum(c for _, c in r.ai_vocabulary) + sum(c for _, c in r.copula_avoidance) +
+        sum(c for _, c in r.filler_phrases) + sum(c for _, c in r.chatbot_artifacts) * 3 +
+        sum(c for _, c in r.hedging_phrases) + sum(c for _, c in r.negative_parallelisms) +
+        sum(c for _, c in r.markdown_artifacts) * 2 + sum(c for _, c in r.citation_bugs) * 5 +
+        sum(c for _, c in r.knowledge_cutoff) * 3 + r.curly_quotes + (r.em_dashes if r.em_dashes > 3 else 0)
+    )
+    
+    density = r.total_issues / max(r.word_count, 1) * 100
+    if r.citation_bugs or r.knowledge_cutoff or r.chatbot_artifacts:
+        r.ai_probability = "very high"
+    elif density > 5 or r.total_issues > 30:
+        r.ai_probability = "high"
+    elif density > 2 or r.total_issues > 15:
+        r.ai_probability = "medium"
+    return r
+
+def print_section(title: str, items: list, replacements: dict = None):
+    if not items:
+        return
+    print(f"{title}:")
+    for phrase, count in items:
+        if replacements and phrase in replacements:
+            repl = replacements[phrase]
+            arrow = f' → "{repl}"' if repl else " → (remove)"
+            print(f"  • \"{phrase}\"{arrow}: {count}x")
+        else:
+            print(f"  • {phrase}: {count}x")
+    print()
+
+def print_report(r: DetectionResult):
+    icons = {"very high": "🔴", "high": "🟠", "medium": "🟡", "low": "🟢"}
+    print(f"\n{'='*60}")
+    print(f"AI DETECTION SCAN - {r.total_issues} issues ({r.word_count} words)")
+    print(f"AI Probability: {icons.get(r.ai_probability, '')} {r.ai_probability.upper()}")
+    print(f"{'='*60}\n")
+    
+    if r.citation_bugs:
+        print("⚠️  CRITICAL: CHATGPT CITATION BUGS")
+        print_section("Citation Artifacts", r.citation_bugs)
+    if r.knowledge_cutoff:
+        print("⚠️  CRITICAL: KNOWLEDGE CUTOFF PHRASES")
+        print_section("Cutoff Phrases", r.knowledge_cutoff)
+    if r.chatbot_artifacts:
+        print("⚠️  HIGH: CHATBOT ARTIFACTS")
+        print_section("Artifacts", r.chatbot_artifacts)
+    if r.markdown_artifacts:
+        print("⚠️  MARKDOWN DETECTED")
+        print_section("Markdown", r.markdown_artifacts)
+    
+    print_section("SIGNIFICANCE INFLATION", r.significance_inflation)
+    print_section("PROMOTIONAL LANGUAGE", r.promotional_language)
+    print_section("AI VOCABULARY", r.ai_vocabulary)
+    print_section("SUPERFICIAL -ING", r.superficial_analysis)
+    print_section("COPULA AVOIDANCE", r.copula_avoidance, PATTERNS["copula_avoidance"])
+    print_section("FILLER PHRASES", r.filler_phrases, PATTERNS["filler_replacements"])
+    print_section("VAGUE ATTRIBUTIONS", r.vague_attributions)
+    print_section("CHALLENGES FORMULA", r.challenges_formula)
+    print_section("HEDGING", r.hedging_phrases)
+    print_section("NEGATIVE PARALLELISMS", r.negative_parallelisms)
+    print_section("NOTABILITY EMPHASIS", r.notability_emphasis)
+    
+    if r.curly_quotes:
+        print(f"CURLY QUOTES: {r.curly_quotes} (ChatGPT signature)\n")
+    if r.em_dashes > 3:
+        print(f"EM DASHES: {r.em_dashes} (excessive)\n")
+    if r.total_issues == 0:
+        print("✓ No AI patterns detected.\n")
+
+def main():
+    parser = argparse.ArgumentParser(description="Detect AI patterns in text")
+    parser.add_argument("input", nargs="?", help="Input file (or stdin)")
+    parser.add_argument("--json", "-j", action="store_true", help="JSON output")
+    parser.add_argument("--score-only", "-s", action="store_true", help="Score and probability only")
+    args = parser.parse_args()
+    
+    text = Path(args.input).read_text() if args.input else sys.stdin.read()
+    result = detect(text)
+    
+    if args.json:
+        print(json.dumps({
+            "total_issues": result.total_issues, "word_count": result.word_count,
+            "ai_probability": result.ai_probability, "significance_inflation": result.significance_inflation,
+            "promotional_language": result.promotional_language, "ai_vocabulary": result.ai_vocabulary,
+            "chatbot_artifacts": result.chatbot_artifacts, "citation_bugs": result.citation_bugs,
+            "filler_phrases": result.filler_phrases, "curly_quotes": result.curly_quotes, "em_dashes": result.em_dashes,
+        }, indent=2))
+    elif args.score_only:
+        print(f"Issues: {result.total_issues} | Words: {result.word_count} | AI: {result.ai_probability}")
+    else:
+        print_report(result)
+
+if __name__ == "__main__":
+    main()