Files
moltbro_humanize-ai-text/scripts/detect.py

161 lines
7.7 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""Detect AI patterns in text based on Wikipedia's Signs of AI Writing."""
import argparse, json, re, sys
from pathlib import Path
from dataclasses import dataclass, field
SCRIPT_DIR = Path(__file__).parent
PATTERNS = json.loads((SCRIPT_DIR / "patterns.json").read_text())
@dataclass
class DetectionResult:
significance_inflation: list = field(default_factory=list)
notability_emphasis: list = field(default_factory=list)
superficial_analysis: list = field(default_factory=list)
promotional_language: list = field(default_factory=list)
vague_attributions: list = field(default_factory=list)
challenges_formula: list = field(default_factory=list)
ai_vocabulary: list = field(default_factory=list)
copula_avoidance: list = field(default_factory=list)
filler_phrases: list = field(default_factory=list)
chatbot_artifacts: list = field(default_factory=list)
hedging_phrases: list = field(default_factory=list)
negative_parallelisms: list = field(default_factory=list)
rule_of_three: list = field(default_factory=list)
markdown_artifacts: list = field(default_factory=list)
citation_bugs: list = field(default_factory=list)
knowledge_cutoff: list = field(default_factory=list)
curly_quotes: int = 0
em_dashes: int = 0
total_issues: int = 0
ai_probability: str = "low"
word_count: int = 0
def find_matches(text: str, patterns: list) -> list:
matches, lower = [], text.lower()
for p in patterns:
count = lower.count(p.lower())
if count > 0:
matches.append((p, count))
return sorted(matches, key=lambda x: -x[1])
def detect(text: str) -> DetectionResult:
r = DetectionResult()
r.word_count = len(text.split())
r.significance_inflation = find_matches(text, PATTERNS["significance_inflation"])
r.notability_emphasis = find_matches(text, PATTERNS["notability_emphasis"])
r.superficial_analysis = find_matches(text, PATTERNS["superficial_analysis"])
r.promotional_language = find_matches(text, PATTERNS["promotional_language"])
r.vague_attributions = find_matches(text, PATTERNS["vague_attributions"])
r.challenges_formula = find_matches(text, PATTERNS["challenges_formula"])
r.ai_vocabulary = find_matches(text, PATTERNS["ai_vocabulary"])
r.copula_avoidance = find_matches(text, list(PATTERNS["copula_avoidance"].keys()))
r.filler_phrases = find_matches(text, list(PATTERNS["filler_replacements"].keys()))
r.chatbot_artifacts = find_matches(text, PATTERNS["chatbot_artifacts"])
r.hedging_phrases = find_matches(text, PATTERNS["hedging_phrases"])
r.negative_parallelisms = find_matches(text, PATTERNS["negative_parallelisms"])
r.rule_of_three = find_matches(text, PATTERNS["rule_of_three_patterns"])
r.markdown_artifacts = find_matches(text, PATTERNS["markdown_artifacts"])
r.citation_bugs = find_matches(text, PATTERNS["citation_bugs"])
r.knowledge_cutoff = find_matches(text, PATTERNS["knowledge_cutoff"])
r.curly_quotes = len(re.findall(r'[""'']', text))
r.em_dashes = text.count("") + text.count(" -- ")
r.total_issues = (
sum(c for _, c in r.significance_inflation) + sum(c for _, c in r.notability_emphasis) +
sum(c for _, c in r.superficial_analysis) + sum(c for _, c in r.promotional_language) +
sum(c for _, c in r.vague_attributions) + sum(c for _, c in r.challenges_formula) +
sum(c for _, c in r.ai_vocabulary) + sum(c for _, c in r.copula_avoidance) +
sum(c for _, c in r.filler_phrases) + sum(c for _, c in r.chatbot_artifacts) * 3 +
sum(c for _, c in r.hedging_phrases) + sum(c for _, c in r.negative_parallelisms) +
sum(c for _, c in r.markdown_artifacts) * 2 + sum(c for _, c in r.citation_bugs) * 5 +
sum(c for _, c in r.knowledge_cutoff) * 3 + r.curly_quotes + (r.em_dashes if r.em_dashes > 3 else 0)
)
density = r.total_issues / max(r.word_count, 1) * 100
if r.citation_bugs or r.knowledge_cutoff or r.chatbot_artifacts:
r.ai_probability = "very high"
elif density > 5 or r.total_issues > 30:
r.ai_probability = "high"
elif density > 2 or r.total_issues > 15:
r.ai_probability = "medium"
return r
def print_section(title: str, items: list, replacements: dict = None):
if not items:
return
print(f"{title}:")
for phrase, count in items:
if replacements and phrase in replacements:
repl = replacements[phrase]
arrow = f'"{repl}"' if repl else " → (remove)"
print(f"\"{phrase}\"{arrow}: {count}x")
else:
print(f"{phrase}: {count}x")
print()
def print_report(r: DetectionResult):
icons = {"very high": "🔴", "high": "🟠", "medium": "🟡", "low": "🟢"}
print(f"\n{'='*60}")
print(f"AI DETECTION SCAN - {r.total_issues} issues ({r.word_count} words)")
print(f"AI Probability: {icons.get(r.ai_probability, '')} {r.ai_probability.upper()}")
print(f"{'='*60}\n")
if r.citation_bugs:
print("⚠️ CRITICAL: CHATGPT CITATION BUGS")
print_section("Citation Artifacts", r.citation_bugs)
if r.knowledge_cutoff:
print("⚠️ CRITICAL: KNOWLEDGE CUTOFF PHRASES")
print_section("Cutoff Phrases", r.knowledge_cutoff)
if r.chatbot_artifacts:
print("⚠️ HIGH: CHATBOT ARTIFACTS")
print_section("Artifacts", r.chatbot_artifacts)
if r.markdown_artifacts:
print("⚠️ MARKDOWN DETECTED")
print_section("Markdown", r.markdown_artifacts)
print_section("SIGNIFICANCE INFLATION", r.significance_inflation)
print_section("PROMOTIONAL LANGUAGE", r.promotional_language)
print_section("AI VOCABULARY", r.ai_vocabulary)
print_section("SUPERFICIAL -ING", r.superficial_analysis)
print_section("COPULA AVOIDANCE", r.copula_avoidance, PATTERNS["copula_avoidance"])
print_section("FILLER PHRASES", r.filler_phrases, PATTERNS["filler_replacements"])
print_section("VAGUE ATTRIBUTIONS", r.vague_attributions)
print_section("CHALLENGES FORMULA", r.challenges_formula)
print_section("HEDGING", r.hedging_phrases)
print_section("NEGATIVE PARALLELISMS", r.negative_parallelisms)
print_section("NOTABILITY EMPHASIS", r.notability_emphasis)
if r.curly_quotes:
print(f"CURLY QUOTES: {r.curly_quotes} (ChatGPT signature)\n")
if r.em_dashes > 3:
print(f"EM DASHES: {r.em_dashes} (excessive)\n")
if r.total_issues == 0:
print("✓ No AI patterns detected.\n")
def main():
parser = argparse.ArgumentParser(description="Detect AI patterns in text")
parser.add_argument("input", nargs="?", help="Input file (or stdin)")
parser.add_argument("--json", "-j", action="store_true", help="JSON output")
parser.add_argument("--score-only", "-s", action="store_true", help="Score and probability only")
args = parser.parse_args()
text = Path(args.input).read_text() if args.input else sys.stdin.read()
result = detect(text)
if args.json:
print(json.dumps({
"total_issues": result.total_issues, "word_count": result.word_count,
"ai_probability": result.ai_probability, "significance_inflation": result.significance_inflation,
"promotional_language": result.promotional_language, "ai_vocabulary": result.ai_vocabulary,
"chatbot_artifacts": result.chatbot_artifacts, "citation_bugs": result.citation_bugs,
"filler_phrases": result.filler_phrases, "curly_quotes": result.curly_quotes, "em_dashes": result.em_dashes,
}, indent=2))
elif args.score_only:
print(f"Issues: {result.total_issues} | Words: {result.word_count} | AI: {result.ai_probability}")
else:
print_report(result)
if __name__ == "__main__":
main()