Initial commit with translated description
This commit is contained in:
58
scripts/compare.py
Normal file
58
scripts/compare.py
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare before/after transformation with side-by-side detection scores."""
|
||||
import argparse, sys
|
||||
from pathlib import Path
|
||||
from detect import detect
|
||||
from transform import transform
|
||||
|
||||
def main():
    """CLI entry point: score the input, transform it, score it again, and
    print a side-by-side before/after comparison table."""
    parser = argparse.ArgumentParser(description="Compare AI detection before/after transformation")
    parser.add_argument("input", nargs="?", help="Input file (or stdin)")
    parser.add_argument("-a", "--aggressive", action="store_true", help="Use aggressive mode")
    parser.add_argument("-o", "--output", help="Save transformed text to file")
    args = parser.parse_args()

    # Read from the named file when given, otherwise consume stdin.
    source = Path(args.input).read_text() if args.input else sys.stdin.read()

    score_before = detect(source)
    rewritten, changes = transform(source, aggressive=args.aggressive)
    score_after = detect(rewritten)

    icons = {"very high": "🔴", "high": "🟠", "medium": "🟡", "low": "🟢"}
    rule = "=" * 60

    print(f"\n{rule}")
    print("BEFORE → AFTER COMPARISON")
    print(f"{rule}\n")

    print(f"{'Metric':<25} {'Before':<15} {'After':<15} {'Change':<10}")
    print("-" * 60)

    issue_diff = score_after.total_issues - score_before.total_issues
    # Negative diffs already render their own "-" sign.
    issue_sign = "+" if issue_diff > 0 else ""
    print(f"{'Issues':<25} {score_before.total_issues:<15} {score_after.total_issues:<15} {issue_sign}{issue_diff}")

    print(f"{'AI Probability':<25} {icons.get(score_before.ai_probability,'')} {score_before.ai_probability:<12} {icons.get(score_after.ai_probability,'')} {score_after.ai_probability:<12}")
    print(f"{'Word Count':<25} {score_before.word_count:<15} {score_after.word_count:<15} {score_after.word_count - score_before.word_count:+}")

    if changes:
        print(f"\n{rule}")
        print(f"TRANSFORMATIONS ({len(changes)})")
        print(f"{rule}")
        for entry in changes:
            print(f" • {entry}")

    reduction = score_before.total_issues - score_after.total_issues
    if reduction > 0:
        # Guard the percentage against a zero baseline.
        pct = (reduction / score_before.total_issues * 100) if score_before.total_issues else 0
        print(f"\n✓ Reduced {reduction} issues ({pct:.0f}% improvement)")
    elif reduction < 0:
        print(f"\n⚠ Issues increased by {-reduction}")
    else:
        print("\n— No change in issue count")

    if args.output:
        Path(args.output).write_text(rewritten)
        print(f"\n→ Saved to {args.output}")
||||
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
160
scripts/detect.py
Normal file
160
scripts/detect.py
Normal file
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Detect AI patterns in text based on Wikipedia's Signs of AI Writing."""
|
||||
import argparse, json, re, sys
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
SCRIPT_DIR = Path(__file__).parent
|
||||
PATTERNS = json.loads((SCRIPT_DIR / "patterns.json").read_text())
|
||||
|
||||
@dataclass
class DetectionResult:
    """Aggregated findings from one AI-writing scan of a single text."""

    # Phrase categories: each list holds (phrase, occurrence_count) pairs,
    # sorted by descending count (populated via find_matches in detect()).
    significance_inflation: list = field(default_factory=list)
    notability_emphasis: list = field(default_factory=list)
    superficial_analysis: list = field(default_factory=list)
    promotional_language: list = field(default_factory=list)
    vague_attributions: list = field(default_factory=list)
    challenges_formula: list = field(default_factory=list)
    ai_vocabulary: list = field(default_factory=list)
    copula_avoidance: list = field(default_factory=list)
    filler_phrases: list = field(default_factory=list)
    chatbot_artifacts: list = field(default_factory=list)
    hedging_phrases: list = field(default_factory=list)
    negative_parallelisms: list = field(default_factory=list)
    rule_of_three: list = field(default_factory=list)
    markdown_artifacts: list = field(default_factory=list)
    citation_bugs: list = field(default_factory=list)
    knowledge_cutoff: list = field(default_factory=list)
    # Character-level counters.
    curly_quotes: int = 0  # total curly quote characters seen
    em_dashes: int = 0  # em dashes plus spaced " -- " pairs
    # Derived summary fields, filled in at the end of detect().
    total_issues: int = 0
    ai_probability: str = "low"  # one of: low / medium / high / very high
    word_count: int = 0
|
||||
|
||||
def find_matches(text: str, patterns: list) -> list:
    """Count case-insensitive substring occurrences of each pattern in *text*.

    Returns (pattern, count) pairs for every pattern that occurs at least
    once, ordered by descending count; ties keep the input pattern order.
    """
    haystack = text.lower()
    counted = [(needle, haystack.count(needle.lower())) for needle in patterns]
    found = [pair for pair in counted if pair[1] > 0]
    # list.sort is stable, so equal counts preserve the original ordering,
    # exactly like sorting on the negated count.
    found.sort(key=lambda pair: pair[1], reverse=True)
    return found
|
||||
|
||||
def detect(text: str) -> DetectionResult:
    """Scan *text* for AI-writing signals and return a populated DetectionResult.

    Matching is case-insensitive substring counting driven by patterns.json;
    a weighted issue total and a coarse probability label are derived at the
    end from the per-category counts.
    """
    r = DetectionResult()
    r.word_count = len(text.split())

    # Phrase-list categories, counted straight from patterns.json.
    r.significance_inflation = find_matches(text, PATTERNS["significance_inflation"])
    r.notability_emphasis = find_matches(text, PATTERNS["notability_emphasis"])
    r.superficial_analysis = find_matches(text, PATTERNS["superficial_analysis"])
    r.promotional_language = find_matches(text, PATTERNS["promotional_language"])
    r.vague_attributions = find_matches(text, PATTERNS["vague_attributions"])
    r.challenges_formula = find_matches(text, PATTERNS["challenges_formula"])
    r.ai_vocabulary = find_matches(text, PATTERNS["ai_vocabulary"])
    # These two categories are stored as replacement maps in patterns.json,
    # so only their keys are searched here.
    r.copula_avoidance = find_matches(text, list(PATTERNS["copula_avoidance"].keys()))
    r.filler_phrases = find_matches(text, list(PATTERNS["filler_replacements"].keys()))
    r.chatbot_artifacts = find_matches(text, PATTERNS["chatbot_artifacts"])
    r.hedging_phrases = find_matches(text, PATTERNS["hedging_phrases"])
    r.negative_parallelisms = find_matches(text, PATTERNS["negative_parallelisms"])
    r.rule_of_three = find_matches(text, PATTERNS["rule_of_three_patterns"])
    r.markdown_artifacts = find_matches(text, PATTERNS["markdown_artifacts"])
    r.citation_bugs = find_matches(text, PATTERNS["citation_bugs"])
    r.knowledge_cutoff = find_matches(text, PATTERNS["knowledge_cutoff"])

    # Count curly quotes from the same table transform.py uses to fix them,
    # so detection and transformation cannot drift apart (the previous
    # hard-coded regex duplicated those characters inline).
    r.curly_quotes = sum(text.count(q) for q in PATTERNS["curly_quotes"])
    r.em_dashes = text.count("—") + text.count(" -- ")

    def total(items):
        # Sum the per-phrase hit counts for one category.
        return sum(count for _, count in items)

    # Weighted total: hard chatbot/citation/cutoff tells count extra, and
    # em dashes only count once they exceed a normal-usage allowance.
    # NOTE(review): rule_of_three is detected but excluded from the total —
    # confirm whether that is intentional.
    r.total_issues = (
        total(r.significance_inflation) + total(r.notability_emphasis) +
        total(r.superficial_analysis) + total(r.promotional_language) +
        total(r.vague_attributions) + total(r.challenges_formula) +
        total(r.ai_vocabulary) + total(r.copula_avoidance) +
        total(r.filler_phrases) + total(r.chatbot_artifacts) * 3 +
        total(r.hedging_phrases) + total(r.negative_parallelisms) +
        total(r.markdown_artifacts) * 2 + total(r.citation_bugs) * 5 +
        total(r.knowledge_cutoff) * 3 + r.curly_quotes +
        (r.em_dashes if r.em_dashes > 3 else 0)
    )

    # Probability label: any smoking-gun artifact outranks the density bands.
    density = r.total_issues / max(r.word_count, 1) * 100
    if r.citation_bugs or r.knowledge_cutoff or r.chatbot_artifacts:
        r.ai_probability = "very high"
    elif density > 5 or r.total_issues > 30:
        r.ai_probability = "high"
    elif density > 2 or r.total_issues > 15:
        r.ai_probability = "medium"
    return r
|
||||
|
||||
def print_section(title: str, items: list, replacements: dict = None):
    """Print one labelled group of (phrase, count) findings.

    Silent when *items* is empty. When a *replacements* map is supplied and
    contains the phrase, the suggested substitution (or a removal hint) is
    shown alongside the count.
    """
    if not items:
        return
    print(f"{title}:")
    for phrase, count in items:
        known_replacement = bool(replacements) and phrase in replacements
        if known_replacement:
            suggestion = replacements[phrase]
            if suggestion:
                arrow = f' → "{suggestion}"'
            else:
                arrow = " → (remove)"
            print(f' • "{phrase}"{arrow}: {count}x')
        else:
            print(f" • {phrase}: {count}x")
    print()
|
||||
|
||||
def print_report(r: DetectionResult):
    """Pretty-print a full human-readable report for one DetectionResult.

    Critical artifacts (citation bugs, knowledge-cutoff phrasing, chatbot
    boilerplate, markdown) are shown first with warning banners, then the
    phrase categories, then the character-level counters.
    """
    # Colored icon per probability band.
    icons = {"very high": "🔴", "high": "🟠", "medium": "🟡", "low": "🟢"}
    print(f"\n{'='*60}")
    print(f"AI DETECTION SCAN - {r.total_issues} issues ({r.word_count} words)")
    print(f"AI Probability: {icons.get(r.ai_probability, '')} {r.ai_probability.upper()}")
    print(f"{'='*60}\n")

    # Smoking-gun artifacts first, each with its own banner.
    if r.citation_bugs:
        print("⚠️ CRITICAL: CHATGPT CITATION BUGS")
        print_section("Citation Artifacts", r.citation_bugs)
    if r.knowledge_cutoff:
        print("⚠️ CRITICAL: KNOWLEDGE CUTOFF PHRASES")
        print_section("Cutoff Phrases", r.knowledge_cutoff)
    if r.chatbot_artifacts:
        print("⚠️ HIGH: CHATBOT ARTIFACTS")
        print_section("Artifacts", r.chatbot_artifacts)
    if r.markdown_artifacts:
        print("⚠️ MARKDOWN DETECTED")
        print_section("Markdown", r.markdown_artifacts)

    # Phrase categories; the two replacement-backed ones also show the
    # suggested substitution from patterns.json.
    # NOTE(review): r.rule_of_three is collected by detect() but never
    # printed here — confirm whether that is intended.
    print_section("SIGNIFICANCE INFLATION", r.significance_inflation)
    print_section("PROMOTIONAL LANGUAGE", r.promotional_language)
    print_section("AI VOCABULARY", r.ai_vocabulary)
    print_section("SUPERFICIAL -ING", r.superficial_analysis)
    print_section("COPULA AVOIDANCE", r.copula_avoidance, PATTERNS["copula_avoidance"])
    print_section("FILLER PHRASES", r.filler_phrases, PATTERNS["filler_replacements"])
    print_section("VAGUE ATTRIBUTIONS", r.vague_attributions)
    print_section("CHALLENGES FORMULA", r.challenges_formula)
    print_section("HEDGING", r.hedging_phrases)
    print_section("NEGATIVE PARALLELISMS", r.negative_parallelisms)
    print_section("NOTABILITY EMPHASIS", r.notability_emphasis)

    # Character-level signals, then the all-clear message.
    if r.curly_quotes:
        print(f"CURLY QUOTES: {r.curly_quotes} (ChatGPT signature)\n")
    if r.em_dashes > 3:
        print(f"EM DASHES: {r.em_dashes} (excessive)\n")
    if r.total_issues == 0:
        print("✓ No AI patterns detected.\n")
|
||||
|
||||
def main():
    """CLI entry point: read text, run detection, and emit either JSON,
    a one-line score summary, or the full report."""
    parser = argparse.ArgumentParser(description="Detect AI patterns in text")
    parser.add_argument("input", nargs="?", help="Input file (or stdin)")
    parser.add_argument("--json", "-j", action="store_true", help="JSON output")
    parser.add_argument("--score-only", "-s", action="store_true", help="Score and probability only")
    args = parser.parse_args()

    # Read from the named file when given, otherwise consume stdin.
    text = Path(args.input).read_text() if args.input else sys.stdin.read()
    result = detect(text)

    if args.json:
        payload = {
            "total_issues": result.total_issues,
            "word_count": result.word_count,
            "ai_probability": result.ai_probability,
            "significance_inflation": result.significance_inflation,
            "promotional_language": result.promotional_language,
            "ai_vocabulary": result.ai_vocabulary,
            "chatbot_artifacts": result.chatbot_artifacts,
            "citation_bugs": result.citation_bugs,
            "filler_phrases": result.filler_phrases,
            "curly_quotes": result.curly_quotes,
            "em_dashes": result.em_dashes,
        }
        print(json.dumps(payload, indent=2))
    elif args.score_only:
        print(f"Issues: {result.total_issues} | Words: {result.word_count} | AI: {result.ai_probability}")
    else:
        print_report(result)
|
||||
|
||||
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
191
scripts/patterns.json
Normal file
191
scripts/patterns.json
Normal file
@@ -0,0 +1,191 @@
|
||||
{
|
||||
"significance_inflation": [
|
||||
"stands as", "serves as", "is a testament", "is a reminder",
|
||||
"vital role", "significant role", "crucial role", "pivotal role",
|
||||
"key role", "pivotal moment", "key moment", "key turning point",
|
||||
"underscores its importance", "highlights its importance",
|
||||
"underscores its significance", "highlights its significance",
|
||||
"reflects broader", "symbolizing its ongoing", "symbolizing its enduring",
|
||||
"symbolizing its lasting", "contributing to the", "setting the stage for",
|
||||
"marking the", "shaping the", "represents a shift", "marks a shift",
|
||||
"evolving landscape", "focal point", "indelible mark", "deeply rooted",
|
||||
"enduring legacy", "rich tapestry", "broader movement"
|
||||
],
|
||||
"notability_emphasis": [
|
||||
"independent coverage", "local media outlets", "regional media outlets",
|
||||
"national media outlets", "music outlets", "business outlets", "tech outlets",
|
||||
"profiled in", "written by a leading expert", "active social media presence",
|
||||
"has been featured in", "has been cited in", "maintains a strong digital presence"
|
||||
],
|
||||
"superficial_analysis": [
|
||||
"highlighting", "underscoring", "emphasizing", "ensuring",
|
||||
"reflecting", "symbolizing", "contributing to", "cultivating",
|
||||
"fostering", "encompassing", "showcasing", "valuable insights",
|
||||
"align with", "aligns with", "resonate with", "resonates with"
|
||||
],
|
||||
"promotional_language": [
|
||||
"boasts a", "boasts an", "vibrant", "rich cultural heritage",
|
||||
"profound", "enhancing its", "exemplifies", "commitment to",
|
||||
"natural beauty", "nestled", "in the heart of", "groundbreaking",
|
||||
"renowned", "breathtaking", "must-visit", "stunning", "bustling",
|
||||
"game-changing", "cutting-edge", "state-of-the-art", "world-class",
|
||||
"best-in-class", "industry-leading", "innovative", "revolutionary"
|
||||
],
|
||||
"vague_attributions": [
|
||||
"industry reports", "observers have cited", "experts argue",
|
||||
"experts believe", "some critics argue", "several sources",
|
||||
"several publications", "according to experts", "widely regarded",
|
||||
"it is widely believed", "many believe", "some would say"
|
||||
],
|
||||
"challenges_formula": [
|
||||
"despite its", "faces several challenges", "despite these challenges",
|
||||
"challenges and legacy", "future outlook", "future prospects",
|
||||
"looking ahead", "moving forward", "going forward"
|
||||
],
|
||||
"ai_vocabulary": [
|
||||
"additionally", "align with", "crucial", "delve", "emphasizing",
|
||||
"enduring", "enhance", "fostering", "garner", "highlight",
|
||||
"interplay", "intricate", "intricacies", "key", "landscape",
|
||||
"pivotal", "showcase", "showcasing", "tapestry", "testament",
|
||||
"underscore", "underscores", "valuable", "vibrant", "nuanced",
|
||||
"multifaceted", "paradigm", "synergy", "realm", "underpins",
|
||||
"unraveling", "unveiling", "leveraging", "furthermore", "moreover",
|
||||
"consequently", "subsequently", "henceforth", "thereby", "wherein",
|
||||
"thereof", "whatsoever", "nevertheless", "notwithstanding"
|
||||
],
|
||||
"copula_avoidance": {
|
||||
"serves as a": "is a",
|
||||
"serves as an": "is an",
|
||||
"serves as the": "is the",
|
||||
"stands as a": "is a",
|
||||
"stands as an": "is an",
|
||||
"stands as the": "is the",
|
||||
"marks a": "is a",
|
||||
"marks an": "is an",
|
||||
"marks the": "is the",
|
||||
"represents a": "is a",
|
||||
"represents an": "is an",
|
||||
"represents the": "is the",
|
||||
"boasts a": "has a",
|
||||
"boasts an": "has an",
|
||||
"boasts the": "has the",
|
||||
"features a": "has a",
|
||||
"features an": "has an",
|
||||
"features the": "has the",
|
||||
"offers a": "has a",
|
||||
"offers an": "has an"
|
||||
},
|
||||
"filler_replacements": {
|
||||
"in order to": "to",
|
||||
"due to the fact that": "because",
|
||||
"at this point in time": "now",
|
||||
"at the present time": "now",
|
||||
"has the ability to": "can",
|
||||
"it is important to note that": "",
|
||||
"it should be noted that": "",
|
||||
"it is worth noting that": "",
|
||||
"it is crucial to note that": "",
|
||||
"it is critical to remember that": "",
|
||||
"it goes without saying that": "",
|
||||
"needless to say": "",
|
||||
"Additionally,": "",
|
||||
"Furthermore,": "",
|
||||
"Moreover,": "",
|
||||
"In conclusion,": "",
|
||||
"To summarize,": "",
|
||||
"In summary,": "",
|
||||
"Overall,": "",
|
||||
"utilize": "use",
|
||||
"utilizes": "uses",
|
||||
"utilizing": "using",
|
||||
"utilization": "use",
|
||||
"leverage": "use",
|
||||
"leverages": "uses",
|
||||
"leveraging": "using",
|
||||
"facilitate": "help",
|
||||
"facilitates": "helps",
|
||||
"facilitating": "helping",
|
||||
"implement": "add",
|
||||
"implements": "adds",
|
||||
"prioritize": "focus on",
|
||||
"prioritizes": "focuses on",
|
||||
"optimize": "improve",
|
||||
"optimizes": "improves",
|
||||
"streamline": "simplify",
|
||||
"streamlines": "simplifies"
|
||||
},
|
||||
"chatbot_artifacts": [
|
||||
"I hope this helps",
|
||||
"Let me know if",
|
||||
"Would you like me to",
|
||||
"Great question",
|
||||
"Excellent question",
|
||||
"You're absolutely right",
|
||||
"That's a great point",
|
||||
"That's an excellent point",
|
||||
"Certainly!",
|
||||
"Of course!",
|
||||
"Absolutely!",
|
||||
"Happy to help",
|
||||
"I'd be happy to",
|
||||
"Feel free to",
|
||||
"Don't hesitate to",
|
||||
"Here is a",
|
||||
"Here's a",
|
||||
"I can help you with",
|
||||
"As an AI",
|
||||
"As a language model",
|
||||
"As an AI language model"
|
||||
],
|
||||
"hedging_phrases": [
|
||||
"it could potentially",
|
||||
"it might possibly",
|
||||
"arguably",
|
||||
"it could be argued that",
|
||||
"some would say",
|
||||
"in some ways",
|
||||
"to some extent",
|
||||
"in certain respects",
|
||||
"may vary",
|
||||
"results may vary"
|
||||
],
|
||||
"negative_parallelisms": [
|
||||
"not only", "but also",
|
||||
"it's not just about", "it's about",
|
||||
"it is not merely", "it is",
|
||||
"not just", "but",
|
||||
"no longer", "instead"
|
||||
],
|
||||
"rule_of_three_patterns": [
|
||||
"innovation, inspiration, and",
|
||||
"engage, educate, and",
|
||||
"plan, execute, and",
|
||||
"design, develop, and",
|
||||
"research, develop, and",
|
||||
"create, collaborate, and",
|
||||
"learn, grow, and"
|
||||
],
|
||||
"markdown_artifacts": [
|
||||
"**", "##", "###", "```",
|
||||
"* **", "- **", "1. **"
|
||||
],
|
||||
"citation_bugs": [
|
||||
"oaicite", "oai_citation", "contentReference",
|
||||
"turn0search", "turn0image", "utm_source=chatgpt",
|
||||
"utm_source=openai", "attached_file", "grok_card"
|
||||
],
|
||||
"knowledge_cutoff": [
|
||||
"as of my last", "as of my knowledge",
|
||||
"up to my last training", "based on available information",
|
||||
"while specific details are limited",
|
||||
"not widely available", "not widely documented",
|
||||
"in the provided sources", "in available sources"
|
||||
],
|
||||
"curly_quotes": {
|
||||
"\u201c": "\"",
|
||||
"\u201d": "\"",
|
||||
"\u2018": "'",
|
||||
"\u2019": "'"
|
||||
},
|
||||
"em_dash_patterns": ["—", " — ", "-- "]
|
||||
}
|
||||
127
scripts/transform.py
Normal file
127
scripts/transform.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Transform AI text to bypass detection."""
|
||||
import argparse, json, re, sys
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).parent
|
||||
PATTERNS = json.loads((SCRIPT_DIR / "patterns.json").read_text())
|
||||
|
||||
def replace_bounded(text: str, old: str, new: str) -> tuple[str, int]:
    """Replace every occurrence of *old* with *new*, case-insensitively.

    Single words are anchored with \\b word boundaries so substrings inside
    longer words are left alone; multi-word phrases and comma-suffixed
    connectors are matched verbatim. Returns the updated text and the
    number of occurrences found.
    """
    if " " in old or old.endswith(","):
        body = re.escape(old)
    else:
        body = r"\b" + re.escape(old) + r"\b"
    pattern = re.compile(body, re.IGNORECASE)
    hits = len(pattern.findall(text))
    if hits:
        text = pattern.sub(new, text)
    return text, hits
|
||||
|
||||
def apply_replacements(text: str, replacements: dict) -> tuple[str, list]:
    """Apply each old→new substitution and log a description per phrase hit."""
    log = []
    for phrase, substitute in replacements.items():
        text, hits = replace_bounded(text, phrase, substitute)
        if not hits:
            continue
        if substitute:
            log.append(f'"{phrase}" → "{substitute}"')
        else:
            log.append(f'"{phrase}" removed')
    return text, log
|
||||
|
||||
def fix_quotes(text: str) -> tuple[str, bool]:
    """Convert curly quotes to straight ASCII; report whether anything changed."""
    fixed = text
    for curly, straight in PATTERNS["curly_quotes"].items():
        fixed = fixed.replace(curly, straight)
    return fixed, fixed != text
|
||||
|
||||
def remove_chatbot_sentences(text: str) -> tuple[str, list]:
    """Delete whole sentences that contain chatbot boilerplate phrases."""
    log = []
    for phrase in PATTERNS["chatbot_artifacts"]:
        # Match the entire surrounding sentence (up to the next terminator
        # or newline), plus any trailing whitespace.
        sentence_re = re.compile(
            r"[^.!?\n]*" + re.escape(phrase) + r"[^.!?\n]*[.!?]?\s*",
            re.IGNORECASE,
        )
        if sentence_re.search(text):
            log.append(f'Removed "{phrase}" sentence')
            text = sentence_re.sub("", text)
    return text, log
|
||||
|
||||
def strip_markdown(text: str) -> tuple[str, list]:
    """Strip Markdown bold markers, ATX headers, and code fences."""
    log = []
    if "**" in text:
        text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # keep the inner text
        log.append("Stripped bold")
    has_header = re.search(r'^#{1,6}\s', text, re.MULTILINE) is not None
    if has_header:
        text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
        log.append("Stripped headers")
    if text.find("```") != -1:
        text = re.sub(r'```\w*\n?', '', text)
        log.append("Stripped code blocks")
    return text, log
|
||||
|
||||
def reduce_em_dashes(text: str) -> tuple[str, int]:
    """Turn em dashes (and spaced double hyphens) into commas.

    Returns the updated text and how many dashes were present beforehand.
    """
    total = text.count("—") + text.count(" -- ")
    text = re.sub(r"\s*—\s*", ", ", text)
    return re.sub(r"\s+--\s+", ", ", text), total
|
||||
|
||||
def remove_citations(text: str) -> tuple[str, list]:
    """Strip ChatGPT citation artifacts (oai markers, turn refs, UTM tags)."""
    log = []
    artifact_patterns = (
        (r'\[oai_citation:\d+[^\]]*\]\([^)]+\)', "oai_citation"),
        (r':contentReference\[oaicite:\d+\]\{[^}]+\}', "contentReference"),
        (r'turn0search\d+', "turn0search"),
        (r'turn0image\d+', "turn0image"),
        (r'\?utm_source=(chatgpt\.com|openai)', "ChatGPT UTM"),
    )
    for raw, label in artifact_patterns:
        compiled = re.compile(raw)
        if compiled.search(text):
            text = compiled.sub('', text)
            log.append(f"Removed {label}")
    return text, log
|
||||
|
||||
def simplify_ing(text: str) -> tuple[str, list]:
    """Cut trailing '-ing' commentary clauses (used in aggressive mode only)."""
    log = []
    gerunds = ("highlighting", "underscoring", "emphasizing", "showcasing", "fostering")
    for gerund in gerunds:
        # The clause runs from an optional preceding comma through the next
        # comma or period.
        clause = re.compile(rf',?\s*{gerund}\s+[^,.]+[,.]', re.IGNORECASE)
        if clause.search(text):
            text = clause.sub('. ', text)
            log.append(f"Simplified {gerund} clause")
    return text, log
|
||||
|
||||
def clean(text: str) -> str:
    """Normalize whitespace and punctuation artifacts left by the substitutions."""
    text = re.sub(r" +", " ", text)          # collapse runs of spaces
    text = re.sub(r"\n{3,}", "\n\n", text)   # cap consecutive blank lines
    text = re.sub(r",\s*,", ",", text)       # doubled commas from removals

    def _capitalize(match):
        # Re-capitalize a sentence start that a removal may have exposed.
        return match.group(1) + match.group(2).upper()

    text = re.sub(r"(^|[.!?]\s+)([a-z])", _capitalize, text)
    return text.strip()
|
||||
|
||||
def transform(text: str, aggressive: bool = False) -> tuple[str, list]:
    """Run the full de-AI pipeline over *text*.

    Steps: citation/markdown/chatbot cleanup, phrase replacements, quote
    normalization, optional aggressive clause removal, em-dash reduction,
    then a final whitespace pass. Returns (cleaned text, change log).
    """
    log = []

    for step in (remove_citations, strip_markdown, remove_chatbot_sentences):
        text, step_changes = step(text)
        log.extend(step_changes)

    for table in ("copula_avoidance", "filler_replacements"):
        text, step_changes = apply_replacements(text, PATTERNS[table])
        log.extend(step_changes)

    text, quotes_changed = fix_quotes(text)
    if quotes_changed:
        log.append("Fixed curly quotes")

    if aggressive:
        text, step_changes = simplify_ing(text)
        log.extend(step_changes)

    text, dash_count = reduce_em_dashes(text)
    if dash_count > 2:
        log.append(f"Replaced {dash_count} em dashes")

    return clean(text), log
|
||||
|
||||
def main():
    """CLI entry point: transform input text and write it to a file or stdout,
    logging the applied changes to stderr unless --quiet is given."""
    parser = argparse.ArgumentParser(description="Transform AI text to human-like")
    parser.add_argument("input", nargs="?", help="Input file (or stdin)")
    parser.add_argument("-o", "--output", help="Output file")
    parser.add_argument("-a", "--aggressive", action="store_true", help="Aggressive mode")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress change log")
    args = parser.parse_args()

    # Read from the named file when given, otherwise consume stdin.
    text = Path(args.input).read_text() if args.input else sys.stdin.read()
    result, changes = transform(text, aggressive=args.aggressive)

    # Change log goes to stderr so stdout stays clean for piping.
    if not args.quiet and changes:
        print(f"CHANGES ({len(changes)}):", file=sys.stderr)
        for entry in changes:
            print(f" • {entry}", file=sys.stderr)

    if args.output:
        Path(args.output).write_text(result)
        if not args.quiet:
            print(f"→ {args.output}", file=sys.stderr)
    else:
        print(result)
|
||||
|
||||
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user