Initial commit with translated description
This commit is contained in:
58
scripts/compare.py
Normal file
58
scripts/compare.py
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare before/after transformation with side-by-side detection scores."""
|
||||
import argparse, sys
|
||||
from pathlib import Path
|
||||
from detect import detect
|
||||
from transform import transform
|
||||
|
||||
def main():
    """CLI entry point: score the input, transform it, score it again, and
    print a side-by-side before/after comparison table."""
    parser = argparse.ArgumentParser(description="Compare AI detection before/after transformation")
    parser.add_argument("input", nargs="?", help="Input file (or stdin)")
    parser.add_argument("-a", "--aggressive", action="store_true", help="Use aggressive mode")
    parser.add_argument("-o", "--output", help="Save transformed text to file")
    args = parser.parse_args()

    # Read from the named file when given, otherwise consume stdin.
    source = Path(args.input).read_text() if args.input else sys.stdin.read()

    score_before = detect(source)
    rewritten, changes = transform(source, aggressive=args.aggressive)
    score_after = detect(rewritten)

    icons = {"very high": "🔴", "high": "🟠", "medium": "🟡", "low": "🟢"}
    rule = "=" * 60

    print(f"\n{rule}")
    print("BEFORE → AFTER COMPARISON")
    print(f"{rule}\n")

    print(f"{'Metric':<25} {'Before':<15} {'After':<15} {'Change':<10}")
    print("-" * 60)

    issue_diff = score_after.total_issues - score_before.total_issues
    # Negative diffs already render their own "-" sign.
    issue_sign = "+" if issue_diff > 0 else ""
    print(f"{'Issues':<25} {score_before.total_issues:<15} {score_after.total_issues:<15} {issue_sign}{issue_diff}")

    print(f"{'AI Probability':<25} {icons.get(score_before.ai_probability,'')} {score_before.ai_probability:<12} {icons.get(score_after.ai_probability,'')} {score_after.ai_probability:<12}")
    print(f"{'Word Count':<25} {score_before.word_count:<15} {score_after.word_count:<15} {score_after.word_count - score_before.word_count:+}")

    if changes:
        print(f"\n{rule}")
        print(f"TRANSFORMATIONS ({len(changes)})")
        print(f"{rule}")
        for entry in changes:
            print(f" • {entry}")

    reduction = score_before.total_issues - score_after.total_issues
    if reduction > 0:
        # Guard the percentage against a zero baseline.
        pct = (reduction / score_before.total_issues * 100) if score_before.total_issues else 0
        print(f"\n✓ Reduced {reduction} issues ({pct:.0f}% improvement)")
    elif reduction < 0:
        print(f"\n⚠ Issues increased by {-reduction}")
    else:
        print("\n— No change in issue count")

    if args.output:
        Path(args.output).write_text(rewritten)
        print(f"\n→ Saved to {args.output}")
||||
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
160
scripts/detect.py
Normal file
160
scripts/detect.py
Normal file
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Detect AI patterns in text based on Wikipedia's Signs of AI Writing."""
|
||||
import argparse, json, re, sys
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
SCRIPT_DIR = Path(__file__).parent
|
||||
PATTERNS = json.loads((SCRIPT_DIR / "patterns.json").read_text())
|
||||
|
||||
@dataclass
class DetectionResult:
    """Aggregated findings from one AI-writing scan of a single text."""

    # Phrase categories: each list holds (phrase, occurrence_count) pairs,
    # sorted by descending count (populated via find_matches in detect()).
    significance_inflation: list = field(default_factory=list)
    notability_emphasis: list = field(default_factory=list)
    superficial_analysis: list = field(default_factory=list)
    promotional_language: list = field(default_factory=list)
    vague_attributions: list = field(default_factory=list)
    challenges_formula: list = field(default_factory=list)
    ai_vocabulary: list = field(default_factory=list)
    copula_avoidance: list = field(default_factory=list)
    filler_phrases: list = field(default_factory=list)
    chatbot_artifacts: list = field(default_factory=list)
    hedging_phrases: list = field(default_factory=list)
    negative_parallelisms: list = field(default_factory=list)
    rule_of_three: list = field(default_factory=list)
    markdown_artifacts: list = field(default_factory=list)
    citation_bugs: list = field(default_factory=list)
    knowledge_cutoff: list = field(default_factory=list)
    # Character-level counters.
    curly_quotes: int = 0  # total curly quote characters seen
    em_dashes: int = 0  # em dashes plus spaced " -- " pairs
    # Derived summary fields, filled in at the end of detect().
    total_issues: int = 0
    ai_probability: str = "low"  # one of: low / medium / high / very high
    word_count: int = 0
|
||||
|
||||
def find_matches(text: str, patterns: list) -> list:
    """Count case-insensitive substring occurrences of each pattern in *text*.

    Returns (pattern, count) pairs for every pattern that occurs at least
    once, ordered by descending count; ties keep the input pattern order.
    """
    haystack = text.lower()
    counted = [(needle, haystack.count(needle.lower())) for needle in patterns]
    found = [pair for pair in counted if pair[1] > 0]
    # list.sort is stable, so equal counts preserve the original ordering,
    # exactly like sorting on the negated count.
    found.sort(key=lambda pair: pair[1], reverse=True)
    return found
|
||||
|
||||
def detect(text: str) -> DetectionResult:
    """Scan *text* for AI-writing signals and return a populated DetectionResult.

    Matching is case-insensitive substring counting driven by patterns.json;
    a weighted issue total and a coarse probability label are derived at the
    end from the per-category counts.
    """
    r = DetectionResult()
    r.word_count = len(text.split())

    # Phrase-list categories, counted straight from patterns.json.
    r.significance_inflation = find_matches(text, PATTERNS["significance_inflation"])
    r.notability_emphasis = find_matches(text, PATTERNS["notability_emphasis"])
    r.superficial_analysis = find_matches(text, PATTERNS["superficial_analysis"])
    r.promotional_language = find_matches(text, PATTERNS["promotional_language"])
    r.vague_attributions = find_matches(text, PATTERNS["vague_attributions"])
    r.challenges_formula = find_matches(text, PATTERNS["challenges_formula"])
    r.ai_vocabulary = find_matches(text, PATTERNS["ai_vocabulary"])
    # These two categories are stored as replacement maps in patterns.json,
    # so only their keys are searched here.
    r.copula_avoidance = find_matches(text, list(PATTERNS["copula_avoidance"].keys()))
    r.filler_phrases = find_matches(text, list(PATTERNS["filler_replacements"].keys()))
    r.chatbot_artifacts = find_matches(text, PATTERNS["chatbot_artifacts"])
    r.hedging_phrases = find_matches(text, PATTERNS["hedging_phrases"])
    r.negative_parallelisms = find_matches(text, PATTERNS["negative_parallelisms"])
    r.rule_of_three = find_matches(text, PATTERNS["rule_of_three_patterns"])
    r.markdown_artifacts = find_matches(text, PATTERNS["markdown_artifacts"])
    r.citation_bugs = find_matches(text, PATTERNS["citation_bugs"])
    r.knowledge_cutoff = find_matches(text, PATTERNS["knowledge_cutoff"])

    # Count curly quotes from the same table transform.py uses to fix them,
    # so detection and transformation cannot drift apart (the previous
    # hard-coded regex duplicated those characters inline).
    r.curly_quotes = sum(text.count(q) for q in PATTERNS["curly_quotes"])
    r.em_dashes = text.count("—") + text.count(" -- ")

    def total(items):
        # Sum the per-phrase hit counts for one category.
        return sum(count for _, count in items)

    # Weighted total: hard chatbot/citation/cutoff tells count extra, and
    # em dashes only count once they exceed a normal-usage allowance.
    # NOTE(review): rule_of_three is detected but excluded from the total —
    # confirm whether that is intentional.
    r.total_issues = (
        total(r.significance_inflation) + total(r.notability_emphasis) +
        total(r.superficial_analysis) + total(r.promotional_language) +
        total(r.vague_attributions) + total(r.challenges_formula) +
        total(r.ai_vocabulary) + total(r.copula_avoidance) +
        total(r.filler_phrases) + total(r.chatbot_artifacts) * 3 +
        total(r.hedging_phrases) + total(r.negative_parallelisms) +
        total(r.markdown_artifacts) * 2 + total(r.citation_bugs) * 5 +
        total(r.knowledge_cutoff) * 3 + r.curly_quotes +
        (r.em_dashes if r.em_dashes > 3 else 0)
    )

    # Probability label: any smoking-gun artifact outranks the density bands.
    density = r.total_issues / max(r.word_count, 1) * 100
    if r.citation_bugs or r.knowledge_cutoff or r.chatbot_artifacts:
        r.ai_probability = "very high"
    elif density > 5 or r.total_issues > 30:
        r.ai_probability = "high"
    elif density > 2 or r.total_issues > 15:
        r.ai_probability = "medium"
    return r
|
||||
|
||||
def print_section(title: str, items: list, replacements: dict = None):
    """Print one labelled group of (phrase, count) findings.

    Silent when *items* is empty. When a *replacements* map is supplied and
    contains the phrase, the suggested substitution (or a removal hint) is
    shown alongside the count.
    """
    if not items:
        return
    print(f"{title}:")
    for phrase, count in items:
        known_replacement = bool(replacements) and phrase in replacements
        if known_replacement:
            suggestion = replacements[phrase]
            if suggestion:
                arrow = f' → "{suggestion}"'
            else:
                arrow = " → (remove)"
            print(f' • "{phrase}"{arrow}: {count}x')
        else:
            print(f" • {phrase}: {count}x")
    print()
|
||||
|
||||
def print_report(r: DetectionResult):
    """Pretty-print a full human-readable report for one DetectionResult.

    Critical artifacts (citation bugs, knowledge-cutoff phrasing, chatbot
    boilerplate, markdown) are shown first with warning banners, then the
    phrase categories, then the character-level counters.
    """
    # Colored icon per probability band.
    icons = {"very high": "🔴", "high": "🟠", "medium": "🟡", "low": "🟢"}
    print(f"\n{'='*60}")
    print(f"AI DETECTION SCAN - {r.total_issues} issues ({r.word_count} words)")
    print(f"AI Probability: {icons.get(r.ai_probability, '')} {r.ai_probability.upper()}")
    print(f"{'='*60}\n")

    # Smoking-gun artifacts first, each with its own banner.
    if r.citation_bugs:
        print("⚠️ CRITICAL: CHATGPT CITATION BUGS")
        print_section("Citation Artifacts", r.citation_bugs)
    if r.knowledge_cutoff:
        print("⚠️ CRITICAL: KNOWLEDGE CUTOFF PHRASES")
        print_section("Cutoff Phrases", r.knowledge_cutoff)
    if r.chatbot_artifacts:
        print("⚠️ HIGH: CHATBOT ARTIFACTS")
        print_section("Artifacts", r.chatbot_artifacts)
    if r.markdown_artifacts:
        print("⚠️ MARKDOWN DETECTED")
        print_section("Markdown", r.markdown_artifacts)

    # Phrase categories; the two replacement-backed ones also show the
    # suggested substitution from patterns.json.
    # NOTE(review): r.rule_of_three is collected by detect() but never
    # printed here — confirm whether that is intended.
    print_section("SIGNIFICANCE INFLATION", r.significance_inflation)
    print_section("PROMOTIONAL LANGUAGE", r.promotional_language)
    print_section("AI VOCABULARY", r.ai_vocabulary)
    print_section("SUPERFICIAL -ING", r.superficial_analysis)
    print_section("COPULA AVOIDANCE", r.copula_avoidance, PATTERNS["copula_avoidance"])
    print_section("FILLER PHRASES", r.filler_phrases, PATTERNS["filler_replacements"])
    print_section("VAGUE ATTRIBUTIONS", r.vague_attributions)
    print_section("CHALLENGES FORMULA", r.challenges_formula)
    print_section("HEDGING", r.hedging_phrases)
    print_section("NEGATIVE PARALLELISMS", r.negative_parallelisms)
    print_section("NOTABILITY EMPHASIS", r.notability_emphasis)

    # Character-level signals, then the all-clear message.
    if r.curly_quotes:
        print(f"CURLY QUOTES: {r.curly_quotes} (ChatGPT signature)\n")
    if r.em_dashes > 3:
        print(f"EM DASHES: {r.em_dashes} (excessive)\n")
    if r.total_issues == 0:
        print("✓ No AI patterns detected.\n")
|
||||
|
||||
def main():
    """CLI entry point: read text, run detection, and emit either JSON,
    a one-line score summary, or the full report."""
    parser = argparse.ArgumentParser(description="Detect AI patterns in text")
    parser.add_argument("input", nargs="?", help="Input file (or stdin)")
    parser.add_argument("--json", "-j", action="store_true", help="JSON output")
    parser.add_argument("--score-only", "-s", action="store_true", help="Score and probability only")
    args = parser.parse_args()

    # Read from the named file when given, otherwise consume stdin.
    text = Path(args.input).read_text() if args.input else sys.stdin.read()
    result = detect(text)

    if args.json:
        payload = {
            "total_issues": result.total_issues,
            "word_count": result.word_count,
            "ai_probability": result.ai_probability,
            "significance_inflation": result.significance_inflation,
            "promotional_language": result.promotional_language,
            "ai_vocabulary": result.ai_vocabulary,
            "chatbot_artifacts": result.chatbot_artifacts,
            "citation_bugs": result.citation_bugs,
            "filler_phrases": result.filler_phrases,
            "curly_quotes": result.curly_quotes,
            "em_dashes": result.em_dashes,
        }
        print(json.dumps(payload, indent=2))
    elif args.score_only:
        print(f"Issues: {result.total_issues} | Words: {result.word_count} | AI: {result.ai_probability}")
    else:
        print_report(result)
|
||||
|
||||
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
191
scripts/patterns.json
Normal file
191
scripts/patterns.json
Normal file
@@ -0,0 +1,191 @@
|
||||
{
|
||||
"significance_inflation": [
|
||||
"stands as", "serves as", "is a testament", "is a reminder",
|
||||
"vital role", "significant role", "crucial role", "pivotal role",
|
||||
"key role", "pivotal moment", "key moment", "key turning point",
|
||||
"underscores its importance", "highlights its importance",
|
||||
"underscores its significance", "highlights its significance",
|
||||
"reflects broader", "symbolizing its ongoing", "symbolizing its enduring",
|
||||
"symbolizing its lasting", "contributing to the", "setting the stage for",
|
||||
"marking the", "shaping the", "represents a shift", "marks a shift",
|
||||
"evolving landscape", "focal point", "indelible mark", "deeply rooted",
|
||||
"enduring legacy", "rich tapestry", "broader movement"
|
||||
],
|
||||
"notability_emphasis": [
|
||||
"independent coverage", "local media outlets", "regional media outlets",
|
||||
"national media outlets", "music outlets", "business outlets", "tech outlets",
|
||||
"profiled in", "written by a leading expert", "active social media presence",
|
||||
"has been featured in", "has been cited in", "maintains a strong digital presence"
|
||||
],
|
||||
"superficial_analysis": [
|
||||
"highlighting", "underscoring", "emphasizing", "ensuring",
|
||||
"reflecting", "symbolizing", "contributing to", "cultivating",
|
||||
"fostering", "encompassing", "showcasing", "valuable insights",
|
||||
"align with", "aligns with", "resonate with", "resonates with"
|
||||
],
|
||||
"promotional_language": [
|
||||
"boasts a", "boasts an", "vibrant", "rich cultural heritage",
|
||||
"profound", "enhancing its", "exemplifies", "commitment to",
|
||||
"natural beauty", "nestled", "in the heart of", "groundbreaking",
|
||||
"renowned", "breathtaking", "must-visit", "stunning", "bustling",
|
||||
"game-changing", "cutting-edge", "state-of-the-art", "world-class",
|
||||
"best-in-class", "industry-leading", "innovative", "revolutionary"
|
||||
],
|
||||
"vague_attributions": [
|
||||
"industry reports", "observers have cited", "experts argue",
|
||||
"experts believe", "some critics argue", "several sources",
|
||||
"several publications", "according to experts", "widely regarded",
|
||||
"it is widely believed", "many believe", "some would say"
|
||||
],
|
||||
"challenges_formula": [
|
||||
"despite its", "faces several challenges", "despite these challenges",
|
||||
"challenges and legacy", "future outlook", "future prospects",
|
||||
"looking ahead", "moving forward", "going forward"
|
||||
],
|
||||
"ai_vocabulary": [
|
||||
"additionally", "align with", "crucial", "delve", "emphasizing",
|
||||
"enduring", "enhance", "fostering", "garner", "highlight",
|
||||
"interplay", "intricate", "intricacies", "key", "landscape",
|
||||
"pivotal", "showcase", "showcasing", "tapestry", "testament",
|
||||
"underscore", "underscores", "valuable", "vibrant", "nuanced",
|
||||
"multifaceted", "paradigm", "synergy", "realm", "underpins",
|
||||
"unraveling", "unveiling", "leveraging", "furthermore", "moreover",
|
||||
"consequently", "subsequently", "henceforth", "thereby", "wherein",
|
||||
"thereof", "whatsoever", "nevertheless", "notwithstanding"
|
||||
],
|
||||
"copula_avoidance": {
|
||||
"serves as a": "is a",
|
||||
"serves as an": "is an",
|
||||
"serves as the": "is the",
|
||||
"stands as a": "is a",
|
||||
"stands as an": "is an",
|
||||
"stands as the": "is the",
|
||||
"marks a": "is a",
|
||||
"marks an": "is an",
|
||||
"marks the": "is the",
|
||||
"represents a": "is a",
|
||||
"represents an": "is an",
|
||||
"represents the": "is the",
|
||||
"boasts a": "has a",
|
||||
"boasts an": "has an",
|
||||
"boasts the": "has the",
|
||||
"features a": "has a",
|
||||
"features an": "has an",
|
||||
"features the": "has the",
|
||||
"offers a": "has a",
|
||||
"offers an": "has an"
|
||||
},
|
||||
"filler_replacements": {
|
||||
"in order to": "to",
|
||||
"due to the fact that": "because",
|
||||
"at this point in time": "now",
|
||||
"at the present time": "now",
|
||||
"has the ability to": "can",
|
||||
"it is important to note that": "",
|
||||
"it should be noted that": "",
|
||||
"it is worth noting that": "",
|
||||
"it is crucial to note that": "",
|
||||
"it is critical to remember that": "",
|
||||
"it goes without saying that": "",
|
||||
"needless to say": "",
|
||||
"Additionally,": "",
|
||||
"Furthermore,": "",
|
||||
"Moreover,": "",
|
||||
"In conclusion,": "",
|
||||
"To summarize,": "",
|
||||
"In summary,": "",
|
||||
"Overall,": "",
|
||||
"utilize": "use",
|
||||
"utilizes": "uses",
|
||||
"utilizing": "using",
|
||||
"utilization": "use",
|
||||
"leverage": "use",
|
||||
"leverages": "uses",
|
||||
"leveraging": "using",
|
||||
"facilitate": "help",
|
||||
"facilitates": "helps",
|
||||
"facilitating": "helping",
|
||||
"implement": "add",
|
||||
"implements": "adds",
|
||||
"prioritize": "focus on",
|
||||
"prioritizes": "focuses on",
|
||||
"optimize": "improve",
|
||||
"optimizes": "improves",
|
||||
"streamline": "simplify",
|
||||
"streamlines": "simplifies"
|
||||
},
|
||||
"chatbot_artifacts": [
|
||||
"I hope this helps",
|
||||
"Let me know if",
|
||||
"Would you like me to",
|
||||
"Great question",
|
||||
"Excellent question",
|
||||
"You're absolutely right",
|
||||
"That's a great point",
|
||||
"That's an excellent point",
|
||||
"Certainly!",
|
||||
"Of course!",
|
||||
"Absolutely!",
|
||||
"Happy to help",
|
||||
"I'd be happy to",
|
||||
"Feel free to",
|
||||
"Don't hesitate to",
|
||||
"Here is a",
|
||||
"Here's a",
|
||||
"I can help you with",
|
||||
"As an AI",
|
||||
"As a language model",
|
||||
"As an AI language model"
|
||||
],
|
||||
"hedging_phrases": [
|
||||
"it could potentially",
|
||||
"it might possibly",
|
||||
"arguably",
|
||||
"it could be argued that",
|
||||
"some would say",
|
||||
"in some ways",
|
||||
"to some extent",
|
||||
"in certain respects",
|
||||
"may vary",
|
||||
"results may vary"
|
||||
],
|
||||
"negative_parallelisms": [
|
||||
"not only", "but also",
|
||||
"it's not just about", "it's about",
|
||||
"it is not merely", "it is",
|
||||
"not just", "but",
|
||||
"no longer", "instead"
|
||||
],
|
||||
"rule_of_three_patterns": [
|
||||
"innovation, inspiration, and",
|
||||
"engage, educate, and",
|
||||
"plan, execute, and",
|
||||
"design, develop, and",
|
||||
"research, develop, and",
|
||||
"create, collaborate, and",
|
||||
"learn, grow, and"
|
||||
],
|
||||
"markdown_artifacts": [
|
||||
"**", "##", "###", "```",
|
||||
"* **", "- **", "1. **"
|
||||
],
|
||||
"citation_bugs": [
|
||||
"oaicite", "oai_citation", "contentReference",
|
||||
"turn0search", "turn0image", "utm_source=chatgpt",
|
||||
"utm_source=openai", "attached_file", "grok_card"
|
||||
],
|
||||
"knowledge_cutoff": [
|
||||
"as of my last", "as of my knowledge",
|
||||
"up to my last training", "based on available information",
|
||||
"while specific details are limited",
|
||||
"not widely available", "not widely documented",
|
||||
"in the provided sources", "in available sources"
|
||||
],
|
||||
"curly_quotes": {
|
||||
"\u201c": "\"",
|
||||
"\u201d": "\"",
|
||||
"\u2018": "'",
|
||||
"\u2019": "'"
|
||||
},
|
||||
"em_dash_patterns": ["—", " — ", "-- "]
|
||||
}
|
||||
127
scripts/transform.py
Normal file
127
scripts/transform.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Transform AI text to bypass detection."""
|
||||
import argparse, json, re, sys
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).parent
|
||||
PATTERNS = json.loads((SCRIPT_DIR / "patterns.json").read_text())
|
||||
|
||||
def replace_bounded(text: str, old: str, new: str) -> tuple[str, int]:
    """Replace every occurrence of *old* with *new*, case-insensitively.

    Single words are anchored with \\b word boundaries so substrings inside
    longer words are left alone; multi-word phrases and comma-suffixed
    connectors are matched verbatim. Returns the updated text and the
    number of occurrences found.
    """
    if " " in old or old.endswith(","):
        body = re.escape(old)
    else:
        body = r"\b" + re.escape(old) + r"\b"
    pattern = re.compile(body, re.IGNORECASE)
    hits = len(pattern.findall(text))
    if hits:
        text = pattern.sub(new, text)
    return text, hits
|
||||
|
||||
def apply_replacements(text: str, replacements: dict) -> tuple[str, list]:
    """Apply each old→new substitution and log a description per phrase hit."""
    log = []
    for phrase, substitute in replacements.items():
        text, hits = replace_bounded(text, phrase, substitute)
        if not hits:
            continue
        if substitute:
            log.append(f'"{phrase}" → "{substitute}"')
        else:
            log.append(f'"{phrase}" removed')
    return text, log
|
||||
|
||||
def fix_quotes(text: str) -> tuple[str, bool]:
    """Convert curly quotes to straight ASCII; report whether anything changed."""
    fixed = text
    for curly, straight in PATTERNS["curly_quotes"].items():
        fixed = fixed.replace(curly, straight)
    return fixed, fixed != text
|
||||
|
||||
def remove_chatbot_sentences(text: str) -> tuple[str, list]:
    """Delete whole sentences that contain chatbot boilerplate phrases."""
    log = []
    for phrase in PATTERNS["chatbot_artifacts"]:
        # Match the entire surrounding sentence (up to the next terminator
        # or newline), plus any trailing whitespace.
        sentence_re = re.compile(
            r"[^.!?\n]*" + re.escape(phrase) + r"[^.!?\n]*[.!?]?\s*",
            re.IGNORECASE,
        )
        if sentence_re.search(text):
            log.append(f'Removed "{phrase}" sentence')
            text = sentence_re.sub("", text)
    return text, log
|
||||
|
||||
def strip_markdown(text: str) -> tuple[str, list]:
    """Strip Markdown bold markers, ATX headers, and code fences."""
    log = []
    if "**" in text:
        text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # keep the inner text
        log.append("Stripped bold")
    has_header = re.search(r'^#{1,6}\s', text, re.MULTILINE) is not None
    if has_header:
        text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
        log.append("Stripped headers")
    if text.find("```") != -1:
        text = re.sub(r'```\w*\n?', '', text)
        log.append("Stripped code blocks")
    return text, log
|
||||
|
||||
def reduce_em_dashes(text: str) -> tuple[str, int]:
    """Turn em dashes (and spaced double hyphens) into commas.

    Returns the updated text and how many dashes were present beforehand.
    """
    total = text.count("—") + text.count(" -- ")
    text = re.sub(r"\s*—\s*", ", ", text)
    return re.sub(r"\s+--\s+", ", ", text), total
|
||||
|
||||
def remove_citations(text: str) -> tuple[str, list]:
    """Strip ChatGPT citation artifacts (oai markers, turn refs, UTM tags)."""
    log = []
    artifact_patterns = (
        (r'\[oai_citation:\d+[^\]]*\]\([^)]+\)', "oai_citation"),
        (r':contentReference\[oaicite:\d+\]\{[^}]+\}', "contentReference"),
        (r'turn0search\d+', "turn0search"),
        (r'turn0image\d+', "turn0image"),
        (r'\?utm_source=(chatgpt\.com|openai)', "ChatGPT UTM"),
    )
    for raw, label in artifact_patterns:
        compiled = re.compile(raw)
        if compiled.search(text):
            text = compiled.sub('', text)
            log.append(f"Removed {label}")
    return text, log
|
||||
|
||||
def simplify_ing(text: str) -> tuple[str, list]:
    """Cut trailing '-ing' commentary clauses (used in aggressive mode only)."""
    log = []
    gerunds = ("highlighting", "underscoring", "emphasizing", "showcasing", "fostering")
    for gerund in gerunds:
        # The clause runs from an optional preceding comma through the next
        # comma or period.
        clause = re.compile(rf',?\s*{gerund}\s+[^,.]+[,.]', re.IGNORECASE)
        if clause.search(text):
            text = clause.sub('. ', text)
            log.append(f"Simplified {gerund} clause")
    return text, log
|
||||
|
||||
def clean(text: str) -> str:
    """Normalize whitespace and punctuation artifacts left by the substitutions."""
    text = re.sub(r" +", " ", text)          # collapse runs of spaces
    text = re.sub(r"\n{3,}", "\n\n", text)   # cap consecutive blank lines
    text = re.sub(r",\s*,", ",", text)       # doubled commas from removals

    def _capitalize(match):
        # Re-capitalize a sentence start that a removal may have exposed.
        return match.group(1) + match.group(2).upper()

    text = re.sub(r"(^|[.!?]\s+)([a-z])", _capitalize, text)
    return text.strip()
|
||||
|
||||
def transform(text: str, aggressive: bool = False) -> tuple[str, list]:
    """Run the full de-AI pipeline over *text*.

    Steps: citation/markdown/chatbot cleanup, phrase replacements, quote
    normalization, optional aggressive clause removal, em-dash reduction,
    then a final whitespace pass. Returns (cleaned text, change log).
    """
    log = []

    for step in (remove_citations, strip_markdown, remove_chatbot_sentences):
        text, step_changes = step(text)
        log.extend(step_changes)

    for table in ("copula_avoidance", "filler_replacements"):
        text, step_changes = apply_replacements(text, PATTERNS[table])
        log.extend(step_changes)

    text, quotes_changed = fix_quotes(text)
    if quotes_changed:
        log.append("Fixed curly quotes")

    if aggressive:
        text, step_changes = simplify_ing(text)
        log.extend(step_changes)

    text, dash_count = reduce_em_dashes(text)
    if dash_count > 2:
        log.append(f"Replaced {dash_count} em dashes")

    return clean(text), log
|
||||
|
||||
def main():
    """CLI entry point: transform input text and write it to a file or stdout,
    logging the applied changes to stderr unless --quiet is given."""
    parser = argparse.ArgumentParser(description="Transform AI text to human-like")
    parser.add_argument("input", nargs="?", help="Input file (or stdin)")
    parser.add_argument("-o", "--output", help="Output file")
    parser.add_argument("-a", "--aggressive", action="store_true", help="Aggressive mode")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress change log")
    args = parser.parse_args()

    # Read from the named file when given, otherwise consume stdin.
    text = Path(args.input).read_text() if args.input else sys.stdin.read()
    result, changes = transform(text, aggressive=args.aggressive)

    # Change log goes to stderr so stdout stays clean for piping.
    if not args.quiet and changes:
        print(f"CHANGES ({len(changes)}):", file=sys.stderr)
        for entry in changes:
            print(f" • {entry}", file=sys.stderr)

    if args.output:
        Path(args.output).write_text(result)
        if not args.quiet:
            print(f"→ {args.output}", file=sys.stderr)
    else:
        print(result)
|
||||
|
||||
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user