Initial commit with translated description
This commit is contained in:
67
SKILL.md
Normal file
67
SKILL.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
---
|
||||||
|
name: video-subtitles
|
||||||
|
description: "从视频/音频生成SRT字幕,支持翻译。"
|
||||||
|
---
|
||||||
|
|
||||||
|
# Video Subtitles
|
||||||
|
|
||||||
|
Generate movie-style subtitles from video or audio files. Supports transcription, translation, and burning subtitles directly into video.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Hebrew**: ivrit.ai fine-tuned model (best Hebrew transcription)
|
||||||
|
- **English**: OpenAI Whisper large-v3
|
||||||
|
- **Auto-detect**: Automatically detects language and selects best model
|
||||||
|
- **Translation**: Translate Hebrew → English
|
||||||
|
- **Burn-in**: Hardcode subtitles into video (visible everywhere, including WhatsApp)
|
||||||
|
- **Movie-style**: Natural subtitle breaks (42 chars/line, 1-7s duration)
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Plain transcript
|
||||||
|
./scripts/generate_srt.py video.mp4
|
||||||
|
|
||||||
|
# Generate SRT file
|
||||||
|
./scripts/generate_srt.py video.mp4 --srt
|
||||||
|
|
||||||
|
# Burn subtitles into video (always visible)
|
||||||
|
./scripts/generate_srt.py video.mp4 --srt --burn
|
||||||
|
|
||||||
|
# Translate to English + burn in
|
||||||
|
./scripts/generate_srt.py video.mp4 --srt --burn --translate en
|
||||||
|
|
||||||
|
# Force language
|
||||||
|
./scripts/generate_srt.py video.mp4 --lang he # Hebrew
|
||||||
|
./scripts/generate_srt.py video.mp4 --lang en # English
|
||||||
|
```
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
| Flag | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `--srt` | Generate SRT subtitle file |
|
||||||
|
| `--burn` | Burn subtitles into video (hardcoded, always visible) |
|
||||||
|
| `--embed` | Embed soft subtitles (toggle in player) |
|
||||||
|
| `--translate en` | Translate to English |
|
||||||
|
| `--lang he/en` | Force input language |
|
||||||
|
| `-o FILE` | Custom output path |
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
- **Default**: Plain text transcript to stdout
|
||||||
|
- **With `--srt`**: Creates `video.srt` alongside input
|
||||||
|
- **With `--burn`**: Creates `video_subtitled.mp4` with hardcoded subs
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **uv**: Python package manager (auto-installs dependencies)
|
||||||
|
- **ffmpeg-full**: For burning subtitles (`brew install ffmpeg-full`)
|
||||||
|
- **Models**: ~3GB each, auto-downloaded on first use
|
||||||
|
|
||||||
|
## Subtitle Style
|
||||||
|
|
||||||
|
- Font size 12, white text with black outline
|
||||||
|
- Bottom-aligned, movie-style positioning
|
||||||
|
- Max 42 chars/line, 2 lines max
|
||||||
|
- Natural breaks at punctuation and pauses
|
||||||
6
_meta.json
Normal file
6
_meta.json
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"ownerId": "kn75ksgxr9g6p72nqq7e6wm2ex7zpt7w",
|
||||||
|
"slug": "video-subtitles",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"publishedAt": 1769095508613
|
||||||
|
}
|
||||||
309
scripts/generate_srt.py
Normal file
309
scripts/generate_srt.py
Normal file
@@ -0,0 +1,309 @@
|
|||||||
|
#!/usr/bin/env -S uv run --script --quiet
|
||||||
|
# /// script
|
||||||
|
# requires-python = ">=3.10"
|
||||||
|
# dependencies = [
|
||||||
|
# "faster-whisper>=1.0.0",
|
||||||
|
# ]
|
||||||
|
# ///
|
||||||
|
"""
|
||||||
|
Transcription with subtitles. Hebrew uses ivrit.ai, English uses whisper large-v3.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
./transcribe.py video.mp4 # Plain transcript
|
||||||
|
./transcribe.py video.mp4 --srt # Generate SRT file
|
||||||
|
./transcribe.py video.mp4 --srt --embed # Burn subtitles into video
|
||||||
|
./transcribe.py video.mp4 --srt --translate en # Translate to English
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Subtitle:
|
||||||
|
index: int
|
||||||
|
start: float
|
||||||
|
end: float
|
||||||
|
text: str
|
||||||
|
|
||||||
|
def to_srt(self) -> str:
|
||||||
|
return f"{self.index}\n{format_srt_timestamp(self.start)} --> {format_srt_timestamp(self.end)}\n{self.text}\n"
|
||||||
|
|
||||||
|
|
||||||
|
def format_srt_timestamp(seconds: float) -> str:
|
||||||
|
hours = int(seconds // 3600)
|
||||||
|
minutes = int((seconds % 3600) // 60)
|
||||||
|
secs = int(seconds % 60)
|
||||||
|
millis = int((seconds % 1) * 1000)
|
||||||
|
return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_text_naturally(text: str, max_chars: int = 42) -> list[str]:
|
||||||
|
text = text.strip()
|
||||||
|
if len(text) <= max_chars:
|
||||||
|
return [text]
|
||||||
|
|
||||||
|
break_points = ['. ', ', ', '? ', '! ', ': ', '; ', ' - ', ' – ', ' — ']
|
||||||
|
lines = []
|
||||||
|
remaining = text
|
||||||
|
|
||||||
|
while remaining:
|
||||||
|
if len(remaining) <= max_chars:
|
||||||
|
lines.append(remaining)
|
||||||
|
break
|
||||||
|
|
||||||
|
best_break = -1
|
||||||
|
for bp in break_points:
|
||||||
|
idx = remaining[:max_chars].rfind(bp)
|
||||||
|
if idx > best_break:
|
||||||
|
best_break = idx + len(bp) - 1
|
||||||
|
|
||||||
|
if best_break <= 0:
|
||||||
|
best_break = remaining[:max_chars].rfind(' ')
|
||||||
|
if best_break <= 0:
|
||||||
|
best_break = max_chars
|
||||||
|
|
||||||
|
lines.append(remaining[:best_break].strip())
|
||||||
|
remaining = remaining[best_break:].strip()
|
||||||
|
|
||||||
|
if len(lines) > 2:
|
||||||
|
lines = [lines[0], ' '.join(lines[1:])]
|
||||||
|
|
||||||
|
return lines
|
||||||
|
|
||||||
|
|
||||||
|
def merge_into_subtitles(segments: list, min_duration: float = 1.0, max_duration: float = 7.0, max_chars: int = 84) -> list[Subtitle]:
|
||||||
|
if not segments:
|
||||||
|
return []
|
||||||
|
|
||||||
|
subtitles = []
|
||||||
|
current_text = ""
|
||||||
|
current_start = None
|
||||||
|
current_end = None
|
||||||
|
|
||||||
|
for seg in segments:
|
||||||
|
seg_text = seg.text.strip()
|
||||||
|
seg_start = seg.start
|
||||||
|
seg_end = seg.end
|
||||||
|
|
||||||
|
if current_start is None:
|
||||||
|
current_start = seg_start
|
||||||
|
current_end = seg_end
|
||||||
|
current_text = seg_text
|
||||||
|
continue
|
||||||
|
|
||||||
|
potential_text = current_text + " " + seg_text
|
||||||
|
potential_duration = seg_end - current_start
|
||||||
|
gap = seg_start - current_end
|
||||||
|
|
||||||
|
should_merge = (
|
||||||
|
(potential_duration <= max_duration and len(potential_text) <= max_chars and (current_end - current_start) < min_duration) or
|
||||||
|
(gap < 0.3 and potential_duration <= max_duration and len(potential_text) <= max_chars)
|
||||||
|
)
|
||||||
|
|
||||||
|
if should_merge:
|
||||||
|
current_text = potential_text
|
||||||
|
current_end = seg_end
|
||||||
|
else:
|
||||||
|
lines = chunk_text_naturally(current_text)
|
||||||
|
subtitles.append(Subtitle(len(subtitles) + 1, current_start, current_end, '\n'.join(lines)))
|
||||||
|
current_start = seg_start
|
||||||
|
current_end = seg_end
|
||||||
|
current_text = seg_text
|
||||||
|
|
||||||
|
if current_text:
|
||||||
|
lines = chunk_text_naturally(current_text)
|
||||||
|
subtitles.append(Subtitle(len(subtitles) + 1, current_start, current_end, '\n'.join(lines)))
|
||||||
|
|
||||||
|
return subtitles
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe(file_path: str, language: str | None = None, use_turbo: bool = True,
|
||||||
|
generate_srt: bool = False, translate_to: str | None = None):
|
||||||
|
"""Transcribe audio/video file. Auto-detects language, uses ivrit.ai for Hebrew."""
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
# Determine task: transcribe or translate
|
||||||
|
task = "translate" if translate_to == "en" else "transcribe"
|
||||||
|
|
||||||
|
# Select model based on language and task
|
||||||
|
if task == "translate":
|
||||||
|
# Translation requires large-v3 (turbo doesn't translate well)
|
||||||
|
model_name = "large-v3"
|
||||||
|
# Use int8 quantization on CPU to save memory
|
||||||
|
print(f"📦 Loading model: {model_name} (int8 for translation)...", file=sys.stderr)
|
||||||
|
model = WhisperModel(model_name, device="cpu", compute_type="int8")
|
||||||
|
elif language == "he":
|
||||||
|
model_name = "ivrit-ai/whisper-large-v3-turbo-ct2" if use_turbo else "ivrit-ai/whisper-large-v3-ct2"
|
||||||
|
print(f"📦 Loading model: {model_name}...", file=sys.stderr)
|
||||||
|
model = WhisperModel(model_name, device="auto", compute_type="auto")
|
||||||
|
else:
|
||||||
|
model_name = "large-v3-turbo" if use_turbo else "large-v3"
|
||||||
|
print(f"📦 Loading model: {model_name}...", file=sys.stderr)
|
||||||
|
model = WhisperModel(model_name, device="auto", compute_type="auto")
|
||||||
|
|
||||||
|
print(f"🎤 Transcribing: {file_path}...", file=sys.stderr)
|
||||||
|
if task == "translate":
|
||||||
|
print(f"🌐 Translating to English...", file=sys.stderr)
|
||||||
|
|
||||||
|
segments, info = model.transcribe(
|
||||||
|
file_path,
|
||||||
|
language=language,
|
||||||
|
task=task,
|
||||||
|
word_timestamps=generate_srt,
|
||||||
|
vad_filter=generate_srt,
|
||||||
|
)
|
||||||
|
|
||||||
|
detected_lang = info.language
|
||||||
|
print(f"✓ Detected: {detected_lang} (confidence: {info.language_probability:.0%})", file=sys.stderr)
|
||||||
|
|
||||||
|
# If Hebrew detected but we used standard model, re-run with ivrit.ai (unless translating)
|
||||||
|
if detected_lang == "he" and language is None and "ivrit-ai" not in model_name and task != "translate":
|
||||||
|
print("🔄 Hebrew detected, switching to ivrit.ai model...", file=sys.stderr)
|
||||||
|
return transcribe(file_path, language="he", use_turbo=use_turbo,
|
||||||
|
generate_srt=generate_srt, translate_to=translate_to)
|
||||||
|
|
||||||
|
raw_segments = list(segments)
|
||||||
|
|
||||||
|
if generate_srt:
|
||||||
|
subtitles = merge_into_subtitles(raw_segments)
|
||||||
|
print(f"✓ Created {len(subtitles)} subtitles", file=sys.stderr)
|
||||||
|
return '\n'.join(sub.to_srt() for sub in subtitles), subtitles[-1].end if subtitles else 0, detected_lang
|
||||||
|
|
||||||
|
return " ".join(seg.text.strip() for seg in raw_segments), None, detected_lang
|
||||||
|
|
||||||
|
|
||||||
|
def embed_subtitles(video_path: str, srt_content: str, output_path: str, burn: bool = False):
|
||||||
|
"""Embed subtitles into video using ffmpeg."""
|
||||||
|
# Use ffmpeg-full if available (has libass for burn-in)
|
||||||
|
ffmpeg_bin = "/opt/homebrew/opt/ffmpeg-full/bin/ffmpeg"
|
||||||
|
if not os.path.exists(ffmpeg_bin):
|
||||||
|
ffmpeg_bin = "ffmpeg"
|
||||||
|
|
||||||
|
# Write SRT to temp file
|
||||||
|
srt_path = "/tmp/subtitles_temp.srt"
|
||||||
|
with open(srt_path, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(srt_content)
|
||||||
|
|
||||||
|
try:
|
||||||
|
if burn:
|
||||||
|
print(f"🔥 Burning subtitles into video...", file=sys.stderr)
|
||||||
|
# Hard-code (burn) subtitles into video using libass
|
||||||
|
# Style: movie-style - smaller text at bottom with outline
|
||||||
|
escaped_srt = srt_path.replace(":", "\\:")
|
||||||
|
filter_str = f"subtitles={escaped_srt}:force_style='FontSize=12,PrimaryColour=&Hffffff,OutlineColour=&H000000,BorderStyle=1,Outline=1,Shadow=0,MarginV=12,Alignment=2'"
|
||||||
|
|
||||||
|
cmd = [
|
||||||
|
ffmpeg_bin, '-y',
|
||||||
|
'-i', video_path,
|
||||||
|
'-vf', filter_str,
|
||||||
|
'-c:a', 'copy',
|
||||||
|
output_path
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
print(f"🎬 Embedding soft subtitles...", file=sys.stderr)
|
||||||
|
# Soft subtitles (selectable in player)
|
||||||
|
cmd = [
|
||||||
|
ffmpeg_bin, '-y',
|
||||||
|
'-i', video_path,
|
||||||
|
'-i', srt_path,
|
||||||
|
'-c', 'copy',
|
||||||
|
'-c:s', 'mov_text',
|
||||||
|
'-metadata:s:s:0', 'language=heb',
|
||||||
|
output_path
|
||||||
|
]
|
||||||
|
|
||||||
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f"ffmpeg error: {result.stderr}", file=sys.stderr)
|
||||||
|
raise RuntimeError("ffmpeg failed")
|
||||||
|
|
||||||
|
print(f"✓ Video saved: {output_path}", file=sys.stderr)
|
||||||
|
if not burn:
|
||||||
|
print(f" (Soft subs - enable in player with V key)", file=sys.stderr)
|
||||||
|
finally:
|
||||||
|
if os.path.exists(srt_path):
|
||||||
|
os.unlink(srt_path)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Transcribe audio/video (Hebrew via ivrit.ai, English via whisper)",
|
||||||
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
|
epilog="""
|
||||||
|
Examples:
|
||||||
|
%(prog)s video.mp4 # Transcribe (auto-detect language)
|
||||||
|
%(prog)s video.mp4 --lang he # Force Hebrew
|
||||||
|
%(prog)s video.mp4 --srt # Generate SRT subtitles
|
||||||
|
%(prog)s video.mp4 --srt --embed # Burn subtitles into video
|
||||||
|
%(prog)s video.mp4 --srt --translate en # Translate to English subtitles
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
parser.add_argument("file", help="Audio or video file")
|
||||||
|
parser.add_argument("--lang", "-l", choices=["he", "en"],
|
||||||
|
help="Force language (auto-detect if not specified)")
|
||||||
|
parser.add_argument("--turbo", action="store_true", default=True,
|
||||||
|
help="Use turbo model for Hebrew (faster, default)")
|
||||||
|
parser.add_argument("--accurate", action="store_true",
|
||||||
|
help="Use accurate model for Hebrew (slower but better)")
|
||||||
|
parser.add_argument("--timestamps", "-t", action="store_true",
|
||||||
|
help="Include timestamps in plain text output")
|
||||||
|
parser.add_argument("--srt", action="store_true",
|
||||||
|
help="Generate SRT subtitle file")
|
||||||
|
parser.add_argument("--embed", action="store_true",
|
||||||
|
help="Embed soft subtitles into video (toggle in player)")
|
||||||
|
parser.add_argument("--burn", action="store_true",
|
||||||
|
help="Burn subtitles into video (always visible, for WhatsApp)")
|
||||||
|
parser.add_argument("--translate", metavar="LANG", choices=["en"],
|
||||||
|
help="Translate subtitles to language (currently: en)")
|
||||||
|
parser.add_argument("--output", "-o", help="Output file path")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
input_path = Path(args.file)
|
||||||
|
if not input_path.exists():
|
||||||
|
print(f"❌ File not found: {args.file}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if (args.embed or args.burn) and not args.srt:
|
||||||
|
print("❌ --embed/--burn requires --srt", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
use_turbo = not args.accurate
|
||||||
|
result, duration, detected_lang = transcribe(
|
||||||
|
args.file,
|
||||||
|
language=args.lang,
|
||||||
|
use_turbo=use_turbo,
|
||||||
|
generate_srt=args.srt,
|
||||||
|
translate_to=args.translate
|
||||||
|
)
|
||||||
|
|
||||||
|
# Determine output path
|
||||||
|
if args.output:
|
||||||
|
output_path = Path(args.output)
|
||||||
|
elif args.embed or args.burn:
|
||||||
|
output_path = input_path.with_stem(input_path.stem + "_subtitled").with_suffix('.mp4')
|
||||||
|
elif args.srt:
|
||||||
|
output_path = input_path.with_suffix('.srt')
|
||||||
|
else:
|
||||||
|
output_path = None
|
||||||
|
|
||||||
|
# Handle embedding
|
||||||
|
if args.embed or args.burn:
|
||||||
|
embed_subtitles(str(input_path), result, str(output_path), burn=args.burn)
|
||||||
|
elif output_path:
|
||||||
|
output_path.write_text(result, encoding="utf-8")
|
||||||
|
print(f"✓ Saved: {output_path}", file=sys.stderr)
|
||||||
|
if duration:
|
||||||
|
print(f" Duration: {format_srt_timestamp(duration)}", file=sys.stderr)
|
||||||
|
else:
|
||||||
|
print(result)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user