commit 5cbdcdeb0175211a9dd00ca488063c1e9fe7806b Author: zlei9 Date: Sun Mar 29 08:22:58 2026 +0800 Initial commit with translated description diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..a944f61 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,49 @@ +--- +name: local-whisper +description: "使用OpenAI Whisper进行本地语音转文字。模型下载后可完全离线运行。具备多种模型尺寸的高质量转录。" +metadata: {"clawdbot":{"emoji":"🎙️","requires":{"bins":["ffmpeg"]}}} +--- + +# Local Whisper STT + +Local speech-to-text using OpenAI's Whisper. **Fully offline** after initial model download. + +## Usage + +```bash +# Basic +~/.clawdbot/skills/local-whisper/scripts/local-whisper audio.wav + +# Better model +~/.clawdbot/skills/local-whisper/scripts/local-whisper audio.wav --model turbo + +# With timestamps +~/.clawdbot/skills/local-whisper/scripts/local-whisper audio.wav --timestamps --json +``` + +## Models + +| Model | Size | Notes | +|-------|------|-------| +| `tiny` | 39M | Fastest | +| `base` | 74M | **Default** | +| `small` | 244M | Good balance | +| `turbo` | 809M | Best speed/quality | +| `large-v3` | 1.5GB | Maximum accuracy | + +## Options + +- `--model/-m` — Model size (default: base) +- `--language/-l` — Language code (auto-detect if omitted) +- `--timestamps/-t` — Include word timestamps +- `--json/-j` — JSON output +- `--quiet/-q` — Suppress progress + +## Setup + +Uses uv-managed venv at `.venv/`. 
#!/usr/bin/env python3
"""Offline speech-to-text CLI built on OpenAI Whisper (local after the model is downloaded)."""

import json
import sys
import warnings

import click

# Whisper emits noisy FP16/CPU warnings on load; keep CLI output clean.
warnings.filterwarnings("ignore")

# Model identifiers accepted by whisper.load_model().
MODELS = ["tiny", "tiny.en", "base", "base.en", "small", "small.en",
          "medium", "medium.en", "large-v3", "turbo"]


@click.command()
@click.argument("audio_file", type=click.Path(exists=True))
@click.option("-m", "--model", default="base", type=click.Choice(MODELS), help="Whisper model size")
@click.option("-l", "--language", default=None, help="Language code (auto-detect if omitted)")
@click.option("-t", "--timestamps", is_flag=True, help="Include word-level timestamps")
@click.option("-j", "--json", "as_json", is_flag=True, help="Output as JSON")
@click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
def main(audio_file, model, language, timestamps, as_json, quiet):
    """Transcribe audio using OpenAI Whisper (local)."""
    # Import lazily so a clear error (not a traceback) greets a missing install.
    try:
        import whisper
    except ImportError:
        click.echo("Error: openai-whisper not installed", err=True)
        sys.exit(1)

    def progress(message):
        # Progress goes to stderr so stdout stays machine-readable; muted by --quiet.
        if not quiet:
            click.echo(message, err=True)

    progress(f"Loading model: {model}...")
    try:
        whisper_model = whisper.load_model(model)
    except Exception as e:
        click.echo(f"Error loading model: {e}", err=True)
        sys.exit(1)

    progress(f"Transcribing: {audio_file}...")
    try:
        result = whisper_model.transcribe(audio_file, language=language,
                                          word_timestamps=timestamps, verbose=False)
    except Exception as e:
        click.echo(f"Error transcribing: {e}", err=True)
        sys.exit(1)

    text = result["text"].strip()
    have_segments = timestamps and "segments" in result

    if as_json:
        # Structured output: transcript plus detected language, and (when
        # requested) per-segment timing with optional word-level detail.
        payload = {"text": text, "language": result.get("language", "unknown")}
        if have_segments:
            entries = []
            for seg in result["segments"]:
                entry = {"start": seg["start"], "end": seg["end"], "text": seg["text"]}
                if "words" in seg:
                    entry["words"] = seg["words"]
                entries.append(entry)
            payload["segments"] = entries
        click.echo(json.dumps(payload, indent=2, ensure_ascii=False))
        return

    # Plain mode: transcript on stdout, segment timings (if any) on stderr.
    click.echo(text)
    if have_segments:
        click.echo("\n--- Segments ---", err=True)
        for seg in result["segments"]:
            click.echo(f" [{seg['start']:.2f}s - {seg['end']:.2f}s]: {seg['text']}", err=True)


if __name__ == "__main__":
    main()