Files
steipete_video-transcript-d…/scripts/vtd.js

436 lines
13 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
import { spawn } from "node:child_process";
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { YoutubeTranscript } from "youtube-transcript-plus";
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
function die(message, code = 1) {
process.stderr.write(String(message).trimEnd() + "\n");
process.exit(code);
}
function parseArgs(argv) {
// Tiny no-deps parser.
// - `--flag` => boolean
// - `--key value`
// - `--` => forward remaining args to yt-dlp
const positional = [];
const opts = {};
let i = 0;
while (i < argv.length) {
const a = argv[i];
if (a === "--") {
opts.extra = argv.slice(i + 1);
break;
}
if (!a.startsWith("--")) {
positional.push(a);
i += 1;
continue;
}
const key = a.slice(2);
const next = argv[i + 1];
const isValue = next !== undefined && !next.startsWith("--");
if (!isValue) {
opts[key] = true;
i += 1;
continue;
}
if (opts[key] === undefined) opts[key] = next;
else if (Array.isArray(opts[key])) opts[key].push(next);
else opts[key] = [opts[key], next];
i += 2;
}
return { positional, opts };
}
function toArray(v) {
if (v === undefined) return [];
if (Array.isArray(v)) return v;
return [v];
}
function which(cmd) {
// Avoid shelling out to `which`; keep it portable + fast.
const envPath = process.env.PATH || "";
const parts = envPath.split(path.delimiter);
for (const p of parts) {
const full = path.join(p, cmd);
if (fs.existsSync(full)) return full;
}
return null;
}
function resolveBin(name, fallback) {
return which(name) || (fallback && fs.existsSync(fallback) ? fallback : null);
}
function run(cmd, args, { cwd } = {}) {
return new Promise((resolve) => {
// Capture stdout + stderr to keep yt-dlps error context intact.
const child = spawn(cmd, args, { cwd, stdio: ["ignore", "pipe", "pipe"] });
let out = "";
child.stdout.on("data", (d) => (out += d.toString()));
child.stderr.on("data", (d) => (out += d.toString()));
child.on("close", (code) => resolve({ code, out }));
});
}
function isYouTubeUrl(url) {
return /(^https?:\/\/)?(www\.)?(youtube\.com|youtu\.be)\//i.test(url);
}
function extractYouTubeId(input) {
if (!input) return null;
const raw = String(input).trim();
if (/^[a-zA-Z0-9_-]{11}$/.test(raw)) return raw;
const m = raw.match(/(?:v=|youtu\.be\/)([a-zA-Z0-9_-]{11})/);
return m ? m[1] : null;
}
function decodeHtmlEntities(input) {
if (!input) return input;
// Some transcripts come back double-encoded (e.g. "&amp;#39;").
// Decode up to 2 passes; stop once stable.
let text = input;
for (let i = 0; i < 2; i++) {
const decoded = text
.replace(/&amp;/g, "&")
.replace(/&lt;/g, "<")
.replace(/&gt;/g, ">")
.replace(/&quot;/g, '"')
.replace(/&apos;/g, "'")
.replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(Number(dec)))
.replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)));
if (decoded === text) break;
text = decoded;
}
return text;
}
function formatTimestamp(seconds) {
const s = Math.max(0, Math.floor(seconds));
const h = Math.floor(s / 3600);
const m = Math.floor((s % 3600) / 60);
const sec = Math.floor(s % 60);
if (h > 0) return `${h}:${String(m).padStart(2, "0")}:${String(sec).padStart(2, "0")}`;
return `${m}:${String(sec).padStart(2, "0")}`;
}
function cleanSegments(segments, { keepBrackets } = {}) {
const cleaned = [];
let prev = "";
for (const seg of segments) {
const s = String(seg || "")
.replace(/\s+/g, " ")
.trim();
if (!s) continue;
// Subtitles often contain HTML-ish tags; strip them.
const withoutTags = s.replace(/<[^>]+>/g, "").trim();
const withoutBrackets = keepBrackets ? withoutTags : withoutTags.replace(/\[[^\]]*\]/g, "").trim();
const withoutCurlies = withoutBrackets.replace(/\{[^}]+\}/g, "").replace(/♪/g, "").trim();
const t = withoutCurlies.replace(/\s+/g, " ").trim();
if (!t) continue;
if (t === prev) continue;
// Dedup heuristic: captions often repeat previous line with a longer suffix.
if (prev && t.startsWith(prev)) {
const newPart = t.slice(prev.length).trim();
if (newPart) cleaned.push(newPart);
} else if (prev && t.includes(prev)) {
// Another common pattern: current line contains previous line in the middle.
const idx = t.indexOf(prev);
const newPart = (t.slice(0, idx) + t.slice(idx + prev.length)).trim();
if (newPart) cleaned.push(newPart);
} else {
cleaned.push(t);
}
prev = t;
}
return cleaned;
}
function toParagraph(segments, { keepBrackets } = {}) {
const cleaned = cleanSegments(segments, { keepBrackets });
return cleaned.join(" ").replace(/\s+/g, " ").trim();
}
function parseSrt(text) {
const lines = String(text).split(/\r?\n/);
const segments = [];
for (const line of lines) {
const l = line.trim();
if (!l) continue;
if (/^\d+$/.test(l)) continue;
if (l.includes("-->")) continue;
segments.push(l);
}
return segments;
}
function parseVtt(text) {
const lines = String(text).split(/\r?\n/);
const segments = [];
for (const line of lines) {
const l = line.trim();
if (!l) continue;
if (l === "WEBVTT") continue;
if (l.startsWith("Kind:") || l.startsWith("Language:")) continue;
if (l.includes("-->")) continue;
// cue settings like "align:start position:0%"
if (/^(align|position|size|line):/i.test(l)) continue;
// Remove inline timestamps like "<00:00:00.000>" (common in YouTube VTT).
const cleaned = l.replace(/<\d{2}:\d{2}:\d{2}\.\d{3}>/g, "").trim();
if (cleaned) segments.push(cleaned);
}
return segments;
}
async function ytDlpSubtitlesToTemp({ url, lang, ytdlpPath, extra }) {
const ytdlp = ytdlpPath || resolveBin("yt-dlp", "/opt/homebrew/bin/yt-dlp");
if (!ytdlp) die("missing yt-dlp; install `yt-dlp` and ensure it is on PATH");
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "vtd-subs-"));
const outTemplate = path.join(tmpDir, "%(id)s.%(ext)s");
const args = [];
args.push(
"--write-sub",
"--write-auto-sub",
"--skip-download",
"--sub-lang",
lang,
"-o",
outTemplate,
);
if (extra?.length) args.push(...extra);
args.push(url);
const r = await run(ytdlp, args);
if (r.code !== 0) {
fs.rmSync(tmpDir, { recursive: true, force: true });
die(r.out.trim() || "yt-dlp subtitle download failed");
}
const files = fs
.readdirSync(tmpDir)
.map((f) => path.join(tmpDir, f))
.filter((f) => /\.(vtt|srt|ass|ttml)$/i.test(f))
.sort((a, b) => fs.statSync(b).mtimeMs - fs.statSync(a).mtimeMs);
if (files.length === 0) {
fs.rmSync(tmpDir, { recursive: true, force: true });
die(`no subtitles found (lang=${lang})`);
}
return { tmpDir, subtitlePath: files[0] };
}
async function cmdTranscript({ url, lang, timestamps, keepBrackets, extra }) {
if (!url) die("missing --url");
if (isYouTubeUrl(url)) {
const id = extractYouTubeId(url);
if (id) {
try {
// Preferred path: direct transcript fetch (no yt-dlp / no files).
const transcript = await YoutubeTranscript.fetchTranscript(id);
if (timestamps) {
for (const entry of transcript) {
const ts = formatTimestamp(entry.offset / 1000);
process.stdout.write(`[${ts}] ${decodeHtmlEntities(entry.text).replace(/\s+/g, " ").trim()}\n`);
}
return;
}
const paragraph = toParagraph(transcript.map((e) => decodeHtmlEntities(e.text)), { keepBrackets });
if (!paragraph) die("empty transcript");
process.stdout.write(paragraph + "\n");
return;
} catch {
// Fallback below: use yt-dlp subtitles when direct transcript fails.
}
}
}
const { tmpDir, subtitlePath } = await ytDlpSubtitlesToTemp({
url,
lang,
extra,
});
try {
const raw = fs.readFileSync(subtitlePath, "utf8");
const segments = subtitlePath.endsWith(".srt") ? parseSrt(raw) : parseVtt(raw);
if (timestamps) {
// Subtitle timestamps are inconsistent across sites; keep output stable here.
const paragraph = toParagraph(segments, { keepBrackets });
process.stdout.write(paragraph + "\n");
return;
}
const paragraph = toParagraph(segments, { keepBrackets });
if (!paragraph) die("empty transcript from subtitles");
process.stdout.write(paragraph + "\n");
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
}
async function cmdSubs({ url, lang, outputDir, extra }) {
if (!url) die("missing --url");
const { tmpDir, subtitlePath } = await ytDlpSubtitlesToTemp({
url,
lang,
extra,
});
try {
const out = path.resolve(outputDir);
fs.mkdirSync(out, { recursive: true });
const dest = path.join(out, path.basename(subtitlePath));
fs.copyFileSync(subtitlePath, dest);
process.stdout.write(dest + "\n");
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
}
async function cmdDownload({ url, outputDir, extra }) {
if (!url) die("missing --url");
const ytdlp = resolveBin("yt-dlp", "/opt/homebrew/bin/yt-dlp");
if (!ytdlp) die("missing yt-dlp; install `yt-dlp` and ensure it is on PATH");
const out = path.resolve(outputDir);
fs.mkdirSync(out, { recursive: true });
const args = [];
// `--print after_move:filepath` gives the final path after merges/remux.
args.push("-P", out, "-o", "%(title).200B (%(id)s).%(ext)s", "-S", "res,ext:mp4:m4a,tbr", "--print", "after_move:filepath");
if (extra?.length) args.push(...extra);
args.push(url);
const r = await run(ytdlp, args);
if (r.code !== 0) die(r.out.trim() || "yt-dlp download failed");
const lines = r.out.split("\n").map((l) => l.trim());
const filePath = lines.find((l) => l.startsWith("/") && fs.existsSync(l));
if (!filePath) die(r.out.trim() || "could not determine downloaded file path");
process.stdout.write(path.resolve(filePath) + "\n");
}
async function cmdAudio({ url, outputDir, extra }) {
if (!url) die("missing --url");
const ytdlp = resolveBin("yt-dlp", "/opt/homebrew/bin/yt-dlp");
if (!ytdlp) die("missing yt-dlp; install `yt-dlp` and ensure it is on PATH");
const ffmpeg = resolveBin("ffmpeg", "/opt/homebrew/bin/ffmpeg");
if (!ffmpeg) die("missing ffmpeg; install `ffmpeg` (needed for audio extraction)");
const out = path.resolve(outputDir);
fs.mkdirSync(out, { recursive: true });
const args = [];
args.push(
"--ffmpeg-location",
ffmpeg,
"-P",
out,
"-o",
"%(title).200B (%(id)s).%(ext)s",
"-x",
"--audio-format",
"mp3",
"--print",
"after_move:filepath",
);
if (extra?.length) args.push(...extra);
args.push(url);
const r = await run(ytdlp, args);
if (r.code !== 0) die(r.out.trim() || "yt-dlp audio failed");
const lines = r.out.split("\n").map((l) => l.trim());
const filePath = lines.find((l) => l.startsWith("/") && fs.existsSync(l));
if (!filePath) die(r.out.trim() || "could not determine downloaded file path");
process.stdout.write(path.resolve(filePath) + "\n");
}
async function cmdFormats({ url, extra }) {
if (!url) die("missing --url");
const ytdlp = resolveBin("yt-dlp", "/opt/homebrew/bin/yt-dlp");
if (!ytdlp) die("missing yt-dlp; install `yt-dlp` and ensure it is on PATH");
// Print raw yt-dlp format table; user picks `--format <id>` for downloads.
const args = ["-F"];
if (extra?.length) args.push(...extra);
args.push(url);
const r = await run(ytdlp, args);
if (r.code !== 0) die(r.out.trim() || "yt-dlp formats failed");
process.stdout.write(r.out);
}
function usage() {
const rel = path.relative(process.cwd(), path.join(__dirname, "vtd.js"));
return [
"usage:",
` ${rel} transcript --url 'https://…' [--lang en] [--timestamps] [--keep-brackets] [-- <yt-dlp extra…>]`,
` ${rel} download --url 'https://…' [--output-dir ~/Downloads] [-- <yt-dlp extra…>]`,
` ${rel} audio --url 'https://…' [--output-dir ~/Downloads] [-- <yt-dlp extra…>]`,
` ${rel} subs --url 'https://…' [--output-dir ~/Downloads] [--lang en] [-- <yt-dlp extra…>]`,
` ${rel} formats --url 'https://…' [-- <yt-dlp extra…>]`,
].join("\n");
}
async function main() {
const { positional, opts } = parseArgs(process.argv.slice(2));
const cmd = positional[0];
if (!cmd || cmd === "help" || cmd === "-h" || cmd === "--help") {
process.stdout.write(usage() + "\n");
return;
}
const url = opts.url;
const lang = opts.lang || "en";
const outputDir = opts["output-dir"] || path.join(os.homedir(), "Downloads");
const timestamps = Boolean(opts.timestamps);
const keepBrackets = Boolean(opts["keep-brackets"]);
const extra = opts.extra || [];
if (cmd === "transcript") {
await cmdTranscript({ url, lang, timestamps, keepBrackets, extra });
return;
}
if (cmd === "download") {
await cmdDownload({ url, outputDir, extra });
return;
}
if (cmd === "audio") {
await cmdAudio({ url, outputDir, extra });
return;
}
if (cmd === "subs") {
await cmdSubs({ url, lang, outputDir, extra });
return;
}
if (cmd === "formats") {
await cmdFormats({ url, extra });
return;
}
die(`unknown command: ${cmd}\n\n${usage()}`);
}
main().catch((e) => die(e?.stack || e?.message || String(e)));