|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| import { spawn, execSync } from 'child_process';
|
| import fs from 'fs';
|
| import path from 'path';
|
| import https from 'https';
|
| import http from 'http';
|
| import os from 'os';
|
| import type { ToolRegistry, ToolParams, ToolResult, ProgressEmitter } from '../toolRegistry';
|
|
|
// Directory that downloaded subtitle files and generated `<videoId>-transcript.txt`
// files are written to: `output`, two directory levels above this module's directory.
| const OUTPUT_DIR = path.join(__dirname, '..', '..', 'output');
|
|
|
/**
 * Registers the `transcript_tool` tool on the given registry.
 *
 * When executed, the tool extracts the video ID from the supplied YouTube URL
 * and tries three strategies in order, returning the first non-empty transcript:
 *   1. yt-dlp (if an executable or Python module can be located),
 *   2. the YouTube Innertube `get_transcript` endpoint,
 *   3. scraping caption tracks from the watch page.
 *
 * @param registry Registry the tool definition is added to.
 */
export function register(registry: ToolRegistry): void {
  registry.register({
    name: 'transcript_tool',
    description: 'Extract transcript/subtitles from a YouTube video URL.',
    syntax: 'use <transcript_tool> <youtube-url>',
    pattern: /use\s+<transcript_tool>\s+(?<url>https?:\/\/(?:www\.)?(?:youtube\.com\/watch\?v=|youtu\.be\/)[\w-]+[^\s]*)/i,
    mock: false,

    /**
     * Runs the extraction.
     *
     * @param params Tool parameters; the URL is read from `params.url`, falling
     *   back to the first entry of `params.captures`.
     * @param emitProgress Callback that receives human-readable status lines.
     * @returns The result of the first strategy that produced a non-empty transcript.
     * @throws Error when no URL is given, the URL contains no video ID, or every
     *   strategy fails or comes back empty.
     */
    async execute(params: ToolParams, emitProgress: ProgressEmitter): Promise<ToolResult> {
      const url = params.url || (params.captures && params.captures[0]);
      if (!url) throw new Error('No URL provided.');

      emitProgress('Extracting video ID...');
      const videoId = extractVideoId(url as string);
      if (!videoId) throw new Error('Invalid YouTube URL.');

      // A recursive mkdir is a no-op when the directory already exists, so no
      // separate existence check is needed.
      fs.mkdirSync(OUTPUT_DIR, { recursive: true });

      emitProgress(`Video ID: ${videoId}`);

      // True only for a non-blank string transcript.
      const hasTranscript = (result: ToolResult): boolean => {
        const transcript: unknown = result.transcript;
        return typeof transcript === 'string' && transcript.trim().length > 0;
      };
      // Rejections are not guaranteed to be Error instances.
      const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

      // Strategy 1: yt-dlp.
      const ytdlpPath = await findYtdlp(emitProgress);
      if (ytdlpPath) {
        emitProgress(`yt-dlp found at: ${ytdlpPath}`);
        try {
          const result = await fetchWithYtdlp(ytdlpPath, url as string, videoId, emitProgress);
          if (hasTranscript(result)) return result;
          emitProgress('yt-dlp returned empty subtitles. Trying fallback...');
        } catch (err: unknown) {
          emitProgress(`yt-dlp failed: ${describe(err)}. Trying fallback...`);
        }
      } else {
        emitProgress('yt-dlp not found. Using YouTube API fallback...');
      }

      // Strategy 2: Innertube API.
      try {
        const result = await fetchWithInnertube(videoId, emitProgress);
        if (hasTranscript(result)) return result;
        emitProgress('Innertube returned empty. Trying page scrape...');
      } catch (err: unknown) {
        emitProgress(`Innertube failed: ${describe(err)}. Trying page scrape...`);
      }

      // Strategy 3: scrape the watch page.
      try {
        const result = await fetchFromPage(videoId, emitProgress);
        if (hasTranscript(result)) return result;
      } catch (err: unknown) {
        emitProgress(`Page scrape failed: ${describe(err)}`);
      }

      throw new Error('Could not extract transcript. The video may not have captions, or YouTube blocked the request. Install yt-dlp for best results: pip install yt-dlp');
    },
  });
}
|
|
|
/**
 * Extracts the 11-character video ID from a YouTube URL.
 *
 * Recognised shapes: `...?v=<id>` / `...&v=<id>`, `youtu.be/<id>`, and
 * `/embed/<id>`, `/shorts/<id>`, `/live/<id>` paths.
 *
 * The ID must be exactly 11 characters of `[A-Za-z0-9_-]`: a longer run of ID
 * characters is rejected rather than silently truncated to its first 11.
 *
 * @param url URL (or any string) to search.
 * @returns The video ID, or `null` when none of the patterns match.
 */
function extractVideoId(url: string): string | null {
  const patterns = [
    /[?&]v=([\w-]{11})(?![\w-])/,
    /youtu\.be\/([\w-]{11})(?![\w-])/,
    /\/(?:embed|shorts|live)\/([\w-]{11})(?![\w-])/,
  ];
  for (const pattern of patterns) {
    const id = pattern.exec(url)?.[1];
    if (id) return id;
  }
  return null;
}
|
|
|
|
|
/**
 * Lists the entries of `dir` whose names start with `prefix`, sorted so that
 * higher embedded numbers come first (e.g. `Python313` before `Python39`).
 * Returns an empty list when the directory is missing or unreadable.
 */
function listEntriesWithPrefix(dir: string, prefix: string): string[] {
  try {
    return fs
      .readdirSync(dir)
      .filter((name) => name.startsWith(prefix))
      .sort((a, b) => b.localeCompare(a, undefined, { numeric: true }));
  } catch {
    return [];
  }
}

/**
 * Runs a `pip show` command and derives the `Scripts` directories that may
 * hold the package's launcher from the reported `Location:` (site-packages).
 * Returns an empty list when the command fails or prints no location.
 */
function pipScriptsDirs(command: string): string[] {
  try {
    const output = execSync(command, { encoding: 'utf-8', timeout: 5000 });
    const sitePackages = /^Location:\s*(.+)$/im.exec(output)?.[1]?.trim();
    if (!sitePackages) return [];
    const parent = path.dirname(sitePackages);
    return [
      // Per-user install: <userbase>\PythonXY\site-packages -> <userbase>\PythonXY\Scripts
      path.join(parent, 'Scripts'),
      // Regular install: <prefix>\Lib\site-packages -> <prefix>\Scripts
      path.join(path.dirname(parent), 'Scripts'),
    ];
  } catch {
    // Best effort: pip may be absent or may not know the package.
    return [];
  }
}

/**
 * Locates a runnable yt-dlp.
 *
 * Search order:
 *   1. `yt-dlp` on PATH;
 *   2. on Windows: `Scripts` directories reported by `python -m pip show` /
 *      `pip show`, then `Python*\Scripts` under the per-user install roots,
 *      then Microsoft Store Python package caches;
 *      elsewhere: `~/.local/bin`, `/usr/local/bin`, `/usr/bin`;
 *   3. yt-dlp as a Python module (`<launcher> -m yt_dlp`).
 *
 * Every candidate is confirmed by running it with `--version`.
 *
 * @param emitProgress Callback that receives status lines.
 * @returns A shell command string that starts yt-dlp (file paths are wrapped
 *   in double quotes), or `null` when nothing usable was found.
 */
async function findYtdlp(emitProgress: ProgressEmitter): Promise<string | null> {
  const isWin = process.platform === 'win32';
  const exe = isWin ? 'yt-dlp.exe' : 'yt-dlp';

  if (await checkCommand('yt-dlp')) return 'yt-dlp';

  emitProgress('yt-dlp not on PATH. Searching Python Scripts directories...');

  const home = os.homedir();
  const candidateDirs: string[] = [];

  if (isWin) {
    // Directories derived from pip are the most specific hints, so they go first.
    candidateDirs.push(
      ...pipScriptsDirs('python -m pip show yt-dlp 2>nul'),
      ...pipScriptsDirs('pip show yt-dlp 2>nul'),
    );

    // Per-user Python installs; whichever Python* versions exist are enumerated.
    const pythonRoots = [
      path.join(home, 'AppData', 'Local', 'Programs', 'Python'),
      path.join(home, 'AppData', 'Roaming', 'Python'),
    ];
    for (const root of pythonRoots) {
      for (const versionDir of listEntriesWithPrefix(root, 'Python')) {
        candidateDirs.push(path.join(root, versionDir, 'Scripts'));
      }
    }

    // Microsoft Store Python keeps user packages under the package's LocalCache.
    const packagesDir = path.join(home, 'AppData', 'Local', 'Packages');
    for (const pkg of listEntriesWithPrefix(packagesDir, 'PythonSoftwareFoundation.Python')) {
      const localCache = path.join(packagesDir, pkg, 'LocalCache', 'local-packages');
      for (const versionDir of listEntriesWithPrefix(localCache, 'Python')) {
        candidateDirs.push(path.join(localCache, versionDir, 'Scripts'));
      }
    }
  } else {
    candidateDirs.push(path.join(home, '.local', 'bin'), '/usr/local/bin', '/usr/bin');
  }

  // The Set drops duplicate directories while keeping first-seen order.
  for (const dir of new Set(candidateDirs)) {
    const fullPath = path.join(dir, exe);
    if (!fs.existsSync(fullPath)) continue;
    emitProgress(`Found yt-dlp at: ${fullPath}`);
    // Quoted because the result is later run through a shell and may contain spaces.
    const quoted = `"${fullPath}"`;
    if (await checkCommand(quoted)) return quoted;
  }

  // Last resort: yt-dlp installed as a module without a launcher script.
  const launchers = isWin ? ['python', 'py'] : ['python', 'python3'];
  for (const launcher of launchers) {
    const moduleCmd = `${launcher} -m yt_dlp`;
    if (await checkCommand(moduleCmd)) {
      emitProgress('Found yt-dlp as Python module.');
      return moduleCmd;
    }
  }

  return null;
}
|
|
|
/**
 * Checks whether a shell command is runnable by executing `<cmd> --version`.
 *
 * The command line is passed to the shell as one string (no args array
 * alongside `shell: true`), and the child's output is discarded so unread
 * pipes can never stall it.
 *
 * @param cmd Shell command, already quoted where necessary.
 * @returns Resolves `true` when the process exits with code 0; `false` on a
 *   non-zero exit, a spawn error, or when it has not finished within 5 seconds
 *   (in which case the process is killed). Never rejects.
 */
function checkCommand(cmd: string): Promise<boolean> {
  return new Promise((resolve) => {
    const proc = spawn(`${cmd} --version`, { shell: true, stdio: 'ignore' });
    let settled = false;

    const timer = setTimeout(() => {
      settle(false);
      try {
        proc.kill();
      } catch {
        // The process has already gone away; nothing left to clean up.
      }
    }, 5000);

    // Resolves at most once, whichever of close / error / timeout comes first.
    const settle = (ok: boolean): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(ok);
    };

    proc.on('close', (code) => settle(code === 0));
    proc.on('error', () => settle(false));
  });
}
|
|
|
|
|
/**
 * Downloads English subtitles for a video with yt-dlp and converts them to a
 * plain-text transcript saved as `<videoId>-transcript.txt` in `OUTPUT_DIR`.
 *
 * Security: the command runs through a shell, so the caller-supplied `url` is
 * deliberately NOT interpolated into it. The command uses a canonical watch
 * URL rebuilt from `videoId`, which is validated to contain only ID characters.
 *
 * @param ytdlpCmd Shell command that starts yt-dlp (as returned by `findYtdlp`).
 * @param url Original URL given by the caller; kept for interface
 *   compatibility, not passed to the shell.
 * @param videoId 11-character YouTube video ID.
 * @param emitProgress Callback that receives yt-dlp output and status lines.
 * @returns Resolves with the transcript, its download URL and file name.
 * @throws Rejects when the ID is invalid, yt-dlp exits non-zero, no subtitle
 *   file is produced, or reading/writing files fails.
 */
function fetchWithYtdlp(ytdlpCmd: string, url: string, videoId: string, emitProgress: ProgressEmitter): Promise<ToolResult> {
  return new Promise((resolve, reject) => {
    if (!/^[\w-]{11}$/.test(videoId)) {
      reject(new Error('Invalid YouTube video ID.'));
      return;
    }

    const watchUrl = `https://www.youtube.com/watch?v=${videoId}`;
    const outTemplate = path.join(OUTPUT_DIR, videoId);
    const cmdLine = `${ytdlpCmd} --write-auto-sub --write-sub --sub-lang en,en-US,en-GB --skip-download --sub-format vtt/srt/best -o "${outTemplate}" "${watchUrl}"`;

    const child = spawn(cmdLine, { shell: true });
    let stderr = '';

    child.stdout?.on('data', (chunk: Buffer) => {
      const line = chunk.toString().trim();
      if (line) emitProgress(line);
    });
    child.stderr?.on('data', (chunk: Buffer) => { stderr += chunk.toString(); });

    child.on('close', (code) => {
      if (code !== 0) {
        reject(new Error(`yt-dlp exited ${code}: ${stderr}`));
        return;
      }

      // The fs calls below are synchronous and may throw; inside an event
      // handler that would be an uncaught exception, so convert it to a rejection.
      try {
        const subtitleFiles = fs
          .readdirSync(OUTPUT_DIR)
          .filter((f) => f.startsWith(videoId) && (f.endsWith('.vtt') || f.endsWith('.srt')))
          .sort();
        // Prefer the plain `en` track; otherwise take the first in sorted order
        // so the choice does not depend on directory listing order.
        const subtitleFile = subtitleFiles.find((f) => f.startsWith(`${videoId}.en.`)) ?? subtitleFiles[0];
        if (!subtitleFile) {
          reject(new Error('No subtitle file generated.'));
          return;
        }

        const subContent = fs.readFileSync(path.join(OUTPUT_DIR, subtitleFile), 'utf-8');
        const text = subtitleFile.endsWith('.srt') ? parseSrt(subContent) : parseVtt(subContent);
        const fname = `${videoId}-transcript.txt`;
        fs.writeFileSync(path.join(OUTPUT_DIR, fname), text, 'utf-8');

        emitProgress(`Transcript saved: ${fname} (${text.length} chars)`);
        resolve({ transcript: text, downloadUrl: `/api/download/${fname}`, filename: fname, method: 'yt-dlp' });
      } catch (err: unknown) {
        reject(err instanceof Error ? err : new Error(String(err)));
      }
    });

    child.on('error', reject);
  });
}
|
|
|
|
|
/**
 * Requests a transcript from the YouTube Innertube `get_transcript` endpoint
 * and saves it as `<videoId>-transcript.txt` in `OUTPUT_DIR`.
 *
 * Cue texts are collected from
 * `actions[].updateEngagementPanelAction.content.transcriptRenderer.body
 * .transcriptBodyRenderer.cueGroups[].transcriptCueGroupRenderer.cues[]
 * .transcriptCueRenderer.cue.simpleText`.
 *
 * @param videoId YouTube video ID sent in the request body.
 * @param emitProgress Callback that receives status lines.
 * @returns The transcript (one cue per line), its download URL and file name.
 * @throws Error when the response yields no cue text; request and file-write
 *   errors propagate.
 */
async function fetchWithInnertube(videoId: string, emitProgress: ProgressEmitter): Promise<ToolResult> {
  emitProgress('Fetching via YouTube Innertube API...');

  const clientContext = {
    clientName: 'WEB',
    clientVersion: '2.20240101.00.00',
    hl: 'en',
    gl: 'US',
  };
  const requestBody = JSON.stringify({ context: { client: clientContext }, videoId });

  const responseText = await httpPost(
    'https://www.youtube.com/youtubei/v1/get_transcript?prettyPrint=false',
    requestBody,
    { 'Content-Type': 'application/json' }
  );

  const cueTexts: string[] = [];
  try {
    const payload = JSON.parse(responseText);
    for (const action of payload?.actions || []) {
      const cueGroups = action?.updateEngagementPanelAction?.content?.transcriptRenderer
        ?.body?.transcriptBodyRenderer?.cueGroups || [];
      for (const group of cueGroups) {
        for (const cue of group?.transcriptCueGroupRenderer?.cues || []) {
          const simpleText = cue?.transcriptCueRenderer?.cue?.simpleText;
          if (simpleText) cueTexts.push(simpleText.trim());
        }
      }
    }
  } catch {
    // Unparseable or unexpectedly shaped response: keep whatever was collected
    // so far; an empty collection is reported just below.
  }

  if (cueTexts.length === 0) {
    throw new Error('Innertube returned no transcript data.');
  }

  const transcript = cueTexts.join('\n');
  const filename = `${videoId}-transcript.txt`;
  fs.writeFileSync(path.join(OUTPUT_DIR, filename), transcript, 'utf-8');
  emitProgress(`Transcript saved: ${filename} (${transcript.length} chars, ${cueTexts.length} lines)`);

  return { transcript, downloadUrl: `/api/download/${filename}`, filename, method: 'innertube' };
}
|
|
|
|
|
/** Fields of a caption track entry that this module reads. */
interface CaptionTrack {
  baseUrl?: string;
  languageCode?: string;
  kind?: string;
  name?: { simpleText?: string };
}

/**
 * Returns the JSON array literal that starts at `text[start]` (which must be
 * `[`), found by bracket counting that ignores brackets inside string
 * literals. Returns `null` when the array is never closed.
 */
function sliceBalancedArray(text: string, start: number): string | null {
  let depth = 0;
  let inString = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (ch === '\\') i++; // skip the escaped character
      else if (ch === '"') inString = false;
    } else if (ch === '"') {
      inString = true;
    } else if (ch === '[') {
      depth++;
    } else if (ch === ']' && --depth === 0) {
      return text.slice(start, i + 1);
    }
  }
  return null;
}

/**
 * Finds the first non-empty `"captionTracks"` array embedded in the page HTML.
 * Balanced scanning is tried first so that tracks containing nested arrays
 * still parse; a loose regex match is kept as a last-resort fallback.
 */
function findCaptionTracks(html: string): CaptionTrack[] | null {
  const keyRe = /"captionTracks"\s*:\s*\[/g;
  for (let hit = keyRe.exec(html); hit !== null; hit = keyRe.exec(html)) {
    const raw = sliceBalancedArray(html, hit.index + hit[0].length - 1);
    if (raw === null) continue;
    try {
      const parsed: unknown = JSON.parse(raw);
      if (Array.isArray(parsed) && parsed.length > 0) return parsed as CaptionTrack[];
    } catch {
      // Not valid JSON at this occurrence; try the next one.
    }
  }

  const legacy = /captionTracks.*?(\[.*?\])/s.exec(html)?.[1];
  if (legacy) {
    try {
      const parsed: unknown = JSON.parse(legacy);
      if (Array.isArray(parsed) && parsed.length > 0) return parsed as CaptionTrack[];
    } catch {
      // Fall through to "not found".
    }
  }
  return null;
}

/**
 * Scrapes the YouTube watch page for caption tracks, downloads the preferred
 * track and saves the transcript as `<videoId>-transcript.txt` in `OUTPUT_DIR`.
 *
 * Track preference: manual `en` track, any `en` track, any track whose
 * language code starts with `en`, then the first track listed. The captions
 * are requested as `fmt=json3` first and in the track's default format
 * (parsed as XML) when that yields nothing.
 *
 * @param videoId YouTube video ID.
 * @param emitProgress Callback that receives status lines.
 * @returns The transcript, its download URL and file name.
 * @throws Error when the page looks empty/blocked, no caption track or track
 *   URL is found, or the parsed captions are empty; request and file-write
 *   errors propagate.
 */
async function fetchFromPage(videoId: string, emitProgress: ProgressEmitter): Promise<ToolResult> {
  emitProgress('Fetching YouTube page for caption tracks...');

  const html = await httpGet(`https://www.youtube.com/watch?v=${videoId}`, {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cookie': 'CONSENT=YES+cb.20210328-17-p0.en+FX+999',
  });

  if (!html || html.length < 1000) {
    throw new Error('YouTube returned empty or blocked page.');
  }

  const tracks = findCaptionTracks(html);
  if (!tracks || tracks.length === 0) {
    throw new Error('No caption tracks found in page HTML.');
  }

  const enTrack =
    tracks.find((t) => t.languageCode === 'en' && !t.kind) ??
    tracks.find((t) => t.languageCode === 'en') ??
    tracks.find((t) => t.languageCode?.startsWith('en')) ??
    tracks[0];

  const baseUrl = enTrack?.baseUrl;
  if (!enTrack || !baseUrl) {
    throw new Error('No usable caption track URL.');
  }

  emitProgress(`Found captions: ${enTrack.name?.simpleText || enTrack.languageCode} (${enTrack.kind || 'manual'})`);

  let text = '';
  try {
    const json3Url = baseUrl + (baseUrl.includes('?') ? '&' : '?') + 'fmt=json3';
    const json3Response = await httpGet(json3Url, { 'User-Agent': 'Mozilla/5.0' });
    text = parseJson3Captions(json3Response);
  } catch {
    // Best effort: the default-format request below is the fallback.
  }

  if (!text) {
    const xmlResponse = await httpGet(baseUrl, { 'User-Agent': 'Mozilla/5.0' });
    text = parseXmlCaptions(xmlResponse);
  }

  if (!text.trim()) {
    throw new Error('Caption content is empty after parsing.');
  }

  const fname = `${videoId}-transcript.txt`;
  fs.writeFileSync(path.join(OUTPUT_DIR, fname), text, 'utf-8');
  emitProgress(`Transcript saved: ${fname} (${text.length} chars)`);

  return { transcript: text, downloadUrl: `/api/download/${fname}`, filename: fname, method: 'page-scrape' };
}
|
|
|
|
|
/**
 * Performs an HTTP(S) GET and resolves with the response body as text.
 *
 * - The body is decoded as UTF-8 by the stream, so multi-byte characters that
 *   straddle chunk boundaries are not corrupted.
 * - 3xx responses with a `Location` header are followed; relative locations
 *   are resolved against the current URL, and at most `maxRedirects` hops are
 *   followed.
 * - The HTTP status is otherwise not inspected: error responses still resolve
 *   with their body.
 *
 * @param url Absolute `http:` or `https:` URL.
 * @param headers Extra request headers; they override the default `User-Agent`
 *   and are re-sent on redirects.
 * @param maxRedirects Remaining number of redirects that may be followed.
 * @returns The response body.
 * @throws Rejects on an invalid URL or redirect target, too many redirects,
 *   network errors, or when the socket stays idle for 30 seconds.
 */
function httpGet(url: string, headers: Record<string, string> = {}, maxRedirects = 5): Promise<string> {
  return new Promise((resolve, reject) => {
    const parsed = new URL(url);
    const client = parsed.protocol === 'https:' ? https : http;
    const options = {
      method: 'GET',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        ...headers,
      },
    };

    const req = client.request(parsed, options, (res) => {
      const status = res.statusCode ?? 0;
      const location = res.headers.location;

      if (status >= 300 && status < 400 && location) {
        res.resume(); // discard the redirect body so the socket is released
        if (maxRedirects <= 0) {
          reject(new Error(`Too many redirects while fetching ${url}`));
          return;
        }
        let nextUrl: string;
        try {
          nextUrl = new URL(location, parsed).toString();
        } catch {
          reject(new Error(`Invalid redirect location: ${location}`));
          return;
        }
        httpGet(nextUrl, headers, maxRedirects - 1).then(resolve, reject);
        return;
      }

      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk: string) => { data += chunk; });
      res.on('end', () => resolve(data));
      res.on('error', reject);
    });

    req.setTimeout(30000, () => {
      req.destroy(new Error(`Request timed out: ${url}`));
    });
    req.on('error', reject);
    req.end();
  });
}
|
|
|
/**
 * Performs an HTTP(S) POST with a string body and resolves with the response
 * body as text.
 *
 * The transport (`http` or `https`) and default port follow the URL's scheme.
 * The body is decoded as UTF-8 by the stream, so multi-byte characters that
 * straddle chunk boundaries are not corrupted. The HTTP status is not
 * inspected: error responses still resolve with their body.
 *
 * @param url Absolute `http:` or `https:` URL.
 * @param body Request body; `Content-Length` is its UTF-8 byte length.
 * @param headers Extra request headers; they override the defaults
 *   (`Content-Type: application/json`, `Content-Length`, `User-Agent`).
 * @returns The response body.
 * @throws Rejects on an invalid URL, network errors, or when the socket stays
 *   idle for 30 seconds.
 */
function httpPost(url: string, body: string, headers: Record<string, string> = {}): Promise<string> {
  return new Promise((resolve, reject) => {
    const parsed = new URL(url);
    const client = parsed.protocol === 'https:' ? https : http;
    const options = {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(body),
        'User-Agent': 'Mozilla/5.0',
        ...headers,
      },
    };

    const req = client.request(parsed, options, (res) => {
      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk: string) => { data += chunk; });
      res.on('end', () => resolve(data));
      res.on('error', reject);
    });

    req.setTimeout(30000, () => {
      req.destroy(new Error(`Request timed out: ${url}`));
    });
    req.on('error', reject);
    req.write(body);
    req.end();
  });
}
|
|
|
|
|
/**
 * Converts WebVTT subtitle text into a plain transcript.
 *
 * Dropped lines: blanks, the `WEBVTT` header, `Kind:` / `Language:` / `NOTE`
 * lines, cue timings (anything containing `-->`) and purely numeric lines.
 * From the remaining lines, markup tags are removed, the `nbsp`, `amp`, `lt`
 * and `gt` character entities are decoded (each entity exactly once), and
 * repeated lines are emitted only the first time they appear.
 *
 * @param vtt Raw WebVTT content.
 * @returns The caption lines joined with `\n`.
 */
function parseVtt(vtt: string): string {
  const entityChars: Record<string, string> = { nbsp: ' ', amp: '&', lt: '<', gt: '>' };
  const seen = new Set<string>();
  const transcript: string[] = [];

  for (const rawLine of vtt.split('\n')) {
    const line = rawLine.trim();
    const isMetadata =
      line === '' ||
      line === 'WEBVTT' ||
      line.includes('-->') ||
      /^\d+$/.test(line) ||
      line.startsWith('Kind:') ||
      line.startsWith('Language:') ||
      line.startsWith('NOTE');
    if (isMetadata) continue;

    const caption = line
      // Tags are stripped before decoding so that a decoded "<" is never
      // mistaken for markup.
      .replace(/<[^>]+>/g, '')
      // The entity names are matched through a capture group and looked up in
      // a table; the earlier literal patterns appear to have been
      // HTML-unescaped at some point, which left them as no-op replacements.
      .replace(/&(nbsp|amp|lt|gt);/g, (entity, name: string) => entityChars[name] ?? entity)
      .replace(/\u00a0/g, ' ')
      .trim();

    if (caption && !seen.has(caption)) {
      seen.add(caption);
      transcript.push(caption);
    }
  }

  return transcript.join('\n');
}
|
|
|
/**
 * Converts SubRip (SRT) subtitle text into a plain transcript.
 *
 * Blank lines, cue timings (anything containing `-->`) and purely numeric
 * lines are skipped; markup tags are removed from the rest, and a caption
 * line is emitted only the first time it appears.
 *
 * @param srt Raw SRT content.
 * @returns The caption lines joined with `\n`.
 */
function parseSrt(srt: string): string {
  const emitted = new Set<string>();
  const kept: string[] = [];

  for (const rawLine of srt.split('\n')) {
    const line = rawLine.trim();
    if (line === '' || line.includes('-->') || /^\d+$/.test(line)) continue;

    const caption = line.replace(/<[^>]+>/g, '').trim();
    if (caption === '' || emitted.has(caption)) continue;

    emitted.add(caption);
    kept.push(caption);
  }

  return kept.join('\n');
}
|
|
|
/**
 * Extracts caption text from XML captions made of `<text ...>...</text>`
 * elements.
 *
 * Each element's content is passed through `decodeEntities`, then any tags in
 * the decoded text are removed and the result is trimmed; elements that end up
 * empty are skipped.
 *
 * @param xml Raw XML caption document.
 * @returns The caption texts joined with `\n`.
 */
function parseXmlCaptions(xml: string): string {
  return Array.from(xml.matchAll(/<text[^>]*>([\s\S]*?)<\/text>/g), (match) =>
    decodeEntities(match[1]).replace(/<[^>]+>/g, '').trim(),
  )
    .filter((caption) => caption !== '')
    .join('\n');
}
|
|
|
/**
 * Extracts caption text from a `json3` caption document.
 *
 * For every entry of `events` that has `segs`, the segments' `utf8` strings
 * are concatenated and trimmed; entries that end up empty are skipped.
 *
 * @param json Raw JSON text.
 * @returns The caption lines joined with `\n`, or `''` when the input is not
 *   valid JSON, has no `events`, or has an unexpected shape.
 */
function parseJson3Captions(json: string): string {
  try {
    const events = JSON.parse(json)?.events;
    if (!events) return '';

    return events
      .map((event: any) =>
        event.segs ? event.segs.map((seg: any) => seg.utf8 || '').join('').trim() : '',
      )
      // After trimming, a line can never be a lone "\n", so testing for the
      // empty string is sufficient.
      .filter((line: string) => line !== '')
      .join('\n');
  } catch {
    return '';
  }
}
|
|
|
/**
 * Decodes character entities in caption text.
 *
 * Handled: the named entities `amp`, `lt`, `gt`, `quot`, `apos`, `nbsp`
 * (decoded to a regular space), and decimal / hexadecimal numeric references,
 * including code points above U+FFFF. Unknown entities and out-of-range
 * numeric references are left untouched.
 *
 * The ampersand entity is decoded first, before everything else, so doubly
 * escaped input (an escaped ampersand followed by `#39;`) ends up fully
 * decoded. This keeps the ordering of the previous implementation; presumably
 * it exists because caption XML can carry doubly escaped text (TODO confirm
 * against real responses).
 *
 * @param str Text possibly containing entities.
 * @returns The decoded text.
 */
function decodeEntities(str: string): string {
  // Entity names are matched through groups and resolved via this table rather
  // than spelled out as literal entities; the earlier literal patterns appear
  // to have been HTML-unescaped at some point, leaving no-op replacements.
  const named: Record<string, string> = { lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ' };

  return str
    .replace(/&(?:amp);/g, '&')
    .replace(/&(#[xX][0-9a-fA-F]+|#\d+|lt|gt|quot|apos|nbsp);/g, (entity: string, body: string) => {
      if (!body.startsWith('#')) return named[body] ?? entity;

      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws for values beyond the Unicode range.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    });
}
|
|
|