Spaces:
Running
on
Zero
Running
on
Zero
<!doctype html>
<!-- The doctype keeps browsers in standards mode; the viewport meta lets the
     stylesheet's max-width: 720px media query take effect on phones. -->
<html lang="en">
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>VibeVoice-Realtime TTS Demo</title>
| <style> | |
| :root { | |
| --bg: #f5f7fc; | |
| --surface: #ffffff; | |
| --accent: #5562ff; | |
| --accent-strong: #3f4dff; | |
| --text-primary: #1f2742; | |
| --text-muted: #5d6789; | |
| --border: rgba(85, 98, 255, 0.18); | |
| --shadow: 0 18px 45px rgba(31, 39, 66, 0.08); | |
| } | |
.helper-text {
  font-size: 12px;
  /* Reuse the muted text colour: the previous #8a93b5 was only about 3:1
     against the white surface, too faint for 12px body text. */
  color: var(--text-muted);
}
| * { | |
| box-sizing: border-box; | |
| } | |
| body { | |
| margin: 0; | |
| background: var(--bg); | |
| font-family: 'Inter', 'Segoe UI', Roboto, Helvetica, sans-serif; | |
| color: var(--text-primary); | |
| display: flex; | |
| justify-content: center; | |
| padding: 48px 20px; | |
| } | |
| .app-shell { | |
| width: min(960px, 100%); | |
| background: var(--surface); | |
| border-radius: 20px; | |
| padding: 36px 40px 44px; | |
| box-shadow: var(--shadow); | |
| display: flex; | |
| flex-direction: column; | |
| gap: 28px; | |
| } | |
| h1 { | |
| margin: 0; | |
| text-align: center; | |
| font-size: 30px; | |
| font-weight: 700; | |
| letter-spacing: 0.01em; | |
| } | |
| .panel { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 10px; | |
| } | |
| .field { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 8px; | |
| } | |
| .field-label { | |
| font-weight: 600; | |
| font-size: 15px; | |
| color: var(--text-primary); | |
| } | |
| .text-input { | |
| width: 100%; | |
| min-height: 140px; | |
| max-height: 240px; | |
| border: 1px solid rgba(31, 39, 66, 0.14); | |
| border-radius: 12px; | |
| padding: 14px 16px; | |
| font-size: 15px; | |
| line-height: 1.6; | |
| font-family: inherit; | |
| background: #f9faff; | |
| transition: border-color 0.2s, box-shadow 0.2s; | |
| resize: vertical; | |
| } | |
| .text-input:focus { | |
| outline: none; | |
| border-color: var(--accent); | |
| box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18); | |
| background: #fff; | |
| } | |
| #streamingPreviewContainer { | |
| border-radius: 14px; | |
| border: 1px solid var(--border); | |
| background: linear-gradient(135deg, #eef2ff 0%, #f7f9ff 100%); | |
| padding: 18px 20px; | |
| box-shadow: inset 0 1px 2px rgba(85, 98, 255, 0.12); | |
| } | |
| #streamingPreviewHeader { | |
| font-weight: 600; | |
| color: var(--text-primary); | |
| display: flex; | |
| align-items: center; | |
| gap: 10px; | |
| font-size: 14px; | |
| margin-bottom: 8px; | |
| } | |
| #streamingPreviewNote { | |
| font-weight: 400; | |
| font-size: 12px; | |
| color: var(--text-muted); | |
| } | |
| #streamingPreview { | |
| min-height: 70px; | |
| padding: 10px 12px; | |
| border-radius: 10px; | |
| background: rgba(255, 255, 255, 0.9); | |
| border: 1px solid rgba(85, 98, 255, 0.25); | |
| font-family: 'Courier New', Courier, monospace; | |
| font-size: 14px; | |
| line-height: 1.5; | |
| color: var(--text-primary); | |
| white-space: pre-wrap; | |
| } | |
| #streamingPreview.streaming-active::after { | |
| content: ""; | |
| display: inline-block; | |
| width: 2px; | |
| height: 1.1em; | |
| background: var(--accent); | |
| margin-left: 2px; | |
| animation: previewCaret 0.9s steps(1) infinite; | |
| vertical-align: bottom; | |
| } | |
| @keyframes previewCaret { | |
| 0%, 50% { | |
| opacity: 1; | |
| } | |
| 51%, 100% { | |
| opacity: 0; | |
| } | |
| } | |
| .control-panel { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 18px; | |
| } | |
| .inline-field { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 6px; | |
| } | |
| .select-control { | |
| width: 220px; | |
| border: 1px solid rgba(31, 39, 66, 0.14); | |
| border-radius: 10px; | |
| padding: 8px 12px; | |
| font-size: 14px; | |
| font-family: inherit; | |
| background: #fbfcff; | |
| color: var(--text-primary); | |
| transition: border-color 0.2s, box-shadow 0.2s; | |
| } | |
| .select-control:focus { | |
| outline: none; | |
| border-color: var(--accent); | |
| box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18); | |
| background: #fff; | |
| } | |
| .control-row { | |
| display: flex; | |
| align-items: center; | |
| flex-wrap: wrap; | |
| gap: 20px 28px; | |
| } | |
| .range-control { | |
| display: flex; | |
| align-items: center; | |
| gap: 12px; | |
| font-size: 14px; | |
| color: var(--text-primary); | |
| } | |
| .range-control input[type="range"] { | |
| width: 200px; | |
| accent-color: var(--accent); | |
| } | |
| .range-value { | |
| font-weight: 600; | |
| color: var(--text-primary); | |
| min-width: 42px; | |
| text-align: right; | |
| } | |
| #playback { | |
| background: var(--accent); | |
| color: #fff; | |
| border: none; | |
| padding: 10px 24px; | |
| border-radius: 999px; | |
| cursor: pointer; | |
| font-weight: 600; | |
| font-size: 14px; | |
| box-shadow: 0 8px 16px rgba(85, 98, 255, 0.25); | |
| transition: transform 0.15s, box-shadow 0.15s, background 0.15s; | |
| } | |
| #playback:hover { | |
| transform: translateY(-1px); | |
| box-shadow: 0 10px 20px rgba(85, 98, 255, 0.28); | |
| } | |
| #playback:active { | |
| transform: translateY(0); | |
| } | |
| #playback.playing { | |
| background: var(--accent-strong); | |
| } | |
| .secondary-btn { | |
| border: 1px solid rgba(31, 39, 66, 0.18); | |
| background: #f1f3ff; | |
| color: var(--text-primary); | |
| padding: 8px 18px; | |
| border-radius: 999px; | |
| cursor: pointer; | |
| font-size: 13px; | |
| font-weight: 500; | |
| transition: background 0.15s, border-color 0.15s; | |
| } | |
| .secondary-btn:hover { | |
| background: #e6e9ff; | |
| border-color: rgba(31, 39, 66, 0.26); | |
| } | |
| .secondary-btn:disabled { | |
| opacity: 0.55; | |
| cursor: not-allowed; | |
| } | |
| .metrics { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 16px 32px; | |
| font-size: 14px; | |
| color: var(--text-muted); | |
| } | |
| .metrics span { | |
| display: flex; | |
| align-items: baseline; | |
| gap: 6px; | |
| } | |
| .metrics span strong { | |
| color: var(--text-primary); | |
| font-weight: 600; | |
| } | |
| .metric-unit { | |
| color: var(--text-muted); | |
| font-size: 13px; | |
| } | |
| #logOutput { | |
| max-height: 260px; | |
| overflow-y: auto; | |
| background: #f7f9ff; | |
| color: var(--text-primary); | |
| padding: 16px 18px; | |
| border: 1px solid rgba(31, 39, 66, 0.12); | |
| border-radius: 12px; | |
| font-size: 13px; | |
| line-height: 1.6; | |
| box-shadow: inset 0 1px 2px rgba(15, 23, 42, 0.06); | |
| font-family: 'Fira Code', 'Courier New', Courier, monospace; | |
| margin-top: 0px; | |
| } | |
| @media (max-width: 720px) { | |
| .app-shell { | |
| padding: 28px 20px 36px; | |
| gap: 24px; | |
| } | |
| .select-control { | |
| width: 100%; | |
| } | |
| .control-row { | |
| flex-direction: column; | |
| align-items: flex-start; | |
| gap: 16px; | |
| } | |
| #playback { | |
| width: 100%; | |
| text-align: center; | |
| } | |
| } | |
| </style> | |
| <body> | |
<!-- Main application card: prompt input, streaming preview, synthesis
     controls, playback metrics and the runtime log. -->
<div class="app-shell">
  <h1>VibeVoice-Realtime TTS Demo</h1>
  <section class="panel">
    <label class="field">
      <span class="field-label">Text</span>
      <textarea
        id="prompt"
        class="text-input"
        rows="4"
      >Enter your text here and click "Start" to instantly hear the VibeVoice-Realtime TTS output audio.</textarea>
    </label>
    <div id="streamingPreviewContainer">
      <div id="streamingPreviewHeader">
        <span>Streaming Input Text</span>
      </div>
      <!-- Content stays on one line: #streamingPreview uses white-space: pre-wrap. -->
      <div id="streamingPreview" aria-live="polite">This area will display the streaming input text in real time.</div>
    </div>
  </section>
  <span class="helper-text">This demo requires the full text to be provided upfront. The model then receives the text via streaming input during synthesis.<br>
    For non-punctuation special characters, applying text normalization before processing often yields better results.</span>
  <section class="panel control-panel">
    <div class="inline-field">
      <!-- A real <label> gives the select an accessible name. -->
      <label class="field-label" for="voiceSelect">Speaker</label>
      <select id="voiceSelect" class="select-control">
        <option value="">Loading...</option>
      </select>
    </div>
    <div class="control-row">
      <label class="range-control">
        <span>CFG</span>
        <input id="cfgScale" type="range" min="1" max="3" step="0.05" value="1.5" />
        <span class="range-value" id="cfgValue">1.5</span>
      </label>
      <label class="range-control">
        <span>Inference Steps</span>
        <input id="inferenceSteps" type="range" min="1" max="20" step="1" value="5" />
        <span class="range-value" id="stepsValue">5</span>
      </label>
      <button id="resetControls" type="button" class="secondary-btn">Reset Controls</button>
    </div>
    <div class="control-row">
      <!-- Explicit type, matching the other buttons, so it never acts as a submit button. -->
      <button id="playback" type="button">Start</button>
      <button id="saveAudio" type="button" class="secondary-btn" disabled>Save</button>
    </div>
  </section>
  <section class="panel">
    <div class="metrics">
      <span>Model Generated Audio<strong id="modelGenerated">0.00</strong><span class="metric-unit">s</span></span>
      <span>Audio Played<strong id="playbackElapsed">0.00</strong><span class="metric-unit">s</span></span>
    </div>
  </section>
  <section class="panel">
    <span class="field-label">Runtime Logs</span>
    <pre id="logOutput"></pre>
  </section>
</div>
| <script> | |
| (() => { | |
// ---- Audio format / buffering constants --------------------------------
const SAMPLE_RATE = 24_000; // Hz: AudioContext rate, WAV header rate, elapsed-time divisor
const BUFFER_SIZE = 2048; // frames per ScriptProcessor callback
const PREBUFFER_SEC = 0.1; // seconds of audio queued before playback starts

// ---- DOM references -----------------------------------------------------
const byId = id => document.getElementById(id);
const promptInput = byId('prompt');
const streamingPreview = byId('streamingPreview');
const controlBtn = byId('playback');
const cfgSelect = byId('cfgScale');
const stepsSelect = byId('inferenceSteps');
const voiceSelect = byId('voiceSelect');
const cfgValueLabel = byId('cfgValue');
const stepsValueLabel = byId('stepsValue');
const modelGeneratedLabel = byId('modelGenerated');
const playbackElapsedLabel = byId('playbackElapsed');
const logOutput = byId('logOutput');
const resetBtn = byId('resetControls');
const saveBtn = byId('saveAudio');

// ---- Audio pipeline state -----------------------------------------------
let audioCtx = null;
let scriptNode = null;
let socket = null;
let buffer = new Float32Array(0); // decoded samples waiting to be played
let isPlaying = false;
let hasStartedPlayback = false;
let silentFrameCount = 0;

// ---- Playback metrics ---------------------------------------------------
let playbackTimer = null;
let lastPlaybackElapsed = 0;
let playbackSamples = 0;
let modelGeneratedTotal = 0;

// ---- Logging state ------------------------------------------------------
let firstBrowserChunkLogged = false;
let playbackStartedLogged = false;
const logEntries = [];
let logSequence = 0;

// ---- Recording (for the Save button) -------------------------------------
let recordedChunks = []; // raw ArrayBuffers as received from the socket
let recordedSamples = 0;
let recordingComplete = false;
let downloadUrl = null;
/** Releases the object URL held in `downloadUrl`, if any, and clears it. */
const revokeDownloadUrl = () => {
  if (!downloadUrl) {
    return;
  }
  URL.revokeObjectURL(downloadUrl);
  downloadUrl = null;
};
/**
 * Enables the Save button only when at least one sample has been recorded
 * and the recording has been marked complete. No-op if the button is absent.
 */
const updateSaveButtonState = () => {
  if (saveBtn) {
    const hasAudio = recordedSamples !== 0;
    saveBtn.disabled = !(hasAudio && recordingComplete);
  }
};
/**
 * Discards the recorded audio: drops the chunk list, zeroes the sample
 * count, clears the completion flag, releases any pending download URL and
 * refreshes the Save button.
 */
const clearRecordedChunks = () => {
  revokeDownloadUrl();
  recordedChunks = [];
  recordedSamples = 0;
  recordingComplete = false;
  updateSaveButtonState();
};
/**
 * Assembles the recorded 16-bit mono PCM chunks into a WAV file.
 *
 * @returns {Blob|null} An `audio/wav` blob, or null when nothing has been
 *   recorded yet.
 */
const createWavBlob = () => {
  if (!recordedSamples) {
    return null;
  }
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;
  const dataBytes = recordedSamples * BYTES_PER_SAMPLE;
  const wavBuffer = new ArrayBuffer(HEADER_BYTES + dataBytes);
  const view = new DataView(wavBuffer);
  const writeString = (offset, str) => {
    for (let i = 0; i < str.length; i += 1) {
      view.setUint8(offset + i, str.charCodeAt(i));
    }
  };

  // RIFF container header.
  writeString(0, 'RIFF');
  view.setUint32(4, 36 + dataBytes, true);
  writeString(8, 'WAVE');
  // "fmt " sub-chunk: PCM, 1 channel, SAMPLE_RATE Hz, 16 bits per sample.
  writeString(12, 'fmt ');
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true);
  view.setUint16(22, 1, true);
  view.setUint32(24, SAMPLE_RATE, true);
  view.setUint32(28, SAMPLE_RATE * BYTES_PER_SAMPLE, true);
  view.setUint16(32, BYTES_PER_SAMPLE, true);
  view.setUint16(34, 16, true);
  // "data" sub-chunk.
  writeString(36, 'data');
  view.setUint32(40, dataBytes, true);

  const pcmData = new Int16Array(wavBuffer, HEADER_BYTES, recordedSamples);
  let offset = 0;
  for (const chunk of recordedChunks) {
    // An Int16Array over a buffer with an odd byte length throws a
    // RangeError unless the element count is given, so only whole samples
    // are viewed. The count is also clamped so a bookkeeping mismatch can
    // never write past the end of the data section.
    const wholeSamples = Math.floor(chunk.byteLength / BYTES_PER_SAMPLE);
    const count = Math.min(wholeSamples, recordedSamples - offset);
    if (count <= 0) {
      continue;
    }
    pcmData.set(new Int16Array(chunk, 0, count), offset);
    offset += count;
  }
  return new Blob([wavBuffer], { type: 'audio/wav' });
};
/** Mirrors the CFG slider value into its label with three decimals. */
const updateCfgDisplay = () => {
  const cfg = Number(cfgSelect.value);
  cfgValueLabel.textContent = cfg.toFixed(3);
};

/** Mirrors the inference-steps slider value into its label. */
const updateStepsDisplay = () => {
  const steps = Number(stepsSelect.value);
  stepsValueLabel.textContent = steps.toString();
};

// Keep the labels in sync while dragging, and render the initial values.
cfgSelect.addEventListener('input', updateCfgDisplay);
stepsSelect.addEventListener('input', updateStepsDisplay);
updateCfgDisplay();
updateStepsDisplay();
/** Left-pads a value with zeros to at least two characters. */
const pad2 = value => {
  return value.toString().padStart(2, '0');
};

/** Left-pads a value with zeros to at least three characters. */
const pad3 = value => {
  return value.toString().padStart(3, '0');
};

/**
 * Formats the current local time as `YYYY-MM-DD HH:MM:SS.mmm`.
 * @returns {string}
 */
const formatLocalTimestamp = () => {
  const now = new Date();
  const datePart = [
    now.getFullYear(),
    pad2(now.getMonth() + 1), // getMonth() is zero-based
    pad2(now.getDate()),
  ].join('-');
  const timePart = [
    pad2(now.getHours()),
    pad2(now.getMinutes()),
    pad2(now.getSeconds()),
  ].join(':');
  return `${datePart} ${timePart}.${pad3(now.getMilliseconds())}`;
};
/**
 * Renders a duration with two decimals.
 * @param {*} raw Anything `Number()` can coerce.
 * @returns {string} The fixed-point text, or '0.00' for non-finite input.
 */
const formatSeconds = raw => {
  const seconds = Number(raw);
  if (!Number.isFinite(seconds)) {
    return '0.00';
  }
  return seconds.toFixed(2);
};
/**
 * Converts a log timestamp into a Date.
 *
 * Falsy input yields the current time. Text containing a
 * `YYYY-MM-DD HH:MM:SS.mmm` stamp has its first space replaced by `T`
 * before being handed to the Date constructor; anything else is passed
 * through unchanged.
 *
 * @param {string} [value]
 * @returns {Date} May be an Invalid Date when the text cannot be parsed.
 */
const parseTimestamp = value => {
  if (!value) {
    return new Date();
  }
  const hasSpaceSeparatedStamp = /\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}/.test(value);
  const parseable = hasSpaceSeparatedStamp ? value.replace(' ', 'T') : value;
  return new Date(parseable);
};
/**
 * Records how many seconds of audio the model has generated and updates the
 * matching label. Non-finite values are ignored; negatives clamp to zero.
 * @param {*} value Seconds, coerced with `Number()`.
 */
const setModelGenerated = value => {
  const seconds = Number(value);
  if (Number.isFinite(seconds)) {
    modelGeneratedTotal = Math.max(0, seconds);
    modelGeneratedLabel.textContent = formatSeconds(modelGeneratedTotal);
  }
};
/**
 * Stores and displays the played-audio duration, clamped to the range
 * [0, modelGeneratedTotal].
 * @param {number} value Elapsed playback time in seconds.
 */
const setPlaybackElapsed = value => {
  const nonNegative = Math.max(0, value);
  lastPlaybackElapsed = Math.min(modelGeneratedTotal, nonNegative);
  playbackElapsedLabel.textContent = formatSeconds(lastPlaybackElapsed);
};
// ---- Streaming-text preview ---------------------------------------------
// The preview reveals one token per tick at STREAMING_WPM tokens per minute.
const STREAMING_WPM = 180;
const STREAMING_INTERVAL_MS = (60 * 1000) / STREAMING_WPM;
let previewTokens = []; // tokens of the current prompt
let previewIndex = 0; // next token to reveal
let previewTimeoutId = null; // pending setTimeout handle, if any
let previewActive = false; // true while a synthesis session drives the preview
/** Cancels the pending preview tick, if one is scheduled. */
const clearPreviewTimer = () => {
  if (!previewTimeoutId) {
    return;
  }
  clearTimeout(previewTimeoutId);
  previewTimeoutId = null;
};
/**
 * Puts the preview area in its idle look: removes the caret class and
 * replaces the content with `message`. No-op if the element is absent.
 * @param {string} message Text to show.
 */
const setPreviewIdle = message => {
  if (streamingPreview) {
    streamingPreview.classList.remove('streaming-active');
    streamingPreview.textContent = message;
  }
};
/**
 * Reveals the next preview token and re-arms itself after
 * STREAMING_INTERVAL_MS. When every token has been shown it removes the
 * caret class and stops scheduling.
 */
const schedulePreviewTick = () => {
  if (!streamingPreview) {
    return;
  }
  const exhausted = previewIndex >= previewTokens.length;
  if (exhausted) {
    streamingPreview.classList.remove('streaming-active');
    return;
  }
  const nextToken = previewTokens[previewIndex];
  previewIndex += 1;
  streamingPreview.classList.add('streaming-active');
  streamingPreview.textContent += nextToken;
  previewTimeoutId = setTimeout(schedulePreviewTick, STREAMING_INTERVAL_MS);
};
/**
 * Restarts the preview from the current prompt text: cancels any pending
 * tick, splits the prompt (trailing whitespace removed) into tokens that
 * each keep their following whitespace, empties the preview and starts
 * ticking.
 */
const updateStreamingPreview = () => {
  if (!streamingPreview) {
    return;
  }
  clearPreviewTimer();
  const text = (promptInput?.value || '').trimEnd();
  const tokens = text.match(/\S+\s*/g);
  previewTokens = tokens || [];
  previewIndex = 0;
  streamingPreview.textContent = '';
  schedulePreviewTick();
};
/**
 * Empties the on-screen log and the backing entry list, and resets the
 * generated-audio counter to zero.
 */
const clearLogs = () => {
  logEntries.length = 0;
  if (logOutput) {
    logOutput.textContent = '';
  }
  // setModelGenerated(0) zeroes modelGeneratedTotal and refreshes its label.
  setModelGenerated(0);
};
/**
 * Adds a line to the runtime log and re-renders it in chronological order.
 *
 * Entries are ordered by timestamp, with insertion order as the tie-breaker,
 * and only the newest 400 are kept. Does nothing when the log element is
 * absent.
 *
 * @param {string} message Text of the log line.
 * @param {string} [timestamp] Timestamp text to display; defaults to the
 *   current local time.
 */
const appendLog = (message, timestamp) => {
  if (!logOutput) {
    return;
  }
  const MAX_LOG_ENTRIES = 400;
  const finalTimestamp = timestamp || formatLocalTimestamp();

  // An unparseable timestamp produces an Invalid Date whose getTime() is NaN.
  // A NaN comparator result makes Array.prototype.sort order inconsistent,
  // so such entries are ordered by arrival time instead (the original text
  // is still what gets displayed).
  let date = parseTimestamp(finalTimestamp);
  if (Number.isNaN(date.getTime())) {
    date = new Date();
  }

  logSequence += 1;
  logEntries.push({
    timestamp: finalTimestamp,
    date,
    message,
    seq: logSequence,
  });
  logEntries.sort((a, b) => {
    const diff = a.date.getTime() - b.date.getTime();
    return diff !== 0 ? diff : a.seq - b.seq;
  });
  if (logEntries.length > MAX_LOG_ENTRIES) {
    logEntries.splice(0, logEntries.length - MAX_LOG_ENTRIES);
  }
  logOutput.textContent = logEntries
    .map(item => `[${item.timestamp}] ${item.message}`)
    .join('\n');
  // Keep the newest line visible.
  logOutput.scrollTop = logOutput.scrollHeight;
};
/**
 * Click handler for the Save button: builds a WAV blob from the recorded
 * audio and triggers a download through a temporary anchor element. The
 * object URL is kept in `downloadUrl` so it can be revoked later.
 */
const handleSaveClick = () => {
  if (!recordedSamples) {
    appendLog('[Frontend] Save requested but no audio received yet');
    return;
  }
  const wavBlob = createWavBlob();
  if (!wavBlob) {
    appendLog('[Error] Failed to assemble WAV data for download');
    return;
  }

  revokeDownloadUrl();
  downloadUrl = URL.createObjectURL(wavBlob);

  // ':' and '.' in the ISO timestamp are replaced with '-' for the file name.
  const stamp = new Date().toISOString().replace(/[:.]/g, '-');
  const anchor = document.createElement('a');
  anchor.href = downloadUrl;
  anchor.download = `vibevoice_realtime_audio_${stamp}.wav`;

  document.body.appendChild(anchor);
  anchor.click();
  document.body.removeChild(anchor);
  appendLog('[Frontend] Audio download triggered');
};
/** Stops the interval that refreshes the played-audio label, if running. */
const stopPlaybackTimer = () => {
  if (!playbackTimer) {
    return;
  }
  clearInterval(playbackTimer);
  playbackTimer = null;
};
/**
 * (Re)starts a 250 ms interval that converts the played sample count to
 * seconds and pushes it to the played-audio label.
 */
const startPlaybackTimer = () => {
  stopPlaybackTimer();
  const refreshElapsed = () => {
    setPlaybackElapsed(playbackSamples / SAMPLE_RATE);
  };
  playbackTimer = setInterval(refreshElapsed, 250);
};
/**
 * Fetches `/config` and fills the speaker dropdown from its `voices` array,
 * preselecting `default_voice` when it is one of the listed voices.
 *
 * On an empty list or any failure the dropdown is left disabled with a
 * single placeholder option and an error line is logged.
 */
const loadVoices = async () => {
  // Replaces all options with one disabled placeholder entry.
  const showPlaceholder = label => {
    voiceSelect.innerHTML = '';
    const option = document.createElement('option');
    option.value = '';
    option.textContent = label;
    voiceSelect.appendChild(option);
    voiceSelect.disabled = true;
  };

  try {
    voiceSelect.disabled = true;
    const response = await fetch('/config');
    if (!response.ok) {
      throw new Error(`Failed to fetch config: ${response.status}`);
    }
    const data = await response.json();
    const voices = Array.isArray(data.voices) ? data.voices : [];

    if (voices.length === 0) {
      showPlaceholder('No voices available');
      appendLog('[Error] No voice presets available');
      return;
    }

    voiceSelect.innerHTML = '';
    for (const voice of voices) {
      const option = document.createElement('option');
      option.value = voice;
      option.textContent = voice;
      voiceSelect.appendChild(option);
    }
    const preferred = data.default_voice;
    if (preferred && voices.includes(preferred)) {
      voiceSelect.value = preferred;
    }
    voiceSelect.disabled = false;
    appendLog(`[Frontend] Loaded ${voices.length} voice presets`);
  } catch (err) {
    console.error('Failed to load voices', err);
    showPlaceholder('Load failed');
    appendLog('[Error] Failed to load voice presets');
  }
};
loadVoices();
// "Reset Controls": restore both sliders to their defaults and refresh labels.
resetBtn.addEventListener('click', () => {
  cfgSelect.value = '1.5';
  updateCfgDisplay();
  stepsSelect.value = '5';
  updateStepsDisplay();
  appendLog('[Frontend] Controls reset to defaults (CFG=1.5, Steps=5)');
});
// While a session is active, editing the prompt restarts the preview.
promptInput?.addEventListener('input', () => {
  if (!previewActive) {
    return;
  }
  updateStreamingPreview();
});
/**
 * Handles a text frame from the stream socket.
 *
 * The frame is expected to be JSON of the form
 * `{ type: 'log', event, data, timestamp }`. Unparseable text and payloads
 * of any other type are written to the log verbatim. Known events are
 * turned into log lines; `model_progress` only updates the generated-audio
 * counter, and `backend_stream_complete` also marks the recording complete.
 *
 * @param {string} raw Text received from the socket.
 */
const handleLogMessage = raw => {
  let payload;
  try {
    payload = JSON.parse(raw);
  } catch (err) {
    appendLog(`[Error] Failed to parse log message: ${raw}`);
    return;
  }
  if (!payload || payload.type !== 'log') {
    appendLog(`[Log] ${raw}`);
    return;
  }

  const { event, timestamp } = payload;
  // A destructuring default only replaces `undefined`; JSON can carry
  // `"data": null`, which would make every `data.x` access below throw.
  const data = payload.data ?? {};

  switch (event) {
    case 'backend_request_received':
      appendLog(`[Backend] Received request`, timestamp);
      break;
    case 'backend_first_chunk_sent':
      appendLog('[Backend] Sent first audio chunk', timestamp);
      break;
    case 'model_progress':
      // Progress is shown in the metrics row only, never as a log line.
      if (typeof data.generated_sec !== 'undefined') {
        const generated = Number(data.generated_sec);
        if (Number.isFinite(generated)) {
          setModelGenerated(generated);
        }
      }
      return;
    case 'generation_error':
      appendLog(`[Error] Generation error: ${data.message || 'Unknown error'}`, timestamp);
      break;
    case 'backend_error':
      appendLog(`[Error] Backend error: ${data.message || 'Unknown error'}`, timestamp);
      break;
    case 'client_disconnected':
      appendLog('[Frontend] Client disconnected', timestamp);
      break;
    case 'backend_stream_complete':
      appendLog('[Backend] Backend finished', timestamp);
      recordingComplete = true;
      updateSaveButtonState();
      break;
    default:
      appendLog(`[Log] Event ${event}`, timestamp);
      break;
  }
};
/** Syncs the main button's caption and `playing` class with `isPlaying`. */
const updateButtonLabel = () => {
  const caption = isPlaying ? 'Stop' : 'Start';
  controlBtn.textContent = caption;
  controlBtn.classList.toggle('playing', isPlaying);
};
/**
 * Appends decoded samples to the playback queue by allocating a new array
 * holding the old queue followed by `chunk`.
 * @param {Float32Array} chunk Samples to enqueue.
 */
const appendAudio = chunk => {
  const queued = buffer.length;
  const combined = new Float32Array(queued + chunk.length);
  combined.set(buffer);
  combined.set(chunk, queued);
  buffer = combined;
};
/**
 * Removes up to `frameCount` samples from the front of the playback queue.
 *
 * @param {number} frameCount Number of samples wanted.
 * @returns {Float32Array} Exactly `frameCount` samples; when the queue holds
 *   fewer, the remainder is zero-filled.
 */
const pullAudio = frameCount => {
  const queued = buffer.length;

  // More than enough queued: hand out a view and keep the rest.
  if (queued > frameCount) {
    const head = buffer.subarray(0, frameCount);
    buffer = buffer.subarray(frameCount);
    return head;
  }

  // Nothing queued: pure silence, queue untouched.
  if (queued === 0) {
    return new Float32Array(frameCount);
  }

  // The queue is drained completely by this call.
  const drained = buffer;
  buffer = new Float32Array(0);
  if (drained.length === frameCount) {
    return drained;
  }
  const padded = new Float32Array(frameCount);
  padded.set(drained);
  return padded;
};
/** Closes the stream socket if it is open or connecting, then forgets it. */
const closeSocket = () => {
  const state = socket?.readyState;
  if (state === WebSocket.OPEN || state === WebSocket.CONNECTING) {
    socket.close();
  }
  socket = null;
};
/**
 * Empties the playback queue and clears the per-session flags.
 * @param {boolean} [resetSamples=true] Also zero the played-sample counter
 *   and the played-audio label.
 */
const resetPlaybackFlags = (resetSamples = true) => {
  buffer = new Float32Array(0);
  if (resetSamples) {
    playbackSamples = 0;
    setPlaybackElapsed(0);
  }
  silentFrameCount = 0;
  hasStartedPlayback = false;
  playbackStartedLogged = false;
  firstBrowserChunkLogged = false;
};
/**
 * Disconnects the ScriptProcessor node and closes the AudioContext, then
 * drops both references. Failures are logged as warnings and never thrown.
 */
const teardownAudio = () => {
  if (scriptNode) {
    // Detach the callback first so no further audio frames are processed.
    scriptNode.onaudioprocess = null;
    try {
      scriptNode.disconnect();
    } catch (err) {
      console.warn('disconnect error', err);
    }
  }
  if (audioCtx && audioCtx.state !== 'closed') {
    try {
      // close() reports failure through the returned promise, which a
      // try/catch alone cannot observe; attach a handler so a rejection
      // does not surface as an unhandled promise rejection.
      const closing = audioCtx.close();
      closing?.catch?.(err => console.warn('audioCtx.close error', err));
    } catch (err) {
      console.warn('audioCtx.close error', err);
    }
  }
  audioCtx = null;
  scriptNode = null;
};
/**
 * Returns the player to idle: closes the socket, tears down the audio graph,
 * clears the playback flags, marks playback as stopped and stops the
 * elapsed-time timer.
 * @param {boolean} [resetSamples=true] Forwarded to `resetPlaybackFlags`.
 */
const resetState = (resetSamples = true) => {
  closeSocket();
  teardownAudio();
  resetPlaybackFlags(resetSamples);
  stopPlaybackTimer();
  isPlaying = false;
};
/**
 * Builds a fresh AudioContext → ScriptProcessor → destination chain.
 *
 * The processor callback outputs silence until either PREBUFFER_SEC worth of
 * samples are queued or the socket is gone, then feeds queued samples to the
 * output. Once the socket is closed, the queue is empty and four consecutive
 * all-zero frames have been emitted, it calls `stop()`.
 */
const createAudioChain = () => {
  teardownAudio();
  resetPlaybackFlags();

  const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
  audioCtx = new AudioContextCtor({ sampleRate: SAMPLE_RATE });
  scriptNode = audioCtx.createScriptProcessor(BUFFER_SIZE, 0, 1);
  const minBufferSamples = Math.floor(audioCtx.sampleRate * PREBUFFER_SEC);

  const isSocketFinished = () =>
    !socket
    || socket.readyState === WebSocket.CLOSED
    || socket.readyState === WebSocket.CLOSING;

  scriptNode.onaudioprocess = event => {
    const output = event.outputBuffer.getChannelData(0);
    const socketClosed = isSocketFinished();

    // Prebuffer gate: hold playback back until enough audio is queued.
    if (!hasStartedPlayback) {
      const readyToPlay = buffer.length >= minBufferSamples || socketClosed;
      if (!readyToPlay) {
        output.fill(0);
        return;
      }
      hasStartedPlayback = true;
      if (!playbackStartedLogged) {
        playbackStartedLogged = true;
        appendLog('[Frontend] Browser started to play audio');
        startPlaybackTimer();
      }
    }

    const chunk = pullAudio(output.length);
    output.set(chunk);
    if (hasStartedPlayback) {
      playbackSamples += output.length;
    }

    // End-of-stream detection: count silent frames after the socket closed.
    const streamDrained =
      socketClosed && buffer.length === 0 && chunk.every(sample => sample === 0);
    if (!streamDrained) {
      silentFrameCount = 0;
      return;
    }
    silentFrameCount += 1;
    if (silentFrameCount >= 4) {
      stop();
    }
  };

  scriptNode.connect(audioCtx.destination);
};
/**
 * Starts a synthesis session: resets logs, metrics, recording and audio
 * state, opens the `/stream` WebSocket with the prompt and control values as
 * query parameters, and wires the socket into the playback queue and the
 * recording buffer. Does nothing if a session is already playing.
 */
const start = () => {
  if (isPlaying) {
    return;
  }
  const textValue = promptInput?.value || '';
  const cfgValue = Number(cfgSelect.value);
  const stepsValue = Number(stepsSelect.value);
  const voiceValue = voiceSelect.value || '';
  const cfgValid = Number.isFinite(cfgValue);
  const stepsValid = Number.isFinite(stepsValue);

  clearLogs();
  const cfgDisplay = cfgValid ? cfgValue.toFixed(3) : 'default';
  const stepsDisplay = stepsValid ? stepsValue : 'default';
  appendLog(`[Frontend] Start button clicked, CFG=${cfgDisplay}, Steps=${stepsDisplay}, Speaker=${voiceValue || 'default'}`);

  setModelGenerated(0);
  setPlaybackElapsed(0);
  resetState(true);
  clearRecordedChunks();
  isPlaying = true;
  previewActive = true;
  updateStreamingPreview();
  updateButtonLabel();
  createAudioChain();

  // Only send values that are actually numeric, matching what was logged.
  const params = new URLSearchParams();
  params.set('text', textValue);
  if (cfgValid) {
    params.set('cfg', cfgValue.toFixed(3));
  }
  if (stepsValid) {
    params.set('steps', stepsValue.toString());
  }
  if (voiceValue) {
    params.set('voice', voiceValue);
  }
  const wsUrl = `${location.origin.replace(/^http/, 'ws')}/stream?${params.toString()}`;

  let ws;
  try {
    ws = new WebSocket(wsUrl);
  } catch (err) {
    // The constructor throws synchronously on an unusable URL; without this
    // the UI would stay stuck in the "playing" state.
    console.error('WebSocket error', err);
    appendLog(`[Error] Failed to open WebSocket: ${err?.message || err}`);
    stop();
    return;
  }
  ws.binaryType = 'arraybuffer';
  socket = ws;

  // Handlers belong to this session only. After a quick stop → start the
  // previous socket's events can still fire; without this guard its close
  // event would null the new session's `socket`, and its data or errors
  // would leak into the new session.
  const isCurrent = () => socket === ws;

  ws.onmessage = event => {
    if (!isCurrent()) {
      return;
    }
    if (typeof event.data === 'string') {
      handleLogMessage(event.data);
      return;
    }
    if (!(event.data instanceof ArrayBuffer)) {
      return;
    }
    // Binary frames carry little-endian 16-bit PCM; convert to [-1, 1) floats.
    const rawBuffer = event.data.slice(0);
    const view = new DataView(rawBuffer);
    const floatChunk = new Float32Array(Math.floor(view.byteLength / 2));
    for (let i = 0; i < floatChunk.length; i += 1) {
      floatChunk[i] = view.getInt16(i * 2, true) / 32768;
    }
    appendAudio(floatChunk);
    recordedChunks.push(rawBuffer);
    recordedSamples += floatChunk.length;
    updateSaveButtonState();
    if (!firstBrowserChunkLogged) {
      firstBrowserChunkLogged = true;
      appendLog('[Frontend] Received first audio chunk');
    }
  };

  ws.onerror = err => {
    if (!isCurrent()) {
      return;
    }
    console.error('WebSocket error', err);
    appendLog(`[Error] WebSocket error: ${err?.message || err}`);
    stop();
  };

  ws.onclose = () => {
    if (!isCurrent()) {
      return;
    }
    // Clearing the reference lets the audio callback play out what is queued
    // and then stop.
    socket = null;
    if (recordedSamples > 0) {
      recordingComplete = true;
      updateSaveButtonState();
    }
  };
};
/**
 * Stops the current session. The socket and audio graph are always torn down
 * and the button refreshed; if a session was playing, the final played-audio
 * figure is also published, the stop is logged, any recorded audio is marked
 * complete (enabling Save) and the streaming preview is halted.
 */
const stop = () => {
  const wasPlaying = isPlaying;
  // Keep the played-sample count (resetSamples = false) for the final readout.
  resetState(false);

  if (wasPlaying) {
    // Use the exact sample count rather than the last 250 ms timer tick so
    // the final figure is not stale; setPlaybackElapsed caps it at the
    // generated total.
    setPlaybackElapsed(playbackSamples / SAMPLE_RATE);
    appendLog('[Frontend] Playback stopped');
    if (recordedSamples > 0) {
      recordingComplete = true;
      updateSaveButtonState();
    }
    previewActive = false;
    clearPreviewTimer();
    streamingPreview?.classList.remove('streaming-active');
  }

  updateButtonLabel();
};
// The main button toggles between starting and stopping a session.
controlBtn.addEventListener('click', () => {
  const action = isPlaying ? stop : start;
  action();
});
// Wire the Save button (when present) and render the initial button states.
saveBtn?.addEventListener('click', handleSaveClick);
updateButtonLabel();
updateSaveButtonState();
// Release the socket, audio graph, preview timer and download URL on unload.
window.addEventListener('beforeunload', function releaseOnUnload() {
  resetState();
  clearPreviewTimer();
  revokeDownloadUrl();
});
| })(); | |
| </script> | |
| </body> | |
| </html> | |