All checks were successful
/ build (push) Successful in 3m58s
Plays the just-finished time via Web Audio API on index.html and on new top-1 entries on leaderboard.html. Snippets are pre-rendered as German neural-TTS MP3s (numbers 0-99 spoken naturally as "vierzehn", "sechsundneunzig" etc.) and decoded into AudioBuffers once at page load, then chained gaplessly via start(when, offset, duration) — leading/trailing silence in each MP3 is detected and skipped so words flow without pauses. A floating speaker toggle persists in localStorage and doubles as the user gesture that unlocks the AudioContext on autoplay-restricted browsers (SmartTV, iOS Safari). Hundredths formatting mirrors the ESP's float-truncation via Math.fround so the announced value always matches the displayed string, even at hundredths boundaries where double/float rounding diverges. Preload runs at concurrency 2 with a 2 s start delay so the 107 MP3 fetches don't starve /api/data and freeze the live timer. Regenerator script: tools/generate-tts.py (requires edge-tts). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
309 lines
11 KiB
JavaScript
309 lines
11 KiB
JavaScript
// tts.js — Plays pre-rendered MP3 snippets from /tts/ gaplessly.
|
|
// Uses the Web Audio API: each MP3 is decoded once into an AudioBuffer,
|
|
// then chained playback is scheduled with start(when) so there is no
|
|
// JS-callback gap between snippets. Persists enable-state in
|
|
// localStorage. The toggle button doubles as the user gesture that
|
|
// unlocks the AudioContext on browsers with autoplay restrictions
|
|
// (iOS Safari, many SmartTV browsers).
|
|
(() => {
'use strict';

// Base URL path all snippet MP3s are served from.
const BASE = '/tts/';
// localStorage key persisting the speaker toggle across page loads.
const STORAGE_KEY = 'aqm_tts_enabled';

// All snippets shipped in /tts/. Listed explicitly so we can
// preload (and decode) them upfront before the first announcement.
// Numbers 0-99 are spoken as natural German words ("vierzehn",
// "sechsundneunzig"), produced by tools/generate-tts.py — together
// with the seven word snippets that is 107 files.
const FILES = [
'bereit', 'komma', 'minute', 'minuten',
'neue_zeit', 'sekunden', 'und',
];
for (let i = 0; i < 100; i++) FILES.push(String(i));

// Whether announcements are on; restored from the previous visit.
let enabled = localStorage.getItem(STORAGE_KEY) === '1';
// Shared AudioContext, created lazily by ensureContext().
let audioCtx = null;
// For each name we cache { buffer, offset, duration } where offset
// and duration mark the non-silent region of the decoded buffer.
// Edge-TTS pads each MP3 with ~50-150 ms of leading/trailing
// silence, which would otherwise stack up between snippets.
const buffers = Object.create(null);
// AudioBufferSourceNodes of the announcement currently scheduled.
let scheduled = [];
// Memoized result of preload() so the fetches only ever run once.
let preloadPromise = null;
// The floating speaker toggle <button>, once injected into the DOM.
let btn = null;

// Linear amplitude threshold below which a sample counts as silence
// (~ -46 dBFS). Slightly higher than typical decoder noise floor.
const SILENCE_THRESHOLD = 0.005;
// Tiny grace at the start so we don't chop a soft consonant attack.
const LEAD_GRACE_S = 0.01;
|
|
|
|
// Locates the audible span of a decoded AudioBuffer.
//
// Scans every channel for the first and last sample whose magnitude
// reaches SILENCE_THRESHOLD and returns { offset, duration } in
// seconds, suitable for start(when, offset, duration). A small
// lead-in grace (LEAD_GRACE_S) is kept so soft consonant attacks
// aren't clipped. If no sample clears the threshold the full buffer
// is returned unchanged (shouldn't happen for our snippets, but be
// safe).
function findNonSilentRange(buffer) {
const { numberOfChannels, length, sampleRate } = buffer;
let head = length; // earliest loud sample seen across all channels
let tail = -1;     // latest loud sample seen across all channels

for (let ch = 0; ch < numberOfChannels; ch++) {
const samples = buffer.getChannelData(ch);
let i = 0;
while (i < head && Math.abs(samples[i]) < SILENCE_THRESHOLD) i++;
if (i < head) head = i;
let j = length - 1;
while (j > tail && Math.abs(samples[j]) < SILENCE_THRESHOLD) j--;
if (j > tail) tail = j;
}

// Everything below threshold — fall back to the whole buffer.
if (tail < 0 || head >= length) {
return { offset: 0, duration: buffer.duration };
}

const graceSamples = Math.floor(LEAD_GRACE_S * sampleRate);
const firstSample = Math.max(0, head - graceSamples);
const endSample = Math.min(length, tail + 1);
return {
offset: firstSample / sampleRate,
duration: (endSample - firstSample) / sampleRate,
};
}
|
|
|
|
// Translates a duration in seconds into the ordered snippet names
// that speak it naturally in German.
//
// Examples:
//   14.96  -> ["14","komma","96","sekunden"]       "vierzehn Komma sechsundneunzig Sekunden"
//   14.05  -> ["14","komma","0","5","sekunden"]    "vierzehn Komma null fünf Sekunden"
//   14.00  -> ["14","sekunden"]                    "vierzehn Sekunden"
//   65.96  -> ["minute","und","5","komma","96","sekunden"]
//   125.50 -> ["2","minuten","und","5","komma","50","sekunden"]
//
// Hundredths < 10 are spoken digit-by-digit ("null fünf" for .05)
// so .05 stays distinguishable from .50 ("fünfzig"); >= 10 is one
// German word ("sechsundneunzig" for .96).
//
// The math replicates the server's exact float-based formatting
// (databasebackend.h, gamemodes.h):
//   float s = timeMs / 1000.0;
//   int totalSec = (int)s;
//   int hundredths = (int)((s - totalSec) * 100);
// C++ `float` is single precision; JS Number is double. Near a
// hundredths boundary (e.g. 14.090) the two floor differently —
// server shows "14.08" while naive double math would say "14.09".
// Math.fround forces single-precision rounding at each step so the
// chain matches the server bit-for-bit.
function timeToSeq(seconds) {
const clamped = Math.max(0, Number(seconds) || 0);

// Single-precision chain, matching the ESP exactly.
const f32 = Math.fround(clamped);
const wholeSec = Math.trunc(f32);
const mins = Math.floor(wholeSec / 60);
const secs = wholeSec % 60;
const scaled = Math.fround(Math.fround(f32 - wholeSec) * 100);
const hh = Math.min(99, Math.max(0, Math.trunc(scaled)));

const seq = [];

if (mins === 1) {
seq.push('minute'); // snippet already says "eine Minute"
} else if (mins > 1) {
seq.push(String(mins), 'minuten');
}
if (mins > 0 && (secs > 0 || hh > 0)) seq.push('und');

if (secs > 0 || hh > 0 || mins === 0) {
seq.push(String(secs));
if (hh > 0) {
seq.push('komma');
// Leading zero spoken digit-by-digit below 10.
if (hh < 10) seq.push('0');
seq.push(String(hh));
}
seq.push('sekunden');
}

return seq;
}
|
|
|
|
// Parses a displayed time string ("12.34" or "01:23.45") into
// seconds. Returns 0 for null/empty/unparseable input instead of
// letting NaN escape, so a malformed cell can never produce a broken
// announcement. timeToSeq() then re-rounds the value through
// Math.fround to match the server's float math, so parseFloat drift
// is harmless here.
function parseFormattedTime(str) {
if (!str) return 0;
if (str.includes(':')) {
const [mm, rest] = str.split(':');
const total = Number.parseInt(mm, 10) * 60 + Number.parseFloat(rest);
// Guard this path like the plain path below: previously garbage
// such as ":23" or "2:" propagated NaN to the caller.
return Number.isFinite(total) ? total : 0;
}
const secs = Number.parseFloat(str);
return Number.isFinite(secs) ? secs : 0;
}
|
|
|
|
// Lazily creates the shared AudioContext (idempotent). Leaves
// audioCtx null and logs a warning when the Web Audio API is
// unavailable, so callers can simply bail on a falsy audioCtx.
function ensureContext() {
if (audioCtx) return;
const Impl = window.AudioContext || window.webkitAudioContext;
if (Impl) {
audioCtx = new Impl();
} else {
console.warn('TTS: Web Audio API not supported');
}
}
|
|
|
|
// Limit concurrent fetches: the ESP's async web server only serves
// a handful of requests well in parallel, and the browser's 6-per-
// host pool would otherwise starve the 1 s /api/data poll while
// 107 MP3s come in. Two parallel fetches finishes the preload in
// ~2 s without holding up the live timer.
const PRELOAD_CONCURRENCY = 2;

// Fetches /tts/<name>.mp3, decodes it, trims its silent padding and
// caches the result in buffers[name]. Throws on network, HTTP or
// decode failure so preload() can log and continue with the rest.
async function fetchAndStore(name) {
const res = await fetch(BASE + name + '.mp3');
// fetch() resolves even for HTTP errors; without this check a 404
// error page would be handed to decodeAudioData and fail with a
// confusing decode error (or decode as garbage).
if (!res.ok) {
throw new Error('HTTP ' + res.status + ' for ' + name + '.mp3');
}
const arr = await res.arrayBuffer();
// Older Safari only supports the callback form of decodeAudioData.
const buffer = await new Promise((resolve, reject) => {
const p = audioCtx.decodeAudioData(arr, resolve, reject);
if (p && typeof p.then === 'function') p.then(resolve, reject);
});
const { offset, duration } = findNonSilentRange(buffer);
buffers[name] = { buffer, offset, duration };
}
|
|
|
|
// Starts (at most once) the fetch+decode of every snippet in FILES,
// using PRELOAD_CONCURRENCY parallel workers over a shared cursor.
// Individual failures are logged and skipped so one missing MP3
// can't block the rest. Returns the memoized promise; repeat calls
// reuse it. Resolves immediately when Web Audio is unavailable.
function preload() {
if (preloadPromise) return preloadPromise;
ensureContext();
if (!audioCtx) return Promise.resolve();

// Shared cursor; the post-increment is synchronous, so two workers
// can never grab the same index.
let next = 0;
const drain = async () => {
while (next < FILES.length) {
const name = FILES[next++];
try {
await fetchAndStore(name);
} catch (e) {
console.warn('TTS preload failed:', name, e);
}
}
};

const pool = Array.from({ length: PRELOAD_CONCURRENCY }, () => drain());
preloadPromise = Promise.all(pool);
return preloadPromise;
}
|
|
|
|
// Cancels any announcement currently scheduled or playing. Calling
// stop() on a source that already finished throws on some browsers,
// hence the per-source try/catch.
function stop() {
for (const src of scheduled) {
try {
src.stop();
} catch (_) {
// source already ended — nothing to do
}
}
scheduled = [];
}
|
|
|
|
// Schedules a snippet-name sequence for gapless playback on the
// audio clock. No-op when disabled, empty, or Web Audio is missing.
// Any announcement still in flight is cut off first. Snippets whose
// buffer isn't decoded yet (preload still running or failed) are
// skipped with a warning rather than aborting the whole sequence.
function play(seq) {
if (!enabled || !seq || !seq.length) return;
ensureContext();
if (!audioCtx) return;

// Browsers may suspend the context until a user gesture; resume()
// is a no-op when it is already running.
if (audioCtx.state === 'suspended') {
audioCtx.resume().catch(() => {});
}

stop();

let cursor = audioCtx.currentTime;
for (const name of seq) {
const snippet = buffers[name];
if (!snippet) {
console.warn('TTS buffer missing:', name);
continue;
}
const source = audioCtx.createBufferSource();
source.buffer = snippet.buffer;
source.connect(audioCtx.destination);
// start(when, offset, duration) plays only the non-silent slice
// identified during preload, so leading/trailing silence in each
// snippet doesn't stack up between words.
source.start(cursor, snippet.offset, snippet.duration);
scheduled.push(source);
cursor += snippet.duration;
}
}
|
|
|
|
// Flips announcements on/off, persists the choice in localStorage
// and syncs the toggle button. Turning off also cancels anything
// currently being announced.
function setEnabled(on) {
enabled = Boolean(on);
localStorage.setItem(STORAGE_KEY, enabled ? '1' : '0');
if (!enabled) stop();
updateToggleUI();
}
|
|
|
|
// Syncs the floating toggle's icon, tooltip and ARIA state with the
// current enabled flag. No-op until the button has been injected.
function updateToggleUI() {
if (!btn) return;
if (enabled) {
btn.textContent = '🔊';
btn.title = 'Ansagen deaktivieren';
} else {
btn.textContent = '🔇';
btn.title = 'Ansagen aktivieren';
}
btn.setAttribute('aria-pressed', enabled ? 'true' : 'false');
}
|
|
|
|
// Creates and mounts the floating speaker toggle (once). The click
// on it is the user gesture that lets us create and resume the
// AudioContext on autoplay-restricted browsers, so enabling also
// preloads all snippets and plays a short "bereit" acknowledgement.
function injectToggle() {
if (btn || !document.body) return;

btn = document.createElement('button');
btn.id = 'tts-toggle';
btn.style.cssText = [
'position:fixed;bottom:14px;right:14px;z-index:9998;',
'width:54px;height:54px;border-radius:50%;border:none;',
'background:rgba(0,0,0,0.55);color:#fff;font-size:24px;',
'cursor:pointer;display:flex;align-items:center;justify-content:center;',
'box-shadow:0 2px 10px rgba(0,0,0,0.35);',
].join('');

btn.addEventListener('click', async () => {
const turningOn = !enabled;
setEnabled(turningOn);
if (!turningOn) return;
// This click is the unlocking gesture — preload now, then
// confirm audibly so the user knows announcements are live.
await preload();
play(['bereit']);
});

document.body.appendChild(btn);
updateToggleUI();
}
|
|
|
|
// Background decode for returning visitors with TTS already on. On a
// fresh page load the AudioContext is created in suspended state (no
// audio yet, just decoding — allowed without a gesture), so the first
// real announcement after the user clicks anywhere is already gapless.
// The start is deferred 2 s so the initial render and the first
// /api/data poll go through unimpeded — otherwise the ESP's web
// server is busy serving MP3s and the live timer freezes for seconds.
function eagerPreload() {
if (enabled) setTimeout(preload, 2000);
}
|
|
|
|
// Wire up the toggle and the deferred preload once the DOM is usable;
// run immediately if this script was loaded after DOMContentLoaded
// already fired.
if (document.readyState === 'loading') {
document.addEventListener('DOMContentLoaded', () => {
injectToggle();
eagerPreload();
});
} else {
injectToggle();
eagerPreload();
}
|
|
|
|
// Public API consumed by index.html and leaderboard.html.
// sayTime(sec) is the main entry point: it prefixes the "neue_zeit"
// snippet and speaks the given time via timeToSeq().
window.tts = {
isEnabled: () => enabled,
setEnabled,
play,
stop,
timeToSeq,
parseFormattedTime,
sayTime: (sec) => play(['neue_zeit', ...timeToSeq(sec)]),
};
})();
|