Files
AquaMasterMQTT/data/tts.js
Carsten Graf 2d4831349b
All checks were successful
/ build (push) Successful in 3m58s
feat(tts): browser-based MP3 announcer for finished times
Plays the just-finished time via Web Audio API on index.html and on
new top-1 entries on leaderboard.html. Snippets are pre-rendered as
German neural-TTS MP3s (numbers 0-99 spoken naturally as
"vierzehn", "sechsundneunzig" etc.) and decoded into AudioBuffers
once at page load, then chained gaplessly via start(when, offset,
duration) — leading/trailing silence in each MP3 is detected and
skipped so words flow without pauses. A floating speaker toggle
persists in localStorage and doubles as the user gesture that
unlocks the AudioContext on autoplay-restricted browsers (SmartTV,
iOS Safari).

Hundredths formatting mirrors the ESP's float-truncation via
Math.fround so the announced value always matches the displayed
string, even at hundredths boundaries where double/float rounding
diverges. Preload runs at concurrency 2 with a 2 s start delay so
the 107 MP3 fetches don't starve /api/data and freeze the live
timer.

Regenerator script: tools/generate-tts.py (requires edge-tts).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 20:21:49 +02:00

309 lines
11 KiB
JavaScript

// tts.js — Plays pre-rendered MP3 snippets from /tts/ gaplessly.
// Uses the Web Audio API: each MP3 is decoded once into an AudioBuffer,
// then chained playback is scheduled with start(when) so there is no
// JS-callback gap between snippets. Persists enable-state in
// localStorage. The toggle button doubles as the user gesture that
// unlocks the AudioContext on browsers with autoplay restrictions
// (iOS Safari, many SmartTV browsers).
(() => {
'use strict';
const BASE = '/tts/'; // URL prefix every snippet MP3 is fetched from
const STORAGE_KEY = 'aqm_tts_enabled'; // localStorage key persisting the on/off toggle
// All snippets shipped in /tts/. Listed explicitly so we can
// preload (and decode) them upfront before the first announcement.
// Numbers 0-99 are spoken as natural German words ("vierzehn",
// "sechsundneunzig"), produced by tools/generate-tts.py.
const FILES = [
'bereit', 'komma', 'minute', 'minuten',
'neue_zeit', 'sekunden', 'und',
];
// Append the number snippets "0".."99" (107 files in total).
for (let i = 0; i < 100; i++) FILES.push(String(i));
let enabled = localStorage.getItem(STORAGE_KEY) === '1';
let audioCtx = null; // created lazily by ensureContext()
// For each name we cache { buffer, offset, duration } where offset
// and duration mark the non-silent region of the decoded buffer.
// Edge-TTS pads each MP3 with ~50-150 ms of leading/trailing
// silence, which would otherwise stack up between snippets.
const buffers = Object.create(null);
let scheduled = []; // AudioBufferSourceNodes of the announcement currently playing
let preloadPromise = null; // memoized by preload() so it runs at most once
let btn = null; // floating speaker toggle, created by injectToggle()
// Linear amplitude threshold below which a sample counts as silence
// (~ -46 dBFS). Slightly higher than typical decoder noise floor.
const SILENCE_THRESHOLD = 0.005;
// Tiny grace at the start so we don't chop a soft consonant attack.
const LEAD_GRACE_S = 0.01;
// Scans all channels of an AudioBuffer to find the first and last
// sample whose absolute value exceeds `threshold`, returning
// {offset, duration} in seconds for use with start(when, offset,
// duration). Falls back to the full buffer if everything looks
// silent (shouldn't happen for our snippets, but be safe).
//
// threshold and graceS default to the module-level constants; they
// are parameters so the trim behavior can be tuned (or tested)
// without touching the module state.
function findNonSilentRange(buffer, threshold = SILENCE_THRESHOLD, graceS = LEAD_GRACE_S) {
  const channels = buffer.numberOfChannels;
  const len = buffer.length;
  const sampleRate = buffer.sampleRate;
  let firstHit = len;
  let lastHit = -1;
  for (let c = 0; c < channels; c++) {
    const data = buffer.getChannelData(c);
    // Scan forward only up to the best hit so far — later channels
    // can only move firstHit earlier.
    for (let i = 0; i < firstHit; i++) {
      if (Math.abs(data[i]) >= threshold) {
        firstHit = i;
        break;
      }
    }
    // Likewise scan backward only down to the best lastHit so far.
    for (let i = len - 1; i > lastHit; i--) {
      if (Math.abs(data[i]) >= threshold) {
        lastHit = i;
        break;
      }
    }
  }
  if (lastHit < 0 || firstHit >= len) {
    // Entire buffer under the threshold: play it unmodified.
    return { offset: 0, duration: buffer.duration };
  }
  // Back the start off by the grace interval so soft attacks survive.
  const grace = Math.floor(graceS * sampleRate);
  const start = Math.max(0, firstHit - grace);
  const end = Math.min(len, lastHit + 1);
  return {
    offset: start / sampleRate,
    duration: (end - start) / sampleRate,
  };
}
// Builds the spoken-snippet sequence for a duration in seconds.
// Examples (all sound natural in German):
// 14.96 -> ["14","komma","96","sekunden"] "vierzehn Komma sechsundneunzig Sekunden"
// 14.05 -> ["14","komma","0","5","sekunden"] "vierzehn Komma null fünf Sekunden"
// 14.00 -> ["14","sekunden"] "vierzehn Sekunden"
// 65.96 -> ["minute","und","5","komma","96","sekunden"]
// "eine Minute und fünf Komma sechsundneunzig Sekunden"
// 125.50 -> ["2","minuten","und","5","komma","50","sekunden"]
//
// Note on hundredths < 10: the leading zero is spoken digit-by-
// digit ("null fünf" for .05) so .05 stays distinguishable from
// .50 ("fünfzig"). For >= 10 the value is spoken as a single
// German word ("sechsundneunzig" for .96).
function timeToSeq(seconds) {
const total = Math.max(0, Number(seconds) || 0);
// Replicate the server's exact float-based formatting so the
// announcement always matches what the user sees on screen.
// The ESP (databasebackend.h, gamemodes.h) does:
// float s = timeMs / 1000.0;
// int totalSec = (int)s;
// int hundredths = (int)((s - totalSec) * 100);
// C++ `float` is single precision (24-bit mantissa); JS Number
// is double precision. For times near a hundredths boundary
// (e.g. 14.090) the two give different floor results — server
// says "14.08" but a naive double calculation announces "14.09".
// Math.fround forces single-precision rounding at each step so
// the chain matches the server bit-for-bit.
const sFloat = Math.fround(total);
const totalSec = Math.trunc(sFloat);
const minutes = Math.floor(totalSec / 60);
const remSec = totalSec % 60;
const diffFloat = Math.fround(sFloat - totalSec);
const scaledFloat = Math.fround(diffFloat * 100);
let hundredths = Math.trunc(scaledFloat);
if (hundredths < 0) hundredths = 0;
if (hundredths > 99) hundredths = 99;
const out = [];
if (minutes > 0) {
if (minutes === 1) {
out.push('minute'); // "eine Minute"
} else {
out.push(String(minutes));
out.push('minuten');
}
if (remSec > 0 || hundredths > 0) out.push('und');
}
if (remSec > 0 || hundredths > 0 || minutes === 0) {
out.push(String(remSec));
if (hundredths > 0) {
out.push('komma');
if (hundredths < 10) {
out.push('0');
out.push(String(hundredths));
} else {
out.push(String(hundredths));
}
}
out.push('sekunden');
}
return out;
}
// "12.34" or "01:23.45" -> seconds. timeToSeq() then re-rounds
// the value through Math.fround to match the server's float math,
// so the parseFloat drift is harmless here.
// Returns 0 for empty/malformed input (previously the "mm:ss"
// branch could return NaN for strings like ":" — now both branches
// fall back to 0, matching the plain-seconds branch).
function parseFormattedTime(str) {
  if (!str) return 0;
  if (str.includes(':')) {
    const [mm, rest] = str.split(':');
    const mins = parseInt(mm, 10);
    const secs = parseFloat(rest);
    return (Number.isFinite(mins) ? mins : 0) * 60 + (Number.isFinite(secs) ? secs : 0);
  }
  return parseFloat(str) || 0;
}
// Creates the shared AudioContext on first use. No-op when it
// already exists or when the browser lacks Web Audio entirely
// (audioCtx then stays null and callers bail out).
function ensureContext() {
  if (audioCtx !== null) return;
  const AudioCtor = window.AudioContext || window.webkitAudioContext;
  if (!AudioCtor) {
    console.warn('TTS: Web Audio API not supported');
    return;
  }
  audioCtx = new AudioCtor();
}
// Limit concurrent fetches: the ESP's async web server only serves
// a handful of requests well in parallel, and the browser's 6-per-
// host pool would otherwise starve the 1 s /api/data poll while
// 107 MP3s come in. Two parallel fetches finishes the preload in
// ~2 s without holding up the live timer.
const PRELOAD_CONCURRENCY = 2;
// Fetches one snippet MP3, decodes it, locates its non-silent
// region and caches the result in `buffers`. Throws on any
// network/HTTP/decoding failure — preload() catches and logs per
// file so one bad snippet doesn't abort the rest.
async function fetchAndStore(name) {
  const res = await fetch(BASE + name + '.mp3');
  // fetch() resolves on 404/500 too; without this check the HTML
  // error page would be handed to decodeAudioData, which then fails
  // with an opaque decoder error instead of the real cause.
  if (!res.ok) throw new Error('HTTP ' + res.status + ' for ' + name + '.mp3');
  const arr = await res.arrayBuffer();
  // Older Safari only supports the callback form of decodeAudioData.
  const buffer = await new Promise((resolve, reject) => {
    const p = audioCtx.decodeAudioData(arr, resolve, reject);
    if (p && typeof p.then === 'function') p.then(resolve, reject);
  });
  const { offset, duration } = findNonSilentRange(buffer);
  buffers[name] = { buffer, offset, duration };
}
// Fetches and decodes every snippet with PRELOAD_CONCURRENCY
// workers pulling from a shared index. Memoized: repeated calls
// return the same promise. Resolves even if individual files fail
// (those are logged and simply missing from `buffers`).
function preload() {
  if (preloadPromise) return preloadPromise;
  ensureContext();
  if (!audioCtx) return Promise.resolve();
  let next = 0;
  const drain = async () => {
    while (next < FILES.length) {
      const name = FILES[next];
      next += 1;
      try {
        await fetchAndStore(name);
      } catch (e) {
        console.warn('TTS preload failed:', name, e);
      }
    }
  };
  const lanes = Array.from({ length: PRELOAD_CONCURRENCY }, () => drain());
  preloadPromise = Promise.all(lanes);
  return preloadPromise;
}
// Cancels any announcement that is still playing or scheduled.
function stop() {
  for (const src of scheduled) {
    try {
      src.stop();
    } catch (_) {
      // Source already ended or was never started — nothing to do.
    }
  }
  scheduled = [];
}
// Schedules the snippet sequence back-to-back on the audio clock.
// Cancels any announcement still in flight first. Names without a
// decoded buffer (preload still running or failed) are skipped with
// a warning.
function play(seq) {
  if (!enabled) return;
  if (!seq || seq.length === 0) return;
  ensureContext();
  if (!audioCtx) return;
  // Browsers may suspend the context until a user gesture. resume()
  // is a no-op if already running.
  if (audioCtx.state === 'suspended') {
    audioCtx.resume().catch(() => {});
  }
  stop();
  let cursor = audioCtx.currentTime;
  for (const name of seq) {
    const entry = buffers[name];
    if (!entry) {
      console.warn('TTS buffer missing:', name);
      continue;
    }
    const src = audioCtx.createBufferSource();
    src.buffer = entry.buffer;
    src.connect(audioCtx.destination);
    // start(when, offset, duration) plays only the non-silent slice
    // identified during preload, so padding silence in each snippet
    // doesn't stack up between words.
    src.start(cursor, entry.offset, entry.duration);
    scheduled.push(src);
    cursor += entry.duration;
  }
}
// Flips the enable state, persists it, silences any running
// announcement when turning off, and refreshes the toggle button.
function setEnabled(on) {
  enabled = Boolean(on);
  localStorage.setItem(STORAGE_KEY, enabled ? '1' : '0');
  if (!enabled) stop();
  updateToggleUI();
}
// Syncs the speaker button's icon, tooltip and ARIA state with
// `enabled`. Safe to call before the button exists.
function updateToggleUI() {
  if (!btn) return;
  if (enabled) {
    btn.textContent = '🔊';
    btn.title = 'Ansagen deaktivieren';
    btn.setAttribute('aria-pressed', 'true');
  } else {
    btn.textContent = '🔇';
    btn.title = 'Ansagen aktivieren';
    btn.setAttribute('aria-pressed', 'false');
  }
}
// Creates the floating speaker button (once) and wires its click
// handler. The click doubles as the user gesture that lets the
// AudioContext be created/resumed on autoplay-restricted browsers.
function injectToggle() {
  if (btn) return;
  if (!document.body) return;
  const el = document.createElement('button');
  el.id = 'tts-toggle';
  el.style.cssText =
    'position:fixed;bottom:14px;right:14px;z-index:9998;' +
    'width:54px;height:54px;border-radius:50%;border:none;' +
    'background:rgba(0,0,0,0.55);color:#fff;font-size:24px;' +
    'cursor:pointer;display:flex;align-items:center;justify-content:center;' +
    'box-shadow:0 2px 10px rgba(0,0,0,0.35);';
  el.addEventListener('click', async () => {
    const turnOn = !enabled;
    setEnabled(turnOn);
    if (!turnOn) return;
    // This click is the user gesture that unlocks audio. Preload
    // everything, then play a short acknowledgement snippet.
    await preload();
    play(['bereit']);
  });
  btn = el;
  document.body.appendChild(el);
  updateToggleUI();
}
// Kicks off background decoding on page load when announcements are
// already enabled from a previous visit. The AudioContext starts in
// suspended state (decoding needs no user gesture), so the first
// real announcement after the user clicks anywhere is already
// gapless. The 2 s delay lets the initial render and the first
// /api/data poll go through before the ESP's web server gets busy
// serving MP3s — otherwise the live timer freezes for seconds.
function eagerPreload() {
  if (!enabled) return;
  setTimeout(preload, 2000);
}
const boot = () => {
  injectToggle();
  eagerPreload();
};
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', boot);
} else {
  boot();
}
// Public API consumed by index.html and leaderboard.html.
window.tts = {
isEnabled: () => enabled,
setEnabled,
play,
stop,
timeToSeq,
parseFormattedTime,
// Plays the "neue_zeit" lead-in snippet followed by the spoken time.
sayTime: (sec) => play(['neue_zeit', ...timeToSeq(sec)]),
};
})();