Files
AquaMasterMQTT/data/tts.js
Carsten Graf 2d4831349b
All checks were successful
/ build (push) Successful in 3m58s
feat(tts): browser-based MP3 announcer for finished times
Plays the just-finished time via Web Audio API on index.html and on
new top-1 entries on leaderboard.html. Snippets are pre-rendered as
German neural-TTS MP3s (numbers 0-99 spoken naturally as
"vierzehn", "sechsundneunzig" etc.) and decoded into AudioBuffers
once at page load, then chained gaplessly via start(when, offset,
duration) — leading/trailing silence in each MP3 is detected and
skipped so words flow without pauses. A floating speaker toggle
persists in localStorage and doubles as the user gesture that
unlocks the AudioContext on autoplay-restricted browsers (SmartTV,
iOS Safari).

Hundredths formatting mirrors the ESP's float-truncation via
Math.fround so the announced value always matches the displayed
string, even at hundredths boundaries where double/float rounding
diverges. Preload runs at concurrency 2 with a 2 s start delay so
the 107 MP3 fetches don't starve /api/data and freeze the live
timer.

Regenerator script: tools/generate-tts.py (requires edge-tts).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 20:21:49 +02:00

309 lines
11 KiB
JavaScript

// tts.js — Plays pre-rendered MP3 snippets from /tts/ gaplessly.
// Uses the Web Audio API: each MP3 is decoded once into an AudioBuffer,
// then chained playback is scheduled with start(when) so there is no
// JS-callback gap between snippets. Persists enable-state in
// localStorage. The toggle button doubles as the user gesture that
// unlocks the AudioContext on browsers with autoplay restrictions
// (iOS Safari, many SmartTV browsers).
(() => {
'use strict';
const BASE = '/tts/'; // URL prefix every snippet MP3 is fetched from
const STORAGE_KEY = 'aqm_tts_enabled'; // localStorage key persisting the on/off toggle
// All snippets shipped in /tts/. Listed explicitly so we can
// preload (and decode) them upfront before the first announcement.
// Numbers 0-99 are spoken as natural German words ("vierzehn",
// "sechsundneunzig"), produced by tools/generate-tts.py.
const FILES = [
'bereit', 'komma', 'minute', 'minuten',
'neue_zeit', 'sekunden', 'und',
];
// Append the number snippets "0".."99" (107 files in total).
for (let i = 0; i < 100; i++) FILES.push(String(i));
let enabled = localStorage.getItem(STORAGE_KEY) === '1';
let audioCtx = null; // created lazily by ensureContext()
// For each name we cache { buffer, offset, duration } where offset
// and duration mark the non-silent region of the decoded buffer.
// Edge-TTS pads each MP3 with ~50-150 ms of leading/trailing
// silence, which would otherwise stack up between snippets.
const buffers = Object.create(null);
let scheduled = []; // AudioBufferSourceNodes of the announcement currently playing
let preloadPromise = null; // memoized by preload() so it runs at most once
let btn = null; // floating speaker toggle, created by injectToggle()
// Linear amplitude threshold below which a sample counts as silence
// (~ -46 dBFS). Slightly higher than typical decoder noise floor.
const SILENCE_THRESHOLD = 0.005;
// Tiny grace at the start so we don't chop a soft consonant attack.
const LEAD_GRACE_S = 0.01;
// Scans all channels of an AudioBuffer to find the first and last
// sample whose absolute value exceeds `threshold`, returning
// {offset, duration} in seconds for use with start(when, offset,
// duration). Falls back to the full buffer if everything looks
// silent (shouldn't happen for our snippets, but be safe).
//
// threshold and graceS default to the module-level constants; they
// are parameters so the trim behavior can be tuned (or tested)
// without touching the module state.
function findNonSilentRange(buffer, threshold = SILENCE_THRESHOLD, graceS = LEAD_GRACE_S) {
  const channels = buffer.numberOfChannels;
  const len = buffer.length;
  const sampleRate = buffer.sampleRate;
  let firstHit = len;
  let lastHit = -1;
  for (let c = 0; c < channels; c++) {
    const data = buffer.getChannelData(c);
    // Scan forward only up to the best hit so far — later channels
    // can only move firstHit earlier.
    for (let i = 0; i < firstHit; i++) {
      if (Math.abs(data[i]) >= threshold) {
        firstHit = i;
        break;
      }
    }
    // Likewise scan backward only down to the best lastHit so far.
    for (let i = len - 1; i > lastHit; i--) {
      if (Math.abs(data[i]) >= threshold) {
        lastHit = i;
        break;
      }
    }
  }
  if (lastHit < 0 || firstHit >= len) {
    // Entire buffer under the threshold: play it unmodified.
    return { offset: 0, duration: buffer.duration };
  }
  // Back the start off by the grace interval so soft attacks survive.
  const grace = Math.floor(graceS * sampleRate);
  const start = Math.max(0, firstHit - grace);
  const end = Math.min(len, lastHit + 1);
  return {
    offset: start / sampleRate,
    duration: (end - start) / sampleRate,
  };
}
// Builds the spoken-snippet sequence for a duration in seconds.
// Examples (all sound natural in German):
// 14.96 -> ["14","komma","96","sekunden"] "vierzehn Komma sechsundneunzig Sekunden"
// 14.05 -> ["14","komma","0","5","sekunden"] "vierzehn Komma null fünf Sekunden"
// 14.00 -> ["14","sekunden"] "vierzehn Sekunden"
// 65.96 -> ["minute","und","5","komma","96","sekunden"]
// "eine Minute und fünf Komma sechsundneunzig Sekunden"
// 125.50 -> ["2","minuten","und","5","komma","50","sekunden"]
//
// Note on hundredths < 10: the leading zero is spoken digit-by-
// digit ("null fünf" for .05) so .05 stays distinguishable from
// .50 ("fünfzig"). For >= 10 the value is spoken as a single
// German word ("sechsundneunzig" for .96).
function timeToSeq(seconds) {
const total = Math.max(0, Number(seconds) || 0);
// Replicate the server's exact float-based formatting so the
// announcement always matches what the user sees on screen.
// The ESP (databasebackend.h, gamemodes.h) does:
// float s = timeMs / 1000.0;
// int totalSec = (int)s;
// int hundredths = (int)((s - totalSec) * 100);
// C++ `float` is single precision (24-bit mantissa); JS Number
// is double precision. For times near a hundredths boundary
// (e.g. 14.090) the two give different floor results — server
// says "14.08" but a naive double calculation announces "14.09".
// Math.fround forces single-precision rounding at each step so
// the chain matches the server bit-for-bit.
const sFloat = Math.fround(total);
const totalSec = Math.trunc(sFloat);
const minutes = Math.floor(totalSec / 60);
const remSec = totalSec % 60;
const diffFloat = Math.fround(sFloat - totalSec);
const scaledFloat = Math.fround(diffFloat * 100);
let hundredths = Math.trunc(scaledFloat);
if (hundredths < 0) hundredths = 0;
if (hundredths > 99) hundredths = 99;
const out = [];
if (minutes > 0) {
if (minutes === 1) {
out.push('minute'); // "eine Minute"
} else {
out.push(String(minutes));
out.push('minuten');
}
if (remSec > 0 || hundredths > 0) out.push('und');
}
if (remSec > 0 || hundredths > 0 || minutes === 0) {
out.push(String(remSec));
if (hundredths > 0) {
out.push('komma');
if (hundredths < 10) {
out.push('0');
out.push(String(hundredths));
} else {
out.push(String(hundredths));
}
}
out.push('sekunden');
}
return out;
}
// "12.34" or "01:23.45" -> seconds. timeToSeq() then re-rounds
// the value through Math.fround to match the server's float math,
// so the parseFloat drift is harmless here.
// Returns 0 for empty/malformed input (previously the "mm:ss"
// branch could return NaN for strings like ":" — now both branches
// fall back to 0, matching the plain-seconds branch).
function parseFormattedTime(str) {
  if (!str) return 0;
  if (str.includes(':')) {
    const [mm, rest] = str.split(':');
    const mins = parseInt(mm, 10);
    const secs = parseFloat(rest);
    return (Number.isFinite(mins) ? mins : 0) * 60 + (Number.isFinite(secs) ? secs : 0);
  }
  return parseFloat(str) || 0;
}
// Creates the shared AudioContext on first use. No-op when it
// already exists or when the browser lacks Web Audio entirely
// (audioCtx then stays null and callers bail out).
function ensureContext() {
  if (audioCtx !== null) return;
  const AudioCtor = window.AudioContext || window.webkitAudioContext;
  if (!AudioCtor) {
    console.warn('TTS: Web Audio API not supported');
    return;
  }
  audioCtx = new AudioCtor();
}
// Limit concurrent fetches: the ESP's async web server only serves
// a handful of requests well in parallel, and the browser's 6-per-
// host pool would otherwise starve the 1 s /api/data poll while
// 107 MP3s come in. Two parallel fetches finishes the preload in
// ~2 s without holding up the live timer.
const PRELOAD_CONCURRENCY = 2;
// Fetches one snippet MP3, decodes it, locates its non-silent
// region and caches the result in `buffers`. Throws on any
// network/HTTP/decoding failure — preload() catches and logs per
// file so one bad snippet doesn't abort the rest.
async function fetchAndStore(name) {
  const res = await fetch(BASE + name + '.mp3');
  // fetch() resolves on 404/500 too; without this check the HTML
  // error page would be handed to decodeAudioData, which then fails
  // with an opaque decoder error instead of the real cause.
  if (!res.ok) throw new Error('HTTP ' + res.status + ' for ' + name + '.mp3');
  const arr = await res.arrayBuffer();
  // Older Safari only supports the callback form of decodeAudioData.
  const buffer = await new Promise((resolve, reject) => {
    const p = audioCtx.decodeAudioData(arr, resolve, reject);
    if (p && typeof p.then === 'function') p.then(resolve, reject);
  });
  const { offset, duration } = findNonSilentRange(buffer);
  buffers[name] = { buffer, offset, duration };
}
// Fetches and decodes every snippet with PRELOAD_CONCURRENCY
// workers pulling from a shared index. Memoized: repeated calls
// return the same promise. Resolves even if individual files fail
// (those are logged and simply missing from `buffers`).
function preload() {
  if (preloadPromise) return preloadPromise;
  ensureContext();
  if (!audioCtx) return Promise.resolve();
  let next = 0;
  const drain = async () => {
    while (next < FILES.length) {
      const name = FILES[next];
      next += 1;
      try {
        await fetchAndStore(name);
      } catch (e) {
        console.warn('TTS preload failed:', name, e);
      }
    }
  };
  const lanes = Array.from({ length: PRELOAD_CONCURRENCY }, () => drain());
  preloadPromise = Promise.all(lanes);
  return preloadPromise;
}
// Cancels any announcement that is still playing or scheduled.
function stop() {
  for (const src of scheduled) {
    try {
      src.stop();
    } catch (_) {
      // Source already ended or was never started — nothing to do.
    }
  }
  scheduled = [];
}
// Schedules the snippet sequence back-to-back on the audio clock.
// Cancels any announcement still in flight first. Names without a
// decoded buffer (preload still running or failed) are skipped with
// a warning.
function play(seq) {
  if (!enabled) return;
  if (!seq || seq.length === 0) return;
  ensureContext();
  if (!audioCtx) return;
  // Browsers may suspend the context until a user gesture. resume()
  // is a no-op if already running.
  if (audioCtx.state === 'suspended') {
    audioCtx.resume().catch(() => {});
  }
  stop();
  let cursor = audioCtx.currentTime;
  for (const name of seq) {
    const entry = buffers[name];
    if (!entry) {
      console.warn('TTS buffer missing:', name);
      continue;
    }
    const src = audioCtx.createBufferSource();
    src.buffer = entry.buffer;
    src.connect(audioCtx.destination);
    // start(when, offset, duration) plays only the non-silent slice
    // identified during preload, so padding silence in each snippet
    // doesn't stack up between words.
    src.start(cursor, entry.offset, entry.duration);
    scheduled.push(src);
    cursor += entry.duration;
  }
}
// Flips the enable state, persists it, silences any running
// announcement when turning off, and refreshes the toggle button.
function setEnabled(on) {
  enabled = Boolean(on);
  localStorage.setItem(STORAGE_KEY, enabled ? '1' : '0');
  if (!enabled) stop();
  updateToggleUI();
}
// Syncs the speaker button's icon, tooltip and ARIA state with
// `enabled`. Safe to call before the button exists.
function updateToggleUI() {
  if (!btn) return;
  if (enabled) {
    btn.textContent = '🔊';
    btn.title = 'Ansagen deaktivieren';
    btn.setAttribute('aria-pressed', 'true');
  } else {
    btn.textContent = '🔇';
    btn.title = 'Ansagen aktivieren';
    btn.setAttribute('aria-pressed', 'false');
  }
}
// Creates the floating speaker button (once) and wires its click
// handler. The click doubles as the user gesture that lets the
// AudioContext be created/resumed on autoplay-restricted browsers.
function injectToggle() {
  if (btn) return;
  if (!document.body) return;
  const el = document.createElement('button');
  el.id = 'tts-toggle';
  el.style.cssText =
    'position:fixed;bottom:14px;right:14px;z-index:9998;' +
    'width:54px;height:54px;border-radius:50%;border:none;' +
    'background:rgba(0,0,0,0.55);color:#fff;font-size:24px;' +
    'cursor:pointer;display:flex;align-items:center;justify-content:center;' +
    'box-shadow:0 2px 10px rgba(0,0,0,0.35);';
  el.addEventListener('click', async () => {
    const turnOn = !enabled;
    setEnabled(turnOn);
    if (!turnOn) return;
    // This click is the user gesture that unlocks audio. Preload
    // everything, then play a short acknowledgement snippet.
    await preload();
    play(['bereit']);
  });
  btn = el;
  document.body.appendChild(el);
  updateToggleUI();
}
// Kicks off background decoding on page load when announcements are
// already enabled from a previous visit. The AudioContext starts in
// suspended state (decoding needs no user gesture), so the first
// real announcement after the user clicks anywhere is already
// gapless. The 2 s delay lets the initial render and the first
// /api/data poll go through before the ESP's web server gets busy
// serving MP3s — otherwise the live timer freezes for seconds.
function eagerPreload() {
  if (!enabled) return;
  setTimeout(preload, 2000);
}
const boot = () => {
  injectToggle();
  eagerPreload();
};
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', boot);
} else {
  boot();
}
// Public API consumed by index.html and leaderboard.html.
window.tts = {
isEnabled: () => enabled,
setEnabled,
play,
stop,
timeToSeq,
parseFormattedTime,
// Plays the "neue_zeit" lead-in snippet followed by the spoken time.
sayTime: (sec) => play(['neue_zeit', ...timeToSeq(sec)]),
};
})();