#!/usr/bin/env bash
set -euo pipefail

# Offline Chinese TTS via sherpa-onnx (Matcha + Vocos) and play immediately.
# Usage:
#   vits_speak.sh "text..."
#   vits_speak.sh --in text.txt
# Options:
#   --volume 0-100   (default 100)

# Project root: the parent of the directory holding this script, as an
# absolute path. Everything else is located relative to it.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
BASE_DIR="$(cd "$script_dir/.." && pwd)"

# Python virtualenv that provides sherpa_onnx, plus the model assets.
VENV="${BASE_DIR}/.venv_sherpa_tts"
sherpa_root="${BASE_DIR}/tts/sherpa_onnx"
MODEL_DIR="${sherpa_root}/matcha-icefall-zh-baker"
VOCOS="${sherpa_root}/vocos-22khz-univ.onnx"

# Command-line state; the argument parser overwrites these defaults.
IN_FILE=""        # path given via --in (empty when text is passed inline)
TEXT=""           # inline text to speak (empty when --in is used)
VOLUME_PCT="100"  # value for --volume, in percent

# Print an argument error on stderr and exit with the usage-error status (2).
arg_error() {
  echo "$1" >&2
  exit 2
}

# Parse the command line.
#   --in FILE      read the text from FILE instead of the positional argument
#   --volume N     playback volume in percent, integer 0-100
#   -h | --help    print the usage header found at the top of this script
#   TEXT           a single positional argument holding the text to speak
# Sets IN_FILE, TEXT and VOLUME_PCT; any further positional argument is an error.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --in)
      # Check explicitly: under `set -u` a missing value would otherwise die
      # with a cryptic "unbound variable" message.
      [[ $# -ge 2 && -n "$2" ]] || arg_error "--in requires a file path"
      IN_FILE="$2"; shift 2;;
    --volume)
      [[ $# -ge 2 ]] || arg_error "--volume requires a value (0-100)"
      # 10# forces base 10 so values such as "08" are not parsed as octal.
      if [[ ! "$2" =~ ^[0-9]{1,3}$ ]] || (( 10#$2 > 100 )); then
        arg_error "Invalid --volume: $2 (expected an integer 0-100)"
      fi
      VOLUME_PCT="$(( 10#$2 ))"; shift 2;;
    -h|--help)
      # Print only the leading comment block (skipping the shebang), with the
      # comment markers stripped, rather than dumping the script source.
      awk 'NR == 1 { next }
           /^#/ { seen = 1; sub(/^# ?/, ""); print; next }
           seen { exit }' "$0"
      exit 0;;
    *)
      if [[ -z "$TEXT" && -z "$IN_FILE" ]]; then
        TEXT="$1"; shift
      else
        echo "Unknown arg: $1" >&2
        exit 2
      fi
      ;;
  esac
done

# Decide which file holds the text to synthesize.
#   --in given : use that file directly (it is never deleted by this script).
#   otherwise  : write the inline text to a fresh temp file.
TMP_TXT=""
if [[ -n "$IN_FILE" ]]; then
  # Fail early with a clear message instead of a Python traceback later.
  if [[ ! -r "$IN_FILE" || -d "$IN_FILE" ]]; then
    echo "Cannot read input file: $IN_FILE" >&2
    exit 2
  fi
  TMP_TXT="$IN_FILE"
else
  TMP_TXT="$(mktemp)"
  # Remove the temp file on every exit path (synthesis failure, playback
  # failure, missing player), not only after a fully successful run.
  trap 'rm -f -- "$TMP_TXT"' EXIT
  printf '%s\n' "$TEXT" > "$TMP_TXT"
fi
# Timestamped output path next to the model files.
OUT_WAV="$BASE_DIR/tts/sherpa_onnx/out_$(date +%Y%m%d_%H%M%S).wav"

# Nothing to say: fail fast with a usage hint instead of loading the models.
if [[ -z "$IN_FILE" && -z "${TEXT//[[:space:]]/}" ]]; then
  echo "No text given. Usage: ${0##*/} \"text...\" | --in text.txt [--volume 0-100]" >&2
  exit 2
fi

PYTHON_BIN="$VENV/bin/python"
if [[ ! -x "$PYTHON_BIN" ]]; then
  echo "Python interpreter not found or not executable: $PYTHON_BIN" >&2
  exit 1
fi

# Run the synthesis program from stdin. Arguments:
#   1: text file to read   2: WAV file to write
#   3: Matcha model dir    4: Vocos vocoder .onnx
# The model locations are passed in so they follow the script-relative
# MODEL_DIR/VOCOS instead of a path hard-coded for one machine.
"$PYTHON_BIN" - "$TMP_TXT" "$OUT_WAV" "$MODEL_DIR" "$VOCOS" <<'PY'
"""Normalize Chinese text and synthesize it to a WAV file with sherpa-onnx."""
import re
import sys

import numpy as np
import sherpa_onnx
import soundfile as sf

in_path, out_path, model_dir, vocos = sys.argv[1:5]

# Undecodable bytes are dropped rather than aborting the whole run.
with open(in_path, 'r', encoding='utf-8', errors='ignore') as fh:
    text = fh.read().strip()
if not text:
    sys.exit(f"No text to synthesize in {in_path}")

# --- Preprocess for more natural Chinese TTS & fewer OOV tokens ---
_digit_map = {'0':'零','1':'一','2':'二','3':'三','4':'四','5':'五','6':'六','7':'七','8':'八','9':'九'}


def year_to_zh(m: re.Match) -> str:
    """Spell the matched year digit by digit, e.g. 2026 -> 二零二六."""
    y = m.group(1)
    return ''.join(_digit_map.get(ch, ch) for ch in y)


# 2026 -> 二零二六 when the number is followed by 年 or stands alone.
# No \b after 年: CJK characters and digits are \w, so "年\b" would fail on
# everyday text such as "2026年的" or "2026年1月" and leave the year unread.
text = re.sub(r'(?<!\d)((?:19|20)\d{2})(?=\s*年|\b)', year_to_zh, text)

# Replace book title brackets
text = text.replace('《', '“').replace('》', '”')

# Common proper nouns / acronyms -> Chinese
repl = {
    'BBC': '英国广播公司',
    'BP': '英国石油公司',
    'APEC': '亚太经合组织',
    'EU': '欧盟',
    'NATO': '北约',
    'UN': '联合国',
    'USA': '美国',
    'US': '美国',
    'UK': '英国',
}
for k, v in repl.items():
    # Avoid relying on \b because Chinese characters count as \w in Unicode.
    text = re.sub(rf'(?<![A-Za-z]){re.escape(k)}(?![A-Za-z])', v, text)

# Specific names (simple, robust)
text = re.sub(r'Emmanuel\s+Macron', '马克龙', text)

# Reduce potential tokenization quirks (e.g., rare pinyin tokens)
text = text.replace('谁', '哪位')

# Add clearer breathing points: break lines after sentence-ending punctuation.
# An ASCII '.' directly followed by a digit is a decimal point ("3.14"),
# not a sentence end, so it must not be split.
text = re.sub(r'([。！？!?]|\.(?!\d))\s*', r"\1\n", text)
# Also encourage short pauses after key enumerators
text = re.sub(r'(第一条|第二条|第三条|第四条|第五条|第六条)\s*', r"\1\n", text)

# Clean repeated spaces/newlines
text = re.sub(r'[ \t]{2,}', ' ', text)
text = re.sub(r'\n{3,}', '\n\n', text)

# Matcha acoustic model + Vocos vocoder
tts = sherpa_onnx.OfflineTts(
    sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            matcha=sherpa_onnx.OfflineTtsMatchaModelConfig(
                acoustic_model=model_dir + "/model-steps-3.onnx",
                tokens=model_dir + "/tokens.txt",
                lexicon=model_dir + "/lexicon.txt",
                dict_dir=model_dir + "/dict",
                data_dir=model_dir,
                vocoder=vocos,
            )
        ),
        max_num_sentences=2,
    )
)

audio = tts.generate(text, sid=0, speed=1.0)
# An empty result means synthesis failed; do not write a zero-length WAV.
if len(audio.samples) == 0:
    sys.exit("TTS produced no audio; see the messages above")

# audio.samples is float32 in [-1, 1]
samples = np.asarray(audio.samples, dtype=np.float32)
sf.write(out_path, samples, audio.sample_rate)
print(out_path)
PY

# Apply the requested volume through every mixer CLI that is installed.
# Purely best-effort: a missing tool is skipped and a failing one is ignored,
# so volume problems never stop playback.
volume_spec="${VOLUME_PCT}%"
for mixer in pactl amixer; do
  command -v "$mixer" >/dev/null 2>&1 || continue
  case "$mixer" in
    pactl)
      pactl set-sink-volume @DEFAULT_SINK@ "$volume_spec" || true
      ;;
    amixer)
      amixer -q set Master "$volume_spec" || true
      ;;
  esac
done

# Play the WAV with the first available player, in order of preference:
# paplay, then mpv, then aplay. Exits with status 3 when none is installed.
player=""
for candidate in paplay mpv aplay; do
  if command -v "$candidate" >/dev/null 2>&1; then
    player="$candidate"
    break
  fi
done

case "$player" in
  paplay)
    paplay "$OUT_WAV"
    ;;
  mpv)
    # mpv and aplay are chatty; their output is discarded.
    mpv --no-video "$OUT_WAV" >/dev/null 2>&1
    ;;
  aplay)
    aplay "$OUT_WAV" >/dev/null 2>&1
    ;;
  *)
    echo "No audio player found (paplay/mpv/aplay). WAV at: $OUT_WAV" >&2
    exit 3
    ;;
esac

# Report where the generated WAV was written.
printf '%s\n' "$OUT_WAV"

# Delete the text file only when this script created it; a file supplied
# via --in belongs to the caller and is left alone.
[[ -n "$IN_FILE" ]] || rm -f "$TMP_TXT"
