#!/usr/bin/env bash
set -euo pipefail

# transcribe_whispercpp.sh — local speech-to-text via whisper.cpp (with ffmpeg decode)
#
# Usage:
#   transcribe_whispercpp.sh <audio-file> [--out /path/to/out.txt] [--model /path/to/ggml-*.bin]
#                              [--lang auto|zh|en] [--prompt "..."] [--srt] [--vtt] [--vad]
#
# Notes:
# - For Telegram/WhatsApp voice notes (ogg/opus), we first decode to 16kHz mono PCM WAV.
# - whisper.cpp binary expected at: /home/kuhnn/whisper.cpp/build/bin/whisper-cli

# Print the command-line synopsis on stderr and terminate the script with
# exit status 2 (the conventional "usage error" status). Never returns.
usage() {
  printf '%s\n' \
    'Usage:' \
    '  transcribe_whispercpp.sh <audio-file> [--out /path/to/out.txt] [--model /path/to/model.bin]' \
    '                           [--lang auto|zh|en] [--prompt "..."] [--srt] [--vtt] [--vad]' \
    >&2
  exit 2
}

# ---------------------------------------------------------------------------
# Main script body.
#
# Parses the command line, decodes the input audio to a temporary 16 kHz mono
# PCM WAV with ffmpeg, runs whisper-cli on it and prints the path of the
# resulting .txt transcript on stdout. All diagnostics go to stderr.
#
# Exit status: 2 for usage errors, 1 when the input file, a required tool or
# the model is missing. A failing ffmpeg / whisper-cli aborts the script
# because of `set -e` at the top of the file.
# ---------------------------------------------------------------------------

# Abort with a usage error unless option $1 is followed by a value.
#   $1 - the option name (used in the error message)
#   $2 - number of remaining arguments, including the option itself
# Without this check `shift 2` would fail on a trailing option and `set -e`
# would end the script silently.
require_value() {
  if (( $2 < 2 )); then
    echo "Missing value for $1" >&2
    usage
  fi
}

# Print path $1 with the extension of its last path component removed.
# Only the basename is inspected, so dots in directory names
# (e.g. /tmp/run.1/out) and the leading dot of a hidden file are left alone.
strip_ext() {
  local path="$1"
  local name="${path##*/}"
  if [[ "$name" == ?*.* ]]; then
    printf '%s\n' "${path%.*}"
  else
    printf '%s\n' "$path"
  fi
}

if [[ "${1:-}" == "" || "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
  usage
fi

# First positional argument is the audio file; everything after it is options.
IN="$1"; shift

WHISPER_CLI="/home/kuhnn/whisper.cpp/build/bin/whisper-cli"
DEFAULT_MODEL="/home/kuhnn/whisper.cpp/models/ggml-base.bin"
# Deliberately NOT named LANG: LANG is the POSIX locale variable and is
# normally exported, so assigning "auto"/"zh"/"en" to it would hand an invalid
# locale to ffmpeg and whisper-cli.
WHISPER_LANG="auto"
PROMPT=""
OUT=""
WANT_SRT=false
WANT_VTT=false
USE_VAD=false
MODEL="$DEFAULT_MODEL"

while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      usage;;
    --out)
      require_value "$1" "$#"; OUT="$2"; shift 2;;
    --model)
      require_value "$1" "$#"; MODEL="$2"; shift 2;;
    --lang|--language)
      require_value "$1" "$#"; WHISPER_LANG="$2"; shift 2;;
    --prompt)
      require_value "$1" "$#"; PROMPT="$2"; shift 2;;
    --srt)
      WANT_SRT=true; shift 1;;
    --vtt)
      WANT_VTT=true; shift 1;;
    --vad)
      # whisper.cpp VAD requires an explicit VAD model path (silero) via -vm/--vad-model.
      # We'll default to the repo's provided test silero model.
      USE_VAD=true; shift 1;;
    *)
      echo "Unknown arg: $1" >&2
      usage;;
  esac
done

# --- Preflight checks: fail early with a clear message -----------------------
if [[ ! -f "$IN" ]]; then
  echo "File not found: $IN" >&2
  exit 1
fi

if [[ ! -x "$WHISPER_CLI" ]]; then
  echo "Missing whisper-cli at $WHISPER_CLI" >&2
  exit 1
fi

if ! command -v ffmpeg >/dev/null 2>&1; then
  echo "Missing ffmpeg in PATH" >&2
  exit 1
fi

if [[ ! -f "$MODEL" ]]; then
  echo "Model not found: $MODEL" >&2
  exit 1
fi

# Derive OUT from the input name when --out was not given (or was empty).
if [[ "$OUT" == "" ]]; then
  OUT="$(strip_ext "$IN").txt"
fi

mkdir -p -- "$(dirname -- "$OUT")"

# Temporary decoded WAV; removed on every exit path by the EXIT trap.
TMP_WAV="$(mktemp -t oc_whispercpp_XXXXXX.wav)"
cleanup() { rm -f -- "$TMP_WAV"; }
trap cleanup EXIT

# Decode to wav (16k, mono)
ffmpeg -y -nostdin -hide_banner -loglevel error \
  -i "$IN" -ar 16000 -ac 1 -c:a pcm_s16le "$TMP_WAV"

# whisper.cpp writes outputs based on the -of prefix and appends the format
# extension itself, so the transcript always lands at "<prefix>.txt" even if
# --out named a different extension.
OUT_PREFIX="$(strip_ext "$OUT")"

ARGS=(-m "$MODEL" -l "$WHISPER_LANG" -otxt -of "$OUT_PREFIX")

if $WANT_SRT; then
  ARGS+=(-osrt)
fi
if $WANT_VTT; then
  ARGS+=(-ovtt)
fi
if $USE_VAD; then
  VAD_MODEL_DEFAULT="/home/kuhnn/whisper.cpp/models/for-tests-silero-v6.2.0-ggml.bin"
  if [[ -f "$VAD_MODEL_DEFAULT" ]]; then
    ARGS+=(--vad -vm "$VAD_MODEL_DEFAULT")
  else
    # If no VAD model is present, fall back gracefully to non-VAD.
    echo "Warning: VAD model not found at $VAD_MODEL_DEFAULT; continuing without VAD" >&2
  fi
fi
if [[ "$PROMPT" != "" ]]; then
  ARGS+=(--prompt "$PROMPT")
fi

# whisper-cli's stdout is discarded so that the only thing this script prints
# on stdout is the transcript path below.
"$WHISPER_CLI" "${ARGS[@]}" "$TMP_WAV" >/dev/null

# Print the main txt output path
printf '%s\n' "${OUT_PREFIX}.txt"
