"""Transcription engine: OpenAI GPT-4o Transcribe."""
import collections
import importlib.util
import os
import re
import tempfile
import threading
from meet_assistant import BASE_DIR
# audioop and speech_recognition are only needed for the server-side PulseAudio
# capture path (Linux workstation install). Browser capture passes WAV files
# directly, so both are imported lazily inside OpenAIEngine.transcribe_audio.
# ── Hallucination filter (GPT-4o transcribe ghosts on silent audio) ────────────
# Allow Latin + extended Latin scripts (covers all 10 supported European languages).
# Matches text consisting solely of Latin-script letters, digits, whitespace
# and punctuation. Anything else (e.g. CJK or Cyrillic output on silent audio)
# fails the match. Besides ASCII the class admits:
#   U+00A1-U+024F  Latin-1 punctuation (¡ ¿ « » £ °) + accented/extended letters
#   U+1E00-U+1EFF  Latin Extended Additional
#   U+2010-U+2027  typographic dashes, curly quotes, bullets, ellipsis
#   U+2030-U+203A  per-mille, primes, single guillemets
#   U+20AC         euro sign
# The ASCII punctuation list includes `$` so that prices are not rejected.
_LATIN_RE = re.compile(
    r'^['
    r'a-zA-Z0-9\s'
    r'\u00a1-\u024f'
    r'\u1e00-\u1eff'
    r'\u2010-\u2027'
    r'\u2030-\u203a'
    r'\u20ac'
    r'\.\,\!\?\;\:\-\'\"\(\)\[\]\{\}\@\#\$\%\&\*\/\\\_\+\=\~\`\^\<\>\|'
    r']+$'
)
# Low-content single words / YouTube-style closing phrases that the model
# hallucinates on silent or music-only audio, regardless of language.
# Exact-match blocklist; _is_hallucination compares the stripped, lower-cased
# transcript against these entries.
_HALLUCINATIONS = {
    # Bare tokens and punctuation-only output
    "",
    ".",
    "...",
    "i",
    "the",
    "you",
    # "context" label variants
    "context",
    "context:",
    # Video-style sign-offs and credits
    "bye",
    "bye.",
    "subscribe",
    "subtitle by",
    "subtitles by",
    "thank you",
    "thank you.",
    "thanks",
    "thanks for watching.",
}
def _is_hallucination(text):
    """Return True when *text* should be discarded as a ghost transcript.

    A transcript is rejected when, after stripping surrounding whitespace, it
    is shorter than three characters, contains any character outside the
    ``_LATIN_RE`` whitelist, equals a ``_HALLUCINATIONS`` entry
    (case-insensitively), or begins with "context" (case-insensitively).
    """
    cleaned = text.strip()
    # Length is checked first so very short strings never reach the regex.
    if len(cleaned) < 3 or _LATIN_RE.match(cleaned) is None:
        return True
    lowered = cleaned.lower()
    return lowered in _HALLUCINATIONS or lowered.startswith("context")
# ── Rolling prompt context ─────────────────────────────────────────────────────
# Keep the most recent successful transcriptions and pass them as the `prompt`
# argument on each call so the model stays consistent with names, jargon, and
# speaking style. OpenAI's speech-to-text guide says only the final 224 tokens
# of a prompt are considered, so we keep ~400 chars.
# Shared state for the rolling prompt context.
_context_lock = threading.Lock()         # held around every access to _context
_context = collections.deque(maxlen=10)  # newest transcripts; the oldest is dropped past 10
_CONTEXT_MAX_CHARS = 400                 # character budget for the context part of the prompt
def _build_prompt(language):
    """Build the ``prompt`` string for the next transcription request.

    Args:
        language: Language identifier inserted verbatim into a
            "Speech in <language>." hint; pass a falsy value to omit the hint.

    Returns:
        The hint followed by the tail of the remembered transcripts, the tail
        alone when there is no hint, the hint alone when nothing has been
        remembered, or ``None`` when there is neither. The context part is at
        most ``_CONTEXT_MAX_CHARS`` characters and never starts with a
        fragment of a word that was cut in half by the truncation.
    """
    hint = f"Speech in {language}." if language else ""
    with _context_lock:
        # Hold the lock only long enough to snapshot the deque.
        recent = " ".join(_context)

    overflow = len(recent) - _CONTEXT_MAX_CHARS
    if overflow > 0:
        # Tail-truncate to fit the budget.
        tail = recent[overflow:]
        cut_mid_word = (
            bool(tail)
            and not tail[0].isspace()
            and not recent[overflow - 1].isspace()
        )
        if cut_mid_word:
            # Drop the leading word fragment; keep the tail untouched when it
            # is one unbroken run with no whitespace to split on.
            pieces = tail.split(None, 1)
            if len(pieces) == 2:
                tail = pieces[1]
        recent = tail.lstrip()

    if not recent:
        return hint or None
    return f"{hint} {recent}".strip() if hint else recent
def _remember(text):
    """Append *text* to the rolling context while holding the context lock.

    ``_context`` is a bounded deque, so once it is full each append drops the
    oldest remembered transcript.
    """
    with _context_lock:
        _context.append(text)
def reset_context():
    """Forget every remembered transcript (called when the user hits Clear).

    Empties the rolling context under the context lock, so the next prompt is
    built without any earlier transcript text.
    """
    with _context_lock:
        _context.clear()
# ── Key resolution: env var → secure/ file ─────────────────────────────────────
def _read_key():
    """Resolve the OpenAI API key.

    Lookup order:
      1. the ``OPENAI_API_KEY`` environment variable;
      2. the file ``<BASE_DIR>/secure/chatgpt``.

    Surrounding whitespace is stripped from either source. A blank or
    whitespace-only environment variable counts as unset, so the key file is
    still consulted.

    Returns:
        The key string, or ``None`` when neither source yields a non-empty key.
    """
    env_key = os.environ.get("OPENAI_API_KEY", "").strip()
    if env_key:
        return env_key

    path = os.path.join(BASE_DIR, "secure", "chatgpt")
    if not os.path.isfile(path):
        return None
    # Context manager closes the handle promptly instead of leaving it to GC.
    with open(path, encoding="utf-8") as key_file:
        return key_file.read().strip() or None
# ── Engine detection ───────────────────────────────────────────────────────────
# Maps engine id -> human-readable label. An engine is listed only when its
# package can be located and an API key resolves at import time.
AVAILABLE_ENGINES = {}
if importlib.util.find_spec("openai") is not None:
    # Key lookup runs only when the package is present, as before.
    if _read_key():
        AVAILABLE_ENGINES["openai"] = "GPT-4o Transcribe"
# ── Engine class ───────────────────────────────────────────────────────────────
class OpenAIEngine:
    """Transcription engine that sends audio to OpenAI's transcription API.

    Both public entry points return a ``(text, language)`` tuple, or
    ``(None, None)`` when the result is empty or is filtered out by
    ``_is_hallucination``. Accepted transcripts are added to the rolling
    prompt context via ``_remember``.
    """

    # Model name sent with every transcription request.
    MODEL = "gpt-4o-transcribe"
    # Linear gain applied to server-captured audio before it is uploaded.
    INPUT_GAIN = 3.0

    def __init__(self):
        """Create the API client.

        Raises:
            ImportError: If the ``openai`` package is not installed.
            RuntimeError: If no API key can be resolved by ``_read_key``.
        """
        # Imported here so this module can be imported without the openai package.
        from openai import OpenAI

        key = _read_key()
        if not key:
            raise RuntimeError(
                "No OpenAI API key found. "
                "Set OPENAI_API_KEY in .env (see .env.example), "
                "or save your key to secure/chatgpt"
            )
        self.client = OpenAI(api_key=key)

    def transcribe_audio(self, audio_data, language=None):
        """Amplify captured audio, write it to a temporary WAV and transcribe it.

        Args:
            audio_data: Object exposing ``get_raw_data()``, ``sample_rate`` and
                ``sample_width`` (presumably a ``speech_recognition.AudioData``
                -- confirm against the caller).
            language: Optional language identifier forwarded to the API.

        Returns:
            ``(text, language)`` on success, ``(None, None)`` otherwise.
        """
        # Deferred: only the server-side capture path needs these modules.
        # NOTE: ``audioop`` was removed from the standard library in Python 3.13.
        import audioop
        import speech_recognition as sr

        boosted = audioop.mul(
            audio_data.get_raw_data(), audio_data.sample_width, self.INPUT_GAIN
        )
        boosted_audio = sr.AudioData(
            boosted, audio_data.sample_rate, audio_data.sample_width
        )
        wav_bytes = boosted_audio.get_wav_data()

        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        wav_path = tmp.name
        try:
            # The write sits inside the try so a failed write cannot leak the
            # temp file. The handle is closed before _do reopens the path.
            with tmp:
                tmp.write(wav_bytes)
            return self._do(wav_path, language)
        finally:
            os.unlink(wav_path)

    def transcribe_wav(self, path, language=None):
        """Transcribe an existing audio file at *path*; the file is left in place.

        Returns:
            ``(text, language)`` on success, ``(None, None)`` otherwise.
        """
        return self._do(path, language)

    def _do(self, path, language):
        """Send the file at *path* to the API and post-process the result.

        Args:
            path: Path of the audio file to upload.
            language: Optional language identifier; sent as the ``language``
                request field and used for the prompt hint when truthy.

        Returns:
            ``(text, language)`` with the stripped transcript and the
            *language* argument as passed in, or ``(None, None)`` when the
            transcript is empty or rejected by ``_is_hallucination``.
        """
        kwargs = {"model": self.MODEL}
        if language:
            kwargs["language"] = language
        prompt = _build_prompt(language)
        if prompt:
            kwargs["prompt"] = prompt

        with open(path, "rb") as audio_file:
            response = self.client.audio.transcriptions.create(
                file=audio_file, **kwargs
            )

        text = response.text.strip() if response.text else None
        if not text or _is_hallucination(text):
            return None, None
        # Only accepted transcripts feed the rolling context.
        _remember(text)
        return text, language
# Engine id -> implementing class; the id matches the key used in AVAILABLE_ENGINES.
ENGINE_CLASSES = {"openai": OpenAIEngine}