# video-flow/modules/asr.py

"""
MatchMe Studio - ASR Module (Whisper via ShuBiaoBiao)
"""
import logging
import subprocess
from pathlib import Path
from typing import Optional
from openai import OpenAI

import config

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Single client instance shared by the API calls in this module; both the
# key and the endpoint URL are read from the project config.
client = OpenAI(
    base_url=config.SHUBIAOBIAO_BASE_URL,
    api_key=config.SHUBIAOBIAO_KEY,
)


def extract_audio_from_video(video_path: str) -> str:
    """Extract the audio track of a video into an MP3 file using ffmpeg.

    The output is written into ``config.TEMP_DIR`` as
    ``<video stem>_audio.mp3`` (mono, 16 kHz, encoded with libmp3lame).
    ffmpeg is invoked with ``-y``, so an existing file with that name is
    overwritten.

    Args:
        video_path: Path to the source video file.

    Returns:
        Path of the extracted audio file, as a string.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. The underlying
            ``subprocess.CalledProcessError`` is attached as ``__cause__``.
    """
    # Use a separate local instead of rebinding the ``str`` parameter.
    source = Path(video_path)
    audio_path = config.TEMP_DIR / f"{source.stem}_audio.mp3"

    cmd = [
        "ffmpeg", "-y",
        "-i", str(source),
        "-vn",  # No video
        "-acodec", "libmp3lame",
        "-ar", "16000",  # 16kHz for Whisper
        "-ac", "1",  # Mono
        str(audio_path),
    ]

    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # ffmpeg's stderr is not guaranteed to be valid UTF-8 (it can echo
        # file names or metadata). Decode leniently so a stray byte cannot
        # replace the real failure with a UnicodeDecodeError.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace")
        logger.error(f"FFmpeg error: {stderr_text}")
        raise RuntimeError("Failed to extract audio from video") from e

    logger.info(f"Audio extracted to {audio_path}")
    return str(audio_path)


def transcribe(audio_path: str, *, language: Optional[str] = "zh") -> str:
    """Transcribe an audio file to text using the Whisper API.

    Args:
        audio_path: Path to the audio file to upload.
        language: Language hint sent with the request. Defaults to ``"zh"``
            (Chinese), which was previously hard-coded. Pass ``None`` to
            leave the hint out of the request entirely.

    Returns:
        The transcribed text.

    Raises:
        Exception: Any error raised while opening the file or calling the
            API is logged and re-raised unchanged.
    """
    logger.info(f"Transcribing {audio_path}...")

    request_kwargs = {
        "model": "whisper-1",
        "response_format": "text",
    }
    # Only send the hint when one was given, rather than passing an explicit
    # null value through the client.
    if language is not None:
        request_kwargs["language"] = language

    try:
        with open(audio_path, "rb") as audio_file:
            response = client.audio.transcriptions.create(
                file=audio_file,
                **request_kwargs,
            )

        # The result may come back as a plain string or as an object
        # exposing ``.text``; accept both.
        text = response if isinstance(response, str) else response.text
        logger.info(f"Transcription complete: {len(text)} chars")
        return text

    except Exception as e:
        logger.error(f"Whisper API error: {e}")
        raise


def transcribe_video(video_path: str) -> str:
    """Transcribe the speech in a video file.

    Runs the two stages back to back: the audio track is first extracted
    with ``extract_audio_from_video``, and the resulting audio file path is
    then passed to ``transcribe``, whose result is returned.
    """
    return transcribe(extract_audio_from_video(video_path))