"""
MatchMe Studio - Factory Module (Concurrent Scene Generation)

Using Volcengine (Doubao) API for Image and Video
"""
|
||
import base64
import json
import logging
import os
import re
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, Any, List, Optional

import requests
from elevenlabs import ElevenLabs, VoiceSettings
from openai import OpenAI

import config
from modules import storage
|
||
|
||
logger = logging.getLogger(__name__)

# Shared OpenAI-compatible client pointed at the Volcengine (Doubao) endpoint.
# NOTE(review): image generation below uses raw `requests` instead; this client
# appears unused in this chunk — confirm other parts of the file rely on it.
client = OpenAI(
    api_key=config.VOLC_API_KEY,
    base_url=config.VOLC_BASE_URL
)

# ============================================================
# Helper Functions
# ============================================================
|
||
|
||
def _download_as_base64(url: str, timeout: int = 30) -> str:
    """Download an image from *url* and return it Base64-encoded.

    Args:
        url: HTTP(S) URL of the image to fetch.
        timeout: Request timeout in seconds (new, defaulted — prevents a
            stalled download from blocking a worker thread indefinitely).

    Returns:
        Base64-encoded image bytes as an ASCII string, or "" on any failure
        (the error is logged; callers treat "" as "no data").
    """
    try:
        # The original call had no timeout and could hang forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return base64.b64encode(response.content).decode('utf-8')
    except Exception as e:
        logger.error(f"Failed to download/encode image: {e}")
        return ""
|
||
|
||
# ============================================================
|
||
# Image Generation (Doubao / Volcengine)
|
||
# ============================================================
|
||
|
||
def _build_image_prompt(
    scene: Dict[str, Any],
    brief: Optional[Dict[str, Any]] = None
) -> str:
    """Assemble the final text prompt for one scene image.

    Prefers the scene's own ``image_prompt``; otherwise builds a fallback
    from the keyframe fields. The brief's product description is prepended
    when absent so every scene renders a visually consistent product.
    """
    image_prompt = scene.get("image_prompt", "")
    if not image_prompt:
        # Fallback prompt construction from keyframe metadata.
        keyframe = scene.get("keyframe", {})
        # Stronger style consistency intro
        parts = ["Cinematic shot, 8k, photorealistic"]
        if brief:
            if brief.get("product_visual_description"):
                parts.append(f"Product: {brief['product_visual_description']}")
        parts.extend([
            f"Subject: {keyframe.get('subject', 'product')}",
            f"Environment: {keyframe.get('environment', 'studio')}",
            f"Action: {keyframe.get('focus', '')}"
        ])
        image_prompt = ", ".join(parts)

    # Append explicit consistency enforcement to prompt
    if brief and brief.get("product_visual_description"):
        if brief['product_visual_description'] not in image_prompt:
            image_prompt = f"{brief['product_visual_description']}, {image_prompt}"
    return image_prompt


def generate_scene_image(
    scene: Dict[str, Any],
    brief: Optional[Dict[str, Any]] = None,
    reference_images: Optional[List[str]] = None
) -> str:
    """
    Generate one scene image using the Volcengine API (Doubao Image).

    Uses raw ``requests`` to match the documented curl example exactly.

    Args:
        scene: Scene dict; reads ``image_prompt``, ``keyframe`` and ``id``.
        brief: Optional campaign brief; ``product_visual_description`` is
            injected into the prompt for cross-scene visual consistency.
        reference_images: Accepted for interface compatibility; not used by
            the current payload.

    Returns:
        Public R2 URL of the uploaded image.

    Raises:
        ValueError: On a non-200 API response or when no image data comes back.
    """
    image_prompt = _build_image_prompt(scene, brief)
    logger.info(f"Generating image (Volcengine): {image_prompt[:50]}...")

    url = f"{config.VOLC_BASE_URL}/images/generations"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }

    # Payload matching user's curl example
    payload = {
        "model": config.IMAGE_MODEL_ID,
        "prompt": image_prompt,
        "sequential_image_generation": "disabled",
        "response_format": "b64_json",  # Use base64 to avoid temp url expiration issues
        "size": "2K",  # User specified 2K
        "stream": False,
        "watermark": True
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)

        if response.status_code != 200:
            logger.error(f"Image API Error: {response.text}")
            raise ValueError(f"Image API failed: {response.status_code} - {response.text}")

        data = response.json()

        # Extract image data: prefer inline base64, fall back to URL download.
        image_data = None
        if "data" in data and len(data["data"]) > 0:
            image_data = data["data"][0].get("b64_json")
            if not image_data:
                # Fallback to URL download if b64 not present
                img_url = data["data"][0].get("url")
                if img_url:
                    # Download the image to ensure we have it locally
                    image_data = _download_as_base64(img_url)

        if not image_data:
            raise ValueError("No image data returned")

        # Decode and save to a temp file before uploading.
        filename = f"scene_{scene.get('id', 0)}_{int(time.time())}.jpg"
        local_path = config.TEMP_DIR / filename

        with open(local_path, "wb") as f:
            f.write(base64.b64decode(image_data))

        # Upload to R2
        r2_url = storage.upload_file(str(local_path))
        logger.info(f"Scene {scene.get('id', '?')} image uploaded: {r2_url}")
        return r2_url

    except Exception as e:
        logger.error(f"Image Generation Failed: {e}")
        raise
|
||
|
||
|
||
def generate_all_scene_images_concurrent(
    scenes: List[Dict[str, Any]],
    brief: Dict[str, Any] = None,
    reference_images: List[str] = None,
    max_workers: int = 3
) -> List[str]:
    """Render every scene's image in parallel.

    Results land at the index of their scene; a scene whose generation raised
    is left as None (the failure is logged, never re-raised).
    """
    logger.info(f"Generating {len(scenes)} images concurrently...")
    results: List[str] = [None] * len(scenes)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for idx, sc in enumerate(scenes):
            fut = pool.submit(generate_scene_image, sc, brief, reference_images)
            pending[fut] = idx

        for fut in as_completed(pending):
            idx = pending[fut]
            try:
                results[idx] = fut.result()
            except Exception as e:
                logger.error(f"Scene {idx+1} failed: {e}")

    return results
|
||
|
||
|
||
# ============================================================
|
||
# Video Generation (Doubao Video / PixelDance)
|
||
# ============================================================
|
||
|
||
def generate_scene_video(
    start_frame_url: str,
    motion_prompt: str,
    duration: int = 5
) -> str:
    """
    Generate video using Volcengine API (Async Task Flow).

    Flow: create an async generation task, poll its status every 5 seconds
    (up to ~5 minutes), then download the finished clip and upload it to R2.

    Args:
        start_frame_url: Image URL used as the first frame; when falsy the
            request is pure text-to-video.
        motion_prompt: Text prompt; rendering flags (resolution, duration,
            camera, watermark) are appended inline as "--flag value" tokens.
        duration: Clip length in seconds, embedded in the prompt directives.

    Returns:
        Public R2 URL of the uploaded video.

    Raises:
        ValueError: Task creation failed, the task reported "failed", or the
            final download returned a non-200 status.
        TimeoutError: Polling exhausted without obtaining a video URL.
    """
    logger.info(f"Generating video (Volcengine): {motion_prompt[:50]}...")

    # 1. Create Task
    create_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }

    # Construct Content List (Text + Optional Image)
    content_list = [
        {
            "type": "text",
            "text": f"{motion_prompt} --resolution 1080p --duration {duration} --camerafixed false --watermark true"
        }
    ]

    if start_frame_url:
        content_list.append({
            "type": "image_url",
            "image_url": {"url": start_frame_url}
        })

    payload = {
        "model": config.VIDEO_MODEL_ID,
        "content": content_list
    }

    try:
        response = requests.post(create_url, headers=headers, json=payload, timeout=30)
        if response.status_code != 200:
            # 202 Accepted is also possible for async tasks
            if response.status_code != 202:
                logger.error(f"Video Task Creation Error: {response.text}")
                raise ValueError(f"Video Task failed: {response.status_code} - {response.text}")

        data = response.json()
        task_id = data.get("id")
        if not task_id:
            # Sometimes ID is in data.id or similar
            task_id = data.get("data", {}).get("id")

        if not task_id:
            raise ValueError(f"No Task ID returned: {data}")

        logger.info(f"Video Task Created: {task_id}. Polling for result...")

        # 2. Poll for Result
        # GET /contents/generations/tasks/{id}
        max_retries = 60  # 5 mins max (5s interval)
        video_url = None

        for _ in range(max_retries):
            # Sleep first: the task is never done immediately after creation.
            time.sleep(5)
            status_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks/{task_id}"
            resp = requests.get(status_url, headers=headers, timeout=30)

            # Non-200 poll responses are silently retried until max_retries.
            if resp.status_code == 200:
                res_data = resp.json()
                # Check status
                # Structure usually: data.status = "succeeded" / "running" / "failed"
                # Or top level status

                status = res_data.get("status")
                if not status and "data" in res_data:
                    status = res_data["data"].get("status")

                if status == "succeeded" or status == "SUCCEEDED":
                    # Extract URL: content may live under data.content or at top level.
                    content = res_data.get("data", {}).get("content", [])
                    if not content and "content" in res_data:
                        content = res_data["content"]

                    # Find video url in content
                    # Content is usually list of dicts with type='video' or 'video_url'
                    for item in content:
                        if item.get("video_url"):
                            video_url = item["video_url"]
                            break
                        if item.get("url"):  # sometimes just url
                            video_url = item["url"]
                            break

                    if video_url:
                        break
                elif status == "failed" or status == "FAILED":
                    reason = res_data.get("data", {}).get("error", "Unknown error")
                    raise ValueError(f"Video Generation Failed: {reason}")

            # If running/queued, continue waiting

        if not video_url:
            raise TimeoutError("Video generation timed out or failed to return URL.")

        # 3. Download and Upload to R2
        logger.info(f"Video Generated. Downloading: {video_url}")
        filename = f"vid_doubao_{int(time.time())}.mp4"
        local_path = config.TEMP_DIR / filename

        # Stream to disk in chunks so large videos don't sit in memory.
        resp = requests.get(video_url, stream=True)
        if resp.status_code != 200:
            raise ValueError(f"Failed to download generated video: {resp.status_code}")

        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

        r2_url = storage.upload_file(str(local_path))
        return r2_url

    except Exception as e:
        # Log and re-raise so concurrent callers can record per-scene failures.
        logger.error(f"Video Generation Error: {e}")
        raise
|
||
|
||
|
||
def generate_all_scene_videos_concurrent(
    scenes: List[Dict[str, Any]],
    image_urls: List[str],
    max_workers: int = 2
) -> List[str]:
    """Animate every scene's keyframe image in parallel.

    The motion prompt is the scene's image_prompt (when present) followed by
    its camera_movement. Failed scenes are logged and left as None.
    """
    logger.info(f"Generating {len(scenes)} videos concurrently...")
    results: List[str] = [None] * len(scenes)

    def _render(idx: int) -> str:
        sc = scenes[idx]
        motion = sc.get("camera_movement", "slow zoom")
        if sc.get("image_prompt"):
            motion = f"{sc['image_prompt']}. {motion}"
        return generate_scene_video(image_urls[idx], motion, sc.get("duration", 5))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(_render, i): i for i in range(len(scenes))}

        for fut in as_completed(pending):
            i = pending[fut]
            try:
                results[i] = fut.result()
            except Exception as e:
                logger.error(f"Scene {i+1} video failed: {e}")

    return results
|
||
|
||
|
||
# ============================================================
|
||
# Audio Generation (ElevenLabs)
|
||
# ============================================================
|
||
|
||
def generate_voiceover(text: str, style: str = "") -> str:
    """Synthesize a voiceover with ElevenLabs and upload it. Returns R2 URL.

    ASMR-style briefs get lower stability / higher similarity for a softer
    read. Empty text or any synthesis failure yields "".
    """
    if not text or not text.strip():
        return ""

    is_asmr = "ASMR" in style
    stability = 0.3 if is_asmr else 0.5
    similarity = 0.9 if is_asmr else 0.8

    logger.info(f"Generating voiceover ({len(text)} chars, style={style})...")

    try:
        settings = VoiceSettings(stability=stability, similarity_boost=similarity)
        stream = ElevenLabs(api_key=config.XI_KEY).text_to_speech.convert(
            voice_id=config.ELEVENLABS_VOICE_ID,
            text=text,
            model_id=config.ELEVENLABS_MODEL,
            voice_settings=settings
        )

        local_path = config.TEMP_DIR / f"vo_{int(time.time())}.mp3"
        with open(local_path, "wb") as out:
            for piece in stream:
                out.write(piece)

        return storage.upload_file(str(local_path))
    except Exception as e:
        logger.error(f"Voiceover failed: {e}")
        return ""
|
||
|
||
|
||
def generate_full_voiceover(scenes: List[Dict[str, Any]], style: str = "") -> str:
    """Join every scene's spoken line into one narration and synthesize it.

    Lines that are empty, whitespace-only, or parenthesized stage directions
    (starting with "(") are skipped; returns "" when nothing remains.
    """
    spoken = []
    for sc in scenes:
        line = sc.get("voiceover", "")
        if line and line.strip() and not line.startswith("("):
            spoken.append(line.strip())

    if not spoken:
        return ""

    return generate_voiceover(" ".join(spoken), style)
|
||
|
||
|
||
# ============================================================
# Audio Generation (Edge TTS - free Chinese speech synthesis)
# ============================================================

# Edge TTS Chinese voice presets (free, good quality).
# Keys are the friendly aliases used throughout this module; values are
# Microsoft Edge neural voice identifiers.
EDGE_TTS_VOICES = {
    # Female voices
    "sweet_female": "zh-CN-XiaoxiaoNeural",     # Xiaoxiao - sweet & lively (recommended)
    "gentle_female": "zh-CN-XiaoyiNeural",      # Xiaoyi - gentle & refined
    "lively_female": "zh-CN-XiaochenNeural",    # Xiaochen - lively & cute
    "broadcast_female": "zh-CN-XiaoqiuNeural",  # Xiaoqiu - news broadcast
    # Male voices
    "general_male": "zh-CN-YunxiNeural",        # Yunxi - warm male voice
    "broadcast_male": "zh-CN-YunjianNeural",    # Yunjian - professional broadcast
}
|
||
|
||
# Volcengine TTS voice presets (the paid service must be enabled) -
# voices chosen to suit Douyin live-commerce style narration.
VOLC_TTS_VOICES = {
    # Douyin-commerce-friendly female voices
    "sweet_female": "zh_female_vv_uranus_bigtts",           # viv 2.0 general female (sweet)
    "lively_female": "zh_female_jitangnv_saturn_bigtts",    # "Jitangnv" (energetic)
    # NOTE(review): this id is prefixed zh_male_ yet sits under a *_female key -
    # per the original comment, swap to zh_female_meilinyou_saturn_bigtts
    # if a female broadcaster is required. Confirm intent.
    "broadcast_female": "zh_male_ruyaichen_saturn_bigtts",  # "Ruyaichen" (news broadcast)
    "meilinvyou": "zh_female_meilinvyou_saturn_bigtts",
    # Male voices
    "general_male": "zh_male_dayi_saturn_bigtts",           # "Dayi" (steady male voice)
}
|
||
|
||
|
||
def generate_voiceover_edge(
    text: str,
    voice_type: str = "sweet_female",
    rate: str = "+0%",
    volume: str = "+0%",
    output_path: Optional[str] = None
) -> str:
    """
    Generate Chinese narration with Edge TTS (free, good quality).

    Args:
        text: Narration text.
        voice_type: Preset alias (see EDGE_TTS_VOICES) or a raw voice name.
        rate: Speech-rate adjustment, e.g. "+10%", "-20%".
        volume: Volume adjustment, e.g. "+10%", "-20%".
        output_path: Target file path; auto-generated under TEMP_DIR if omitted.

    Returns:
        Path of the generated audio file, or "" after all retries fail.
    """
    import asyncio
    import edge_tts

    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""

    # Resolve preset alias; unknown aliases fall through as raw voice names.
    voice = EDGE_TTS_VOICES.get(voice_type, voice_type)

    logger.info(f"Generating voiceover (Edge TTS): {len(text)} chars, voice={voice}")

    if not output_path:
        filename = f"vo_edge_{int(time.time())}.mp3"
        output_path = str(config.TEMP_DIR / filename)

    async def _generate():
        communicate = edge_tts.Communicate(text, voice, rate=rate, volume=volume)
        await communicate.save(output_path)

    # Simple retry loop. Edge TTS occasionally fails transiently or writes a
    # missing/empty file; previously the empty-file case retried immediately
    # (no backoff, no log) because the sleep sat in the except path only.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            asyncio.run(_generate())
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                logger.info(f"Edge TTS voiceover generated: {output_path}")
                return output_path
            logger.warning(f"Edge TTS attempt {attempt+1} produced no audio")
        except Exception as e:
            logger.warning(f"Edge TTS attempt {attempt+1} failed: {e}")
        time.sleep(1.0)  # wait before retry

    logger.error("Edge TTS failed after retries.")
    return ""
|
||
|
||
|
||
def generate_voiceover_volcengine_ws(
    text: str,
    voice_type: str = "sweet_female",
    output_path: Optional[str] = None,
    timeout: int = 120
) -> str:
    """
    Generate TTS audio via the Volcengine WebSocket binary demo.

    Shells out to the vendored demo script, which must be installed (with its
    own virtualenv) under {PROJECT_ROOT}/volcengine_binary_demo/.venv.
    NOTE(review): the ".venv/bin/python" layout is POSIX-only — confirm this
    path never needs to run on Windows.

    Args:
        text: Narration text to synthesize.
        voice_type: Preset alias (see VOLC_TTS_VOICES) or a raw voice id.
        output_path: Destination mp3 path; auto-generated when omitted.
        timeout: Max seconds to wait for the subprocess.

    Returns:
        Path of the generated mp3, or "" on any failure (errors are logged,
        never raised — callers use "" to fall back to other backends).
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS (ws)")
        return ""

    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)

    # Resolve paths relative to the project root (portable across checkouts).
    volc_demo_dir = config.BASE_DIR / "volcengine_binary_demo"
    venv_python = volc_demo_dir / ".venv" / "bin" / "python"
    demo_script = volc_demo_dir / "examples" / "volcengine" / "binary.py"

    if not venv_python.exists() or not demo_script.exists():
        logger.error("Volcengine WS demo or venv not found. Please install under volcengine_binary_demo/.venv")
        return ""

    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_ws_{int(time.time())}.mp3")

    cmd = [
        str(venv_python),
        str(demo_script),
        "--appid", config.VOLC_TTS_APPID,
        "--access_token", config.VOLC_TTS_ACCESS_TOKEN,
        "--voice_type", voice_id,
        "--text", text,
        "--encoding", "mp3",
    ]

    logger.info(f"Calling Volcengine WS TTS: voice={voice_id}, len={len(text)}")
    try:
        # List-form argv (shell=False) keeps arbitrary narration text safe.
        result = subprocess.run(
            cmd,
            cwd=str(volc_demo_dir),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if result.returncode != 0:
            logger.error(f"Volc WS TTS failed: {result.stderr}")
            return ""

        # The demo writes its output as {voice_id}.mp3 in its own cwd.
        demo_out = volc_demo_dir / f"{voice_id}.mp3"
        if not demo_out.exists():
            logger.error("Volc WS TTS output not found")
            return ""

        # Copy the demo's output to the requested location.
        Path(output_path).write_bytes(demo_out.read_bytes())
        logger.info(f"Volc WS TTS saved to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volc WS TTS error: {e}")
        return ""
|
||
|
||
|
||
def generate_voiceover_volcengine(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    volume_ratio: float = 1.0,
    pitch_ratio: float = 1.0,
    output_path: Optional[str] = None
) -> str:
    """
    Generate Chinese narration with Volcengine TTS.

    Tries backends in order: the WebSocket binary demo (verified working),
    the Volcengine HTTP TTS API, and finally Edge TTS as a free fallback.

    Args:
        text: Narration text.
        voice_type: Preset alias (see VOLC_TTS_VOICES) or a raw voice id.
        speed_ratio: Speech speed (0.5-2.0, default 1.0).
        volume_ratio: Volume (0.5-2.0, default 1.0).
        pitch_ratio: Pitch (0.5-2.0, default 1.0).
        output_path: Destination path (auto-generated when omitted).

    Returns:
        Path of the generated audio file ("" only if every backend fails).
    """
    import uuid

    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""

    def _edge_fallback() -> str:
        # Fall back to Edge TTS with a safe default voice when the alias is
        # not a known Edge preset (was duplicated at three call sites).
        fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
        return generate_voiceover_edge(text, fallback_voice, output_path=output_path)

    # Resolve voice id (Volcengine preset table, falling back to a custom id).
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)

    logger.info(f"Generating voiceover (Volcengine TTS): {len(text)} chars, voice={voice_id}")

    # Try the WebSocket binary demo first.
    ws_path = generate_voiceover_volcengine_ws(text, voice_type, output_path)
    if ws_path:
        return ws_path

    # WS failed - try the HTTP API next.
    url = "https://openspeech.bytedance.com/api/v1/tts"

    headers = {
        "Content-Type": "application/json",
        # Volcengine expects the unusual "Bearer;<token>" form (semicolon).
        "Authorization": f"Bearer;{config.VOLC_TTS_ACCESS_TOKEN}"
    }

    payload = {
        "app": {
            "appid": config.VOLC_TTS_APPID,
            "token": config.VOLC_TTS_ACCESS_TOKEN,
            "cluster": "volcano_tts"
        },
        "user": {
            "uid": "video_flow_user"
        },
        "audio": {
            "voice_type": voice_id,
            "encoding": "mp3",
            "speed_ratio": speed_ratio,
            "volume_ratio": volume_ratio,
            "pitch_ratio": pitch_ratio
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "text_type": "plain",
            "operation": "query",
            "with_timestamp": "1",
            "extra_param": json.dumps({
                "disable_markdown_filter": False
            })
        }
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)

        if response.status_code != 200:
            logger.error(f"Volcengine TTS Error: {response.status_code} - {response.text}")
            return _edge_fallback()

        data = response.json()

        # Volcengine success codes observed: 0, 3000, 20000000.
        ret_code = data.get("code")
        if ret_code not in (0, 3000, 20000000):
            error_msg = data.get("message", "Unknown error")
            logger.error(f"Volcengine TTS Error: {error_msg}")
            return _edge_fallback()

        # The mp3 payload comes back base64-encoded in "data".
        audio_data = data.get("data", "")
        if not audio_data:
            raise ValueError("No audio data returned")

        if not output_path:
            filename = f"vo_volc_{int(time.time())}.mp3"
            output_path = str(config.TEMP_DIR / filename)

        with open(output_path, "wb") as f:
            f.write(base64.b64decode(audio_data))

        logger.info(f"Voiceover generated (HTTP): {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Volcengine TTS HTTP error: {e}")
        return _edge_fallback()
|
||
|
||
|
||
def _split_text_into_chunks(text: str, max_chunk_length: int) -> List[str]:
    """Split *text* into chunks of at most ~max_chunk_length characters.

    Splitting happens on CJK/ASCII sentence punctuation so every chunk ends on
    a sentence boundary; a single sentence longer than the limit becomes its
    own (oversized) chunk.
    """
    # The capture group keeps punctuation tokens at odd indices, so
    # (sentence, punctuation) pairs can be rejoined below.
    sentences = re.split(r'([。!?;.!?;])', text)

    chunks: List[str] = []
    current_chunk = ""

    # Greedily pack sentence+punctuation pairs into chunks.
    for i in range(0, len(sentences) - 1, 2):
        sentence = sentences[i] + (sentences[i + 1] if i + 1 < len(sentences) else "")

        if len(current_chunk) + len(sentence) <= max_chunk_length:
            current_chunk += sentence
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    # A trailing fragment with no closing punctuation is the final odd-index
    # element; append it to the last chunk (or make it the only chunk).
    if len(sentences) % 2 == 1 and sentences[-1]:
        if chunks:
            chunks[-1] += sentences[-1]
        else:
            chunks.append(sentences[-1])

    return chunks


def generate_voiceover_volcengine_long(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    output_path: Optional[str] = None,
    max_chunk_length: int = 300
) -> str:
    """
    Volcengine TTS for long text (automatic chunked synthesis).

    Text longer than max_chunk_length is split on sentence boundaries,
    synthesized chunk by chunk, and concatenated with FFmpeg (stream copy).

    Args:
        text: Narration text of any length.
        voice_type: Preset alias or raw voice id.
        speed_ratio: Speech speed passed through to the TTS backend.
        output_path: Final audio path (auto-generated when omitted).
        max_chunk_length: Max characters per synthesis request.

    Returns:
        Path of the final audio file.

    Raises:
        ValueError: When every chunk fails to synthesize.
        subprocess.CalledProcessError: When the FFmpeg merge fails.
    """
    # Short text: single-shot synthesis, no splitting needed.
    if len(text) <= max_chunk_length:
        return generate_voiceover_volcengine(
            text=text,
            voice_type=voice_type,
            speed_ratio=speed_ratio,
            output_path=output_path
        )

    logger.info(f"Long text ({len(text)} chars), splitting into chunks...")

    chunks = _split_text_into_chunks(text, max_chunk_length)
    logger.info(f"Split into {len(chunks)} chunks")

    # Synthesize each chunk; individual failures are logged and skipped.
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_path = str(config.TEMP_DIR / f"vo_chunk_{i}_{int(time.time())}.mp3")
        try:
            path = generate_voiceover_volcengine(
                text=chunk,
                voice_type=voice_type,
                speed_ratio=speed_ratio,
                output_path=chunk_path
            )
            chunk_files.append(path)
        except Exception as e:
            logger.error(f"Chunk {i} failed: {e}")
            # Keep going with the remaining chunks.

    if not chunk_files:
        raise ValueError("All TTS chunks failed")

    # Single surviving chunk: no merge needed.
    if len(chunk_files) == 1:
        if output_path:
            import shutil
            shutil.move(chunk_files[0], output_path)
            return output_path
        return chunk_files[0]

    # Write the FFmpeg concat list. Paths are TEMP_DIR-generated, so no quote
    # escaping is expected to be necessary inside the single-quoted entries.
    concat_list = config.TEMP_DIR / f"concat_audio_{os.getpid()}.txt"
    with open(concat_list, "w") as f:
        for cf in chunk_files:
            f.write(f"file '{cf}'\n")

    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_merged_{int(time.time())}.mp3")

    # Lossless merge via the concat demuxer (stream copy, no re-encode).
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat",
        "-safe", "0",
        "-i", str(concat_list),
        "-c", "copy",
        output_path
    ]

    subprocess.run(cmd, capture_output=True, check=True)

    # Clean up intermediate files (best effort).
    for cf in chunk_files:
        try:
            os.remove(cf)
        except OSError:
            pass
    concat_list.unlink(missing_ok=True)

    logger.info(f"Merged voiceover: {output_path}")
    return output_path
|
||
|
||
|
||
def generate_scene_voiceovers_volcengine(
    scenes: List[Dict[str, Any]],
    voice_type: str = "sweet_female",
    output_dir: str = None
) -> List[str]:
    """
    Synthesize a separate narration file for each scene.

    Args:
        scenes: Scene dicts; each may carry a "voiceover" text field.
        voice_type: Voice preset alias.
        output_dir: Directory for the mp3 files (TEMP_DIR when omitted).

    Returns:
        One audio path per scene; "" marks scenes with no usable narration
        or a failed synthesis.
    """
    target_dir = Path(output_dir) if output_dir else config.TEMP_DIR
    if output_dir:
        target_dir.mkdir(exist_ok=True)

    results: List[str] = []

    for idx, sc in enumerate(scenes):
        vo_text = sc.get("voiceover", "")

        # Skip blanks and parenthesized stage directions.
        if not vo_text or not vo_text.strip() or vo_text.startswith("("):
            results.append("")
            continue

        try:
            dest = str(target_dir / f"scene_{idx+1}_vo.mp3")
            results.append(generate_voiceover_volcengine(
                text=vo_text.strip(),
                voice_type=voice_type,
                output_path=dest
            ))
        except Exception as e:
            logger.error(f"Scene {idx+1} voiceover failed: {e}")
            results.append("")

    return results
|