feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
This commit is contained in:
Tony Zhang
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions

801
modules/factory.py Normal file
View File

@@ -0,0 +1,801 @@
"""
MatchMe Studio - Factory Module (Concurrent Scene Generation)
Using Volcengine (Doubao) API for Image and Video
"""
import os
import time
import logging
import requests
import json
import re
import base64
import subprocess
from pathlib import Path
from typing import Dict, Any, List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from elevenlabs import ElevenLabs, VoiceSettings
from openai import OpenAI
import config
from modules import storage
logger = logging.getLogger(__name__)
# Initialize OpenAI Client for Volcengine Image Generation
# NOTE(review): this client is created at import time, so importing this
# module requires config.VOLC_API_KEY / VOLC_BASE_URL to be set — verify
# config is loaded before import.
client = OpenAI(
    api_key=config.VOLC_API_KEY,
    base_url=config.VOLC_BASE_URL
)
# ============================================================
# Helper Functions
# ============================================================
def _download_as_base64(url: str) -> str:
    """Download an image from *url* and return it Base64-encoded.

    Best-effort helper: returns an empty string on any network/HTTP failure
    instead of raising.
    """
    try:
        # FIX: added timeout — the original call had none, so a stalled
        # connection could hang a worker thread indefinitely.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return base64.b64encode(response.content).decode('utf-8')
    except Exception as e:
        logger.error(f"Failed to download/encode image: {e}")
        return ""
# ============================================================
# Image Generation (Doubao / Volcengine)
# ============================================================
def generate_scene_image(
    scene: Dict[str, Any],
    brief: Optional[Dict[str, Any]] = None,
    reference_images: Optional[List[str]] = None
) -> str:
    """
    Generate an image for one scene via the Volcengine (Doubao) image API.

    Uses raw ``requests`` (not the OpenAI client) to match the documented
    curl example exactly.

    Args:
        scene: Scene dict; reads "image_prompt" (preferred), "keyframe", "id".
        brief: Optional creative brief; "product_visual_description" is
            injected into the prompt to keep the product visually consistent.
        reference_images: Accepted for interface compatibility; not used here.

    Returns:
        Public R2 URL of the uploaded image.

    Raises:
        ValueError: if the API returns a non-200 status or no image data.
    """
    # Prefer the scene's explicit prompt; otherwise compose one from the
    # keyframe description.
    image_prompt = scene.get("image_prompt", "")
    if not image_prompt:
        keyframe = scene.get("keyframe", {})
        # Stronger style-consistency intro
        parts = ["Cinematic shot, 8k, photorealistic"]
        if brief and brief.get("product_visual_description"):
            parts.append(f"Product: {brief['product_visual_description']}")
        parts.extend([
            f"Subject: {keyframe.get('subject', 'product')}",
            f"Environment: {keyframe.get('environment', 'studio')}",
            f"Action: {keyframe.get('focus', '')}"
        ])
        image_prompt = ", ".join(parts)
    # Prepend the product description when missing from the prompt, to
    # enforce consistency across all generated scenes.
    if brief and brief.get("product_visual_description"):
        if brief['product_visual_description'] not in image_prompt:
            image_prompt = f"{brief['product_visual_description']}, {image_prompt}"
    logger.info(f"Generating image (Volcengine): {image_prompt[:50]}...")
    url = f"{config.VOLC_BASE_URL}/images/generations"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }
    # Payload matching the documented curl example
    payload = {
        "model": config.IMAGE_MODEL_ID,
        "prompt": image_prompt,
        "sequential_image_generation": "disabled",
        "response_format": "b64_json",  # base64 avoids temp-URL expiration issues
        "size": "2K",
        "stream": False,
        "watermark": True
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code != 200:
            logger.error(f"Image API Error: {response.text}")
            raise ValueError(f"Image API failed: {response.status_code} - {response.text}")
        data = response.json()
        # Extract image bytes: prefer inline base64, fall back to downloading
        # the returned URL (temp URLs expire, so grab the bytes immediately).
        image_data = None
        if "data" in data and len(data["data"]) > 0:
            image_data = data["data"][0].get("b64_json")
            if not image_data:
                img_url = data["data"][0].get("url")
                if img_url:
                    image_data = _download_as_base64(img_url)
        if not image_data:
            raise ValueError("No image data returned")
        # Decode, save locally, then upload to R2
        filename = f"scene_{scene.get('id', 0)}_{int(time.time())}.jpg"
        local_path = config.TEMP_DIR / filename
        with open(local_path, "wb") as f:
            f.write(base64.b64decode(image_data))
        r2_url = storage.upload_file(str(local_path))
        logger.info(f"Scene {scene.get('id', '?')} image uploaded: {r2_url}")
        return r2_url
    except Exception as e:
        logger.error(f"Image Generation Failed: {e}")
        raise
def generate_all_scene_images_concurrent(
    scenes: List[Dict[str, Any]],
    brief: Optional[Dict[str, Any]] = None,
    reference_images: Optional[List[str]] = None,
    max_workers: int = 3
) -> List[str]:
    """Generate images for all scenes concurrently.

    Args:
        scenes: Scene dicts, passed through to generate_scene_image.
        brief: Optional creative brief shared by all scenes.
        reference_images: Passed through; unused downstream.
        max_workers: Thread pool size (bounds concurrent API calls).

    Returns:
        URLs aligned index-for-index with *scenes*; a failed scene leaves
        None at its slot, so callers must handle gaps.
    """
    logger.info(f"Generating {len(scenes)} images concurrently...")
    image_urls: List[Optional[str]] = [None] * len(scenes)

    def generate_single(index: int, scene: Dict[str, Any]) -> tuple:
        # Carry the index alongside the result so completion order is irrelevant
        url = generate_scene_image(scene, brief, reference_images)
        return index, url

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(generate_single, i, scene): i
            for i, scene in enumerate(scenes)
        }
        for future in as_completed(futures):
            index = futures[future]
            try:
                _, url = future.result()
                image_urls[index] = url
            except Exception as e:
                # Log and continue; other scenes should still complete.
                logger.error(f"Scene {index+1} failed: {e}")
    return image_urls
# ============================================================
# Video Generation (Doubao Video / PixelDance)
# ============================================================
def generate_scene_video(
    start_frame_url: str,
    motion_prompt: str,
    duration: int = 5
) -> str:
    """
    Generate a video clip via the Volcengine async task API.

    Creates a generation task, polls it until it succeeds or fails, then
    downloads the result and uploads it to R2.

    Args:
        start_frame_url: Optional image URL used as the clip's first frame.
        motion_prompt: Text prompt describing motion / camera work.
        duration: Clip length in seconds (passed as an inline --duration flag).

    Returns:
        Public R2 URL of the uploaded video.

    Raises:
        ValueError: on task-creation failure, generation failure, missing
            task ID, missing result URL, or a failed download.
        TimeoutError: if polling exhausts its retries.
    """
    logger.info(f"Generating video (Volcengine): {motion_prompt[:50]}...")
    # 1. Create the generation task
    create_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }
    # Content list: text prompt (with inline generation flags) + optional first frame
    content_list = [
        {
            "type": "text",
            "text": f"{motion_prompt} --resolution 1080p --duration {duration} --camerafixed false --watermark true"
        }
    ]
    if start_frame_url:
        content_list.append({
            "type": "image_url",
            "image_url": {"url": start_frame_url}
        })
    payload = {
        "model": config.VIDEO_MODEL_ID,
        "content": content_list
    }
    try:
        response = requests.post(create_url, headers=headers, json=payload, timeout=30)
        if response.status_code != 200:
            # 202 Accepted is also a valid response for async task creation
            if response.status_code != 202:
                logger.error(f"Video Task Creation Error: {response.text}")
                raise ValueError(f"Video Task failed: {response.status_code} - {response.text}")
        data = response.json()
        # Task ID may be top-level or nested under "data"
        task_id = data.get("id")
        if not task_id:
            task_id = data.get("data", {}).get("id")
        if not task_id:
            raise ValueError(f"No Task ID returned: {data}")
        logger.info(f"Video Task Created: {task_id}. Polling for result...")
        # 2. Poll GET /contents/generations/tasks/{id} until done
        max_retries = 60  # 5 mins max (5s interval)
        video_url = None
        for _ in range(max_retries):
            time.sleep(5)
            status_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks/{task_id}"
            resp = requests.get(status_url, headers=headers, timeout=30)
            if resp.status_code == 200:
                res_data = resp.json()
                # Status may be top-level or under "data"
                status = res_data.get("status")
                if not status and "data" in res_data:
                    status = res_data["data"].get("status")
                if status == "succeeded" or status == "SUCCEEDED":
                    # Extract the video URL from the content list
                    content = res_data.get("data", {}).get("content", [])
                    if not content and "content" in res_data:
                        content = res_data["content"]
                    # Items may carry "video_url" or just "url"
                    for item in content:
                        if item.get("video_url"):
                            video_url = item["video_url"]
                            break
                        if item.get("url"):
                            video_url = item["url"]
                            break
                    if video_url:
                        break
                    # FIX: a succeeded task with no URL will never improve —
                    # fail fast instead of polling until the timeout fires.
                    raise ValueError(f"Task succeeded but no video URL in response: {res_data}")
                elif status == "failed" or status == "FAILED":
                    reason = res_data.get("data", {}).get("error", "Unknown error")
                    raise ValueError(f"Video Generation Failed: {reason}")
                # still running/queued — keep waiting
        if not video_url:
            raise TimeoutError("Video generation timed out or failed to return URL.")
        # 3. Download the result and upload it to R2
        logger.info(f"Video Generated. Downloading: {video_url}")
        filename = f"vid_doubao_{int(time.time())}.mp4"
        local_path = config.TEMP_DIR / filename
        # FIX: added timeout so a stalled download cannot hang the worker forever
        resp = requests.get(video_url, stream=True, timeout=300)
        if resp.status_code != 200:
            raise ValueError(f"Failed to download generated video: {resp.status_code}")
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
        r2_url = storage.upload_file(str(local_path))
        return r2_url
    except Exception as e:
        logger.error(f"Video Generation Error: {e}")
        raise
def generate_all_scene_videos_concurrent(
    scenes: List[Dict[str, Any]],
    image_urls: List[str],
    max_workers: int = 2
) -> List[str]:
    """Generate videos concurrently.

    Each scene's start frame is taken from image_urls at the same index;
    the returned list is aligned with *scenes* (None where a scene failed).
    """
    logger.info(f"Generating {len(scenes)} videos concurrently...")
    results = [None] * len(scenes)

    def _render_one(idx, scene_data, frame_url):
        # Motion prompt = optional image prompt + camera movement
        movement = scene_data.get("camera_movement", "slow zoom")
        prompt = scene_data.get("image_prompt")
        if prompt:
            movement = f"{prompt}. {movement}"
        clip_len = scene_data.get("duration", 5)
        return idx, generate_scene_video(frame_url, movement, clip_len)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for idx, scene_data in enumerate(scenes):
            fut = pool.submit(_render_one, idx, scene_data, image_urls[idx])
            pending[fut] = idx
        for fut in as_completed(pending):
            idx = pending[fut]
            try:
                done_idx, clip_url = fut.result()
                results[done_idx] = clip_url
            except Exception as e:
                logger.error(f"Scene {idx+1} video failed: {e}")
    return results
# ============================================================
# Audio Generation (ElevenLabs)
# ============================================================
def generate_voiceover(text: str, style: str = "") -> str:
    """Generate voiceover audio via ElevenLabs. Returns R2 URL ("" on failure)."""
    if not text or not text.strip():
        return ""
    # ASMR styles want a more expressive, less stable delivery
    asmr_mode = "ASMR" in style
    stability = 0.3 if asmr_mode else 0.5
    similarity = 0.9 if asmr_mode else 0.8
    logger.info(f"Generating voiceover ({len(text)} chars, style={style})...")
    try:
        tts_client = ElevenLabs(api_key=config.XI_KEY)
        stream = tts_client.text_to_speech.convert(
            voice_id=config.ELEVENLABS_VOICE_ID,
            text=text,
            model_id=config.ELEVENLABS_MODEL,
            voice_settings=VoiceSettings(stability=stability, similarity_boost=similarity)
        )
        out_file = config.TEMP_DIR / f"vo_{int(time.time())}.mp3"
        with open(out_file, "wb") as fh:
            for piece in stream:
                fh.write(piece)
        return storage.upload_file(str(out_file))
    except Exception as e:
        logger.error(f"Voiceover failed: {e}")
        return ""
def generate_full_voiceover(scenes: List[Dict[str, Any]], style: str = "") -> str:
    """Generate one combined voiceover covering every scene's narration.

    Lines that are empty, whitespace-only, or parenthesized stage
    directions (starting with "(") are skipped. Returns "" when nothing
    remains to speak.
    """
    spoken = []
    for scene in scenes:
        line = scene.get("voiceover", "")
        if not line or not line.strip():
            continue
        if line.startswith("("):
            continue
        spoken.append(line.strip())
    if not spoken:
        return ""
    return generate_voiceover(" ".join(spoken), style)
# ============================================================
# Audio Generation (Edge TTS - free Chinese speech synthesis)
# ============================================================
# Edge TTS Chinese voice presets (free, good quality)
EDGE_TTS_VOICES = {
    # Female voices
    "sweet_female": "zh-CN-XiaoxiaoNeural",  # Xiaoxiao - sweet & lively (recommended)
    "gentle_female": "zh-CN-XiaoyiNeural",  # Xiaoyi - gentle & refined
    "lively_female": "zh-CN-XiaochenNeural",  # Xiaochen - lively & cute
    "broadcast_female": "zh-CN-XiaoqiuNeural",  # Xiaoqiu - news broadcast
    # Male voices
    "general_male": "zh-CN-YunxiNeural",  # Yunxi - warm male voice
    "broadcast_male": "zh-CN-YunjianNeural",  # Yunjian - professional broadcast
}
# Volcengine TTS voice presets (requires the service to be enabled) -
# voices chosen to suit Douyin live-commerce content
VOLC_TTS_VOICES = {
    # Douyin-commerce-friendly female voices
    "sweet_female": "zh_female_vv_uranus_bigtts",  # viv 2.0 general female (sweet)
    "lively_female": "zh_female_jitangnv_saturn_bigtts",  # "Jitangnv" (energetic)
    "broadcast_female": "zh_male_ruyaichen_saturn_bigtts",  # "Ruyaichen" (news broadcast) - NOTE(review): this ID looks male; swap to zh_female_meilinyou_saturn_bigtts for a female broadcaster
    "meilinvyou": "zh_female_meilinvyou_saturn_bigtts",
    # Male voices
    "general_male": "zh_male_dayi_saturn_bigtts",  # "Dayi" (steady male voice)
}
def generate_voiceover_edge(
    text: str,
    voice_type: str = "sweet_female",
    rate: str = "+0%",
    volume: str = "+0%",
    output_path: Optional[str] = None
) -> str:
    """
    Generate Chinese narration with Edge TTS (free, good quality).

    Args:
        text: Narration text.
        voice_type: Key into EDGE_TTS_VOICES, or a raw Edge voice name.
        rate: Speed adjustment, e.g. "+10%", "-20%".
        volume: Volume adjustment, e.g. "+10%", "-20%".
        output_path: Output file path; auto-generated under TEMP_DIR if omitted.

    Returns:
        Path to the generated audio file, or "" on failure.
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""
    # FIX: lazy imports moved below the empty-text guard so trivially-empty
    # calls never pay for (or fail on) the optional edge_tts dependency.
    import asyncio
    import edge_tts
    # Resolve preset to an Edge voice; unknown names pass through as-is.
    voice = EDGE_TTS_VOICES.get(voice_type, voice_type)
    logger.info(f"Generating voiceover (Edge TTS): {len(text)} chars, voice={voice}")
    if not output_path:
        filename = f"vo_edge_{int(time.time())}.mp3"
        output_path = str(config.TEMP_DIR / filename)

    async def _generate():
        communicate = edge_tts.Communicate(text, voice, rate=rate, volume=volume)
        await communicate.save(output_path)

    # Retry: Edge TTS occasionally drops the connection or writes empty files.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            asyncio.run(_generate())
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                logger.info(f"Edge TTS voiceover generated: {output_path}")
                return output_path
            # FIX: the empty-file case previously retried instantly and
            # silently; log it and back off like the exception path.
            logger.warning(f"Edge TTS attempt {attempt+1} produced an empty file")
        except Exception as e:
            logger.warning(f"Edge TTS attempt {attempt+1} failed: {e}")
        time.sleep(1.0)  # brief backoff before retrying
    logger.error("Edge TTS failed after retries.")
    return ""
def generate_voiceover_volcengine_ws(
    text: str,
    voice_type: str = "sweet_female",
    output_path: Optional[str] = None,
    timeout: int = 120
) -> str:
    """
    Generate TTS audio via the Volcengine WebSocket binary demo.

    Shells out to the official demo script under a dedicated virtualenv:
    /Volumes/Tony/video-flow/volcengine_binary_demo/.venv/bin/python
    NOTE(review): these are machine-specific absolute paths — consider
    moving them into config so the module works on other hosts.

    Args:
        text: Narration text.
        voice_type: Key into VOLC_TTS_VOICES, or a raw voice ID.
        output_path: Destination file path (auto-generated if omitted).
        timeout: Subprocess timeout in seconds.

    Returns:
        Path to the generated MP3, or "" on any failure (best-effort;
        never raises).
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS (ws)")
        return ""
    # Resolve preset to a concrete voice ID; unknown names pass through as-is.
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)
    venv_python = Path("/Volumes/Tony/video-flow/volcengine_binary_demo/.venv/bin/python")
    demo_script = Path("/Volumes/Tony/video-flow/volcengine_binary_demo/examples/volcengine/binary.py")
    if not venv_python.exists() or not demo_script.exists():
        logger.error("Volcengine WS demo or venv not found. Please install under volcengine_binary_demo/.venv")
        return ""
    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_ws_{int(time.time())}.mp3")
    cmd = [
        str(venv_python),
        str(demo_script),
        "--appid", config.VOLC_TTS_APPID,
        "--access_token", config.VOLC_TTS_ACCESS_TOKEN,
        "--voice_type", voice_id,
        "--text", text,
        "--encoding", "mp3",
    ]
    logger.info(f"Calling Volcengine WS TTS: voice={voice_id}, len={len(text)}")
    try:
        result = subprocess.run(
            cmd,
            cwd="/Volumes/Tony/video-flow/volcengine_binary_demo",
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if result.returncode != 0:
            logger.error(f"Volc WS TTS failed: {result.stderr}")
            return ""
        # The demo writes its output as <voice_type>.mp3 in its own cwd.
        demo_out = Path("/Volumes/Tony/video-flow/volcengine_binary_demo") / f"{voice_id}.mp3"
        if not demo_out.exists():
            logger.error("Volc WS TTS output not found")
            return ""
        # Copy the demo's output to the requested destination.
        Path(output_path).write_bytes(demo_out.read_bytes())
        logger.info(f"Volc WS TTS saved to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volc WS TTS error: {e}")
        return ""
def generate_voiceover_volcengine(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    volume_ratio: float = 1.0,
    pitch_ratio: float = 1.0,
    output_path: Optional[str] = None
) -> str:
    """
    Generate Chinese narration with Volcengine TTS.

    Tries the WebSocket binary demo first, then the HTTP endpoint, and
    finally falls back to Edge TTS.

    Args:
        text: Narration text.
        voice_type: Key into VOLC_TTS_VOICES, or a raw voice ID.
        speed_ratio: Speed, 0.5-2.0 (default 1.0).
        volume_ratio: Volume, 0.5-2.0 (default 1.0).
        pitch_ratio: Pitch, 0.5-2.0 (default 1.0).
        output_path: Output file path (auto-generated if omitted).

    Returns:
        Path to the generated audio file, or "" on total failure.
    """
    import uuid
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""

    def _edge_fallback() -> str:
        # FIX: this fallback was duplicated verbatim in three error paths.
        # Map Volc-only presets to a safe Edge default.
        fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
        return generate_voiceover_edge(text, fallback_voice, output_path=output_path)

    # Resolve preset to a voice ID (Volc preset table + raw-ID passthrough)
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)
    logger.info(f"Generating voiceover (Volcengine TTS): {len(text)} chars, voice={voice_id}")
    # Try the WebSocket binary path first (the verified-working official demo)
    ws_path = generate_voiceover_volcengine_ws(text, voice_type, output_path)
    if ws_path:
        return ws_path
    # WS path failed — fall back to the HTTP endpoint
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        # Volc TTS uses the non-standard "Bearer;<token>" auth format
        "Authorization": f"Bearer;{config.VOLC_TTS_ACCESS_TOKEN}"
    }
    payload = {
        "app": {
            "appid": config.VOLC_TTS_APPID,
            "token": config.VOLC_TTS_ACCESS_TOKEN,
            "cluster": "volcano_tts"
        },
        "user": {
            "uid": "video_flow_user"
        },
        "audio": {
            "voice_type": voice_id,
            "encoding": "mp3",
            "speed_ratio": speed_ratio,
            "volume_ratio": volume_ratio,
            "pitch_ratio": pitch_ratio
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "text_type": "plain",
            "operation": "query",
            "with_timestamp": "1",
            "extra_param": json.dumps({
                "disable_markdown_filter": False
            })
        }
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code != 200:
            logger.error(f"Volcengine TTS Error: {response.status_code} - {response.text}")
            return _edge_fallback()
        data = response.json()
        ret_code = data.get("code")
        # 3000 / 20000000 are success codes from different API generations
        if ret_code not in (0, 3000, 20000000):
            error_msg = data.get("message", "Unknown error")
            logger.error(f"Volcengine TTS Error: {error_msg}")
            return _edge_fallback()
        # Audio arrives base64-encoded in "data"
        audio_data = data.get("data", "")
        if not audio_data:
            raise ValueError("No audio data returned")
        if not output_path:
            filename = f"vo_volc_{int(time.time())}.mp3"
            output_path = str(config.TEMP_DIR / filename)
        with open(output_path, "wb") as f:
            f.write(base64.b64decode(audio_data))
        logger.info(f"Voiceover generated (HTTP): {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volcengine TTS HTTP error: {e}")
        return _edge_fallback()
def _split_text_into_chunks(text: str, max_chunk_length: int) -> List[str]:
    """Split *text* at sentence boundaries into chunks of at most
    max_chunk_length characters (a single over-long sentence may exceed it).
    """
    # Split on CJK/ASCII sentence terminators, keeping the delimiters
    sentences = re.split(r'([。!?;.!?;])', text)
    chunks: List[str] = []
    current_chunk = ""
    # Walk (sentence, delimiter) pairs, packing them greedily into chunks
    for i in range(0, len(sentences) - 1, 2):
        sentence = sentences[i] + sentences[i + 1]
        if len(current_chunk) + len(sentence) <= max_chunk_length:
            current_chunk += sentence
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    # re.split with a captured delimiter leaves a trailing un-terminated
    # fragment when the element count is odd; glue it onto the last chunk.
    if len(sentences) % 2 == 1 and sentences[-1]:
        if chunks:
            chunks[-1] += sentences[-1]
        else:
            chunks.append(sentences[-1])
    return chunks
def generate_voiceover_volcengine_long(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    output_path: Optional[str] = None,
    max_chunk_length: int = 300
) -> str:
    """
    Volcengine TTS for long text (automatic chunked synthesis).

    Text longer than max_chunk_length is split at sentence boundaries,
    synthesized chunk by chunk, and concatenated with FFmpeg.

    Args:
        text: Narration text.
        voice_type: Voice preset name or raw voice ID.
        speed_ratio: Speed, 0.5-2.0.
        output_path: Output file path (auto-generated if omitted).
        max_chunk_length: Maximum characters per TTS request.

    Returns:
        Path to the (merged) audio file.

    Raises:
        ValueError: if every chunk fails to synthesize.
    """
    if len(text) <= max_chunk_length:
        # Short enough for a single request
        return generate_voiceover_volcengine(
            text=text,
            voice_type=voice_type,
            speed_ratio=speed_ratio,
            output_path=output_path
        )
    logger.info(f"Long text ({len(text)} chars), splitting into chunks...")
    chunks = _split_text_into_chunks(text, max_chunk_length)
    logger.info(f"Split into {len(chunks)} chunks")
    # Synthesize each chunk; keep going when an individual chunk fails
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_path = str(config.TEMP_DIR / f"vo_chunk_{i}_{int(time.time())}.mp3")
        try:
            path = generate_voiceover_volcengine(
                text=chunk,
                voice_type=voice_type,
                speed_ratio=speed_ratio,
                output_path=chunk_path
            )
            # FIX: generate_voiceover_volcengine returns "" on total failure;
            # the original appended it anyway, corrupting the FFmpeg concat
            # list with a `file ''` entry. Keep only real paths.
            if path:
                chunk_files.append(path)
            else:
                logger.error(f"Chunk {i} produced no audio")
        except Exception as e:
            logger.error(f"Chunk {i} failed: {e}")
            # continue with the remaining chunks
    if not chunk_files:
        raise ValueError("All TTS chunks failed")
    # Single surviving chunk: nothing to merge
    if len(chunk_files) == 1:
        if output_path:
            import shutil
            shutil.move(chunk_files[0], output_path)
            return output_path
        return chunk_files[0]
    # Build the FFmpeg concat list file
    concat_list = config.TEMP_DIR / f"concat_audio_{os.getpid()}.txt"
    with open(concat_list, "w") as f:
        for cf in chunk_files:
            f.write(f"file '{cf}'\n")
    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_merged_{int(time.time())}.mp3")
    # Stream-copy concat (no re-encode); subprocess/re are module-level imports
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat",
        "-safe", "0",
        "-i", str(concat_list),
        "-c", "copy",
        output_path
    ]
    subprocess.run(cmd, capture_output=True, check=True)
    # Best-effort cleanup of intermediate files
    for cf in chunk_files:
        try:
            os.remove(cf)
        except OSError:  # FIX: was a bare except, which also swallows SystemExit etc.
            pass
    concat_list.unlink(missing_ok=True)
    logger.info(f"Merged voiceover: {output_path}")
    return output_path
def generate_scene_voiceovers_volcengine(
    scenes: List[Dict[str, Any]],
    voice_type: str = "sweet_female",
    output_dir: Optional[str] = None
) -> List[str]:
    """
    Generate a separate narration audio file for each scene.

    Args:
        scenes: Scene list; each scene may carry a "voiceover" text field.
        voice_type: Voice preset name.
        output_dir: Output directory (defaults to config.TEMP_DIR).

    Returns:
        Audio file paths aligned with *scenes*; "" for scenes with no
        narration or whose synthesis failed.
    """
    if output_dir:
        output_dir = Path(output_dir)
        # FIX: parents=True so nested paths (e.g. "out/run1/audio") work too
        output_dir.mkdir(parents=True, exist_ok=True)
    else:
        output_dir = config.TEMP_DIR
    audio_paths: List[str] = []
    for i, scene in enumerate(scenes):
        vo_text = scene.get("voiceover", "")
        # Skip empty narration and parenthesized stage directions like "(bgm)"
        if not vo_text or not vo_text.strip() or vo_text.startswith("("):
            audio_paths.append("")
            continue
        try:
            output_path = str(output_dir / f"scene_{i+1}_vo.mp3")
            path = generate_voiceover_volcengine(
                text=vo_text.strip(),
                voice_type=voice_type,
                output_path=output_path
            )
            audio_paths.append(path)
        except Exception as e:
            logger.error(f"Scene {i+1} voiceover failed: {e}")
            audio_paths.append("")
    return audio_paths