feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
This commit is contained in:
Tony Zhang
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions

14
modules/__init__.py Normal file
View File

@@ -0,0 +1,14 @@
"""
Gloda Video Factory - Modules Package
"""
__all__ = [
"utils",
"brain",
"factory",
"editor",
"ffmpeg_utils",
"fancy_text",
"composer"
]

81
modules/asr.py Normal file
View File

@@ -0,0 +1,81 @@
"""
MatchMe Studio - ASR Module (Whisper via ShuBiaoBiao)
"""
import logging
import subprocess
from pathlib import Path
from typing import Optional
from openai import OpenAI
import config
logger = logging.getLogger(__name__)
# OpenAI-compatible client pointed at the ShuBiaoBiao gateway, used as a
# Whisper transcription proxy (see transcribe()).
client = OpenAI(
    api_key=config.SHUBIAOBIAO_KEY,
    base_url=config.SHUBIAOBIAO_BASE_URL
)
def extract_audio_from_video(video_path: str) -> str:
    """Extract the audio track of *video_path* as 16 kHz mono MP3 via ffmpeg.

    Args:
        video_path: Path of the source video file.

    Returns:
        Path (str) of the extracted MP3 inside config.TEMP_DIR.

    Raises:
        RuntimeError: If ffmpeg exits non-zero (stderr is logged first).
    """
    video_path = Path(video_path)
    audio_path = config.TEMP_DIR / f"{video_path.stem}_audio.mp3"
    cmd = [
        "ffmpeg", "-y",
        "-i", str(video_path),
        "-vn",  # No video
        "-acodec", "libmp3lame",
        "-ar", "16000",  # 16kHz for Whisper
        "-ac", "1",  # Mono
        str(audio_path)
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
        logger.info(f"Audio extracted to {audio_path}")
        return str(audio_path)
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg error: {e.stderr.decode()}")
        # Chain the original error so the ffmpeg exit status/traceback
        # is preserved for callers instead of being swallowed.
        raise RuntimeError("Failed to extract audio from video") from e
def transcribe(audio_path: str) -> str:
    """Transcribe an audio file to Chinese text using the Whisper API."""
    logger.info(f"Transcribing {audio_path}...")
    try:
        with open(audio_path, "rb") as fh:
            raw = client.audio.transcriptions.create(
                model="whisper-1",
                file=fh,
                language="zh",  # Chinese
                response_format="text"
            )
        # Depending on SDK version the raw result is either a plain string
        # or an object carrying a .text attribute.
        transcript = raw if isinstance(raw, str) else raw.text
        logger.info(f"Transcription complete: {len(transcript)} chars")
        return transcript
    except Exception as e:
        logger.error(f"Whisper API error: {e}")
        raise
def transcribe_video(video_path: str) -> str:
    """Extract the audio track from a video and transcribe it.

    The intermediate MP3 created in config.TEMP_DIR is removed afterwards
    (best effort) so repeated runs do not accumulate temp files.

    Args:
        video_path: Path of the source video.

    Returns:
        The transcribed text.
    """
    audio_path = extract_audio_from_video(video_path)
    try:
        return transcribe(audio_path)
    finally:
        # Clean up the temp audio even when transcription fails.
        Path(audio_path).unlink(missing_ok=True)

346
modules/brain.py Normal file
View File

@@ -0,0 +1,346 @@
"""
MatchMe Studio - Brain Module (Multi-stage Analysis & Script Generation)
"""
import json
import logging
from typing import Dict, Any, List, Optional
from openai import OpenAI
import config
logger = logging.getLogger(__name__)
# Use Volcengine (Doubao) via its OpenAI-compatible interface; model ids are
# selected per call (VISION_MODEL_ID vs BRAIN_MODEL_ID).
client = OpenAI(
    api_key=config.VOLC_API_KEY,
    base_url=config.VOLC_BASE_URL
)
# ============================================================
# Stage 1: Analyze Materials
# ============================================================
# Stage-1 system prompt (Chinese). Instructs the vision model to analyze the
# user's materials and emit strict JSON with analysis / detected_info /
# missing_info / questions / ready fields. analyze_materials() json-parses
# the reply, so the prompt must keep demanding pure JSON output.
ANALYZE_SYSTEM_PROMPT = """你是一位资深短视频创作总监专精TikTok/抖音爆款内容。
任务:深度分析用户提供的素材和需求,识别产品特性、使用场景、目标人群。
分析维度:
1. 产品/服务核心卖点(从素材中提取视觉特征)
2. 视觉风格特征(颜色、质感、包装)
3. 潜在目标受众
4. 内容调性建议
然后检查是否缺少关键信息如果缺少生成2-5个问题帮助完善需求。
每个问题必须与短视频创作直接相关。
输出严格JSON格式
{
"analysis": "详细分析结果,包括从素材中识别到的视觉元素...",
"detected_info": {
"product": "识别到的产品名称和类型",
"visual_features": ["视觉特征1", "视觉特征2"],
"audience": "推测的目标人群",
"style": "推测的风格"
},
"missing_info": ["缺少的信息1", "缺少的信息2"],
"questions": [
{
"id": "q1",
"text": "问题文字(说明为什么这个问题重要)",
"options": ["选项A", "选项B", "选项C"],
"allow_multiple": true,
"allow_custom": true
}
],
"ready": false
}
如果信息足够ready=truequestions为空数组。
"""
def analyze_materials(
    prompt: str,
    image_urls: Optional[List[str]] = None,
    asr_text: str = ""
) -> Dict[str, Any]:
    """
    Deep analysis of user materials via the vision model.

    Args:
        prompt: The user's free-form requirement text.
        image_urls: Optional URLs of uploaded reference images.
        asr_text: Optional ASR transcript of a reference video.

    Returns:
        Parsed JSON dict (analysis, detected_info, missing_info,
        questions, ready) as specified by ANALYZE_SYSTEM_PROMPT.

    Raises:
        Exception: API or JSON-decoding errors, re-raised after logging.
    """
    logger.info("Brain: Analyzing materials...")
    # Vision-model input format: a content list mixing text parts and
    # image_url parts.
    content_parts = [{"type": "text", "text": f"用户需求: {prompt}"}]
    if asr_text:
        content_parts.append({"type": "text", "text": f"\n视频原声(ASR转写): {asr_text}"})
    if image_urls:
        content_parts.append({"type": "text", "text": "\n用户上传的素材图片(请仔细分析这些图片中的产品特征):"})
        for url in image_urls:
            content_parts.append({
                "type": "image_url",
                "image_url": {"url": url}
            })
    messages = [
        # Doubao follows the standard chat structure; if the system role were
        # ever rejected alongside images, prepend the prompt to user content.
        {"role": "system", "content": ANALYZE_SYSTEM_PROMPT},
        {"role": "user", "content": content_parts}
    ]
    try:
        # Use the vision model for analysis.
        response = client.chat.completions.create(
            model=config.VISION_MODEL_ID,
            messages=messages,
            temperature=0.7,
            max_tokens=4000
        )
        content = response.choices[0].message.content.strip()
        # Strip an optional markdown code fence (```json ... ```).
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()  # drop the newline left after the fence tag
        return json.loads(content)
    except Exception as e:
        logger.error(f"Brain Analyze Error: {e}")
        raise
# ============================================================
# Stage 2: Refine Brief with Answers
# ============================================================
# Stage-2 system prompt (Chinese). Merges the original request, the stage-1
# analysis and the user's answers into one creative brief; the reply must be
# strict JSON ({"brief": ..., "creative_summary": ..., "ready": true}),
# which refine_brief() json-parses.
REFINE_SYSTEM_PROMPT = """你是短视频创作总监。
根据原始需求、AI分析结果、用户补充回答整合为完整的创意简报。
注意用户选择的风格偏好如ASMR、剧情、视觉流等必须作为核心创作方向贯穿整个简报。
输出JSON:
{
"brief": {
"product": "产品名称",
"product_visual_description": "产品视觉描述(颜色、形状、包装、质感等,用于后续图片生成)",
"selling_points": ["卖点1", "卖点2"],
"target_audience": "目标人群",
"platform": "投放平台",
"style": "视频风格必须明确如ASMR/剧情/视觉流等)",
"style_requirements": "该风格的具体创作要求如ASMR需要开盖声、质感特写、无人脸等",
"creativity_level": "创意程度",
"reference": "对标账号/竞品",
"user_assets_description": "用户上传素材的描述(用于后续继承)"
},
"creative_summary": "整体创意概述50字以内描述这个视频的核心创意方向",
"ready": true
}
"""
def refine_brief(
    original_prompt: str,
    analysis: Dict[str, Any],
    answers: Dict[str, Any],
    image_urls: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Integrate user answers into a complete creative brief.

    Args:
        original_prompt: The user's original requirement text.
        analysis: Stage-1 result from analyze_materials().
        answers: The user's replies to the clarifying questions.
        image_urls: URLs of uploaded assets, recorded into the brief.

    Returns:
        Parsed JSON dict with "brief", "creative_summary" and "ready".

    Raises:
        Exception: API or JSON-decoding errors, re-raised after logging.
    """
    logger.info("Brain: Refining brief with answers...")
    user_content = f"""
原始需求: {original_prompt}
AI分析结果: {json.dumps(analysis, ensure_ascii=False)}
用户补充回答: {json.dumps(answers, ensure_ascii=False)}
用户上传的素材URL: {json.dumps(image_urls or [], ensure_ascii=False)}
"""
    try:
        # Pure text reasoning: the plain BRAIN model (Doubao Pro) suffices;
        # no new images are involved at this stage.
        response = client.chat.completions.create(
            model=config.BRAIN_MODEL_ID,
            messages=[
                {"role": "system", "content": REFINE_SYSTEM_PROMPT},
                {"role": "user", "content": user_content}
            ],
            temperature=0.5,
            max_tokens=3000
        )
        content = response.choices[0].message.content.strip()
        # Strip an optional ```json fence before parsing.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()
        return json.loads(content)
    except Exception as e:
        logger.error(f"Brain Refine Error: {e}")
        raise
# ============================================================
# Stage 3: Generate Script
# ============================================================
# Stage-3 system prompt (Chinese). Defines the full script schema
# (creative_summary, hook, scenes[], cta) and per-scene fields
# (image_prompt, keyframe, camera_movement, voiceover, sound_design, ...).
# Contains literal "{style}" placeholders that generate_script() fills via
# str.replace — NOT str.format, because the template also holds JSON braces.
SCRIPT_SYSTEM_PROMPT = """你是顶级短视频编导,专精{style}风格内容创作。
根据创意简报生成爆款脚本。必须严格遵循用户选择的风格要求。
脚本结构要求:
1. creative_summary: 整体创意概述(这条视频的核心创意是什么)
2. hook: 前3秒钩子设计必须抓眼球符合{style}风格)
3. scenes: 3-8个分镜
4. cta: 结尾行动号召(纯文本字符串)
每个分镜(scene)必须包含:
- id: 分镜编号
- duration: 时长(5/10/15秒符合视频模型参数)
- timeline: 时间轴 (如 "0:00-0:05")
- image_prompt: 【关键】用于AI生图的详细英文prompt必须包含
* 产品的具体视觉描述继承自brief中的product_visual_description
* 8k, hyper-realistic, cinematic lighting
* 色调、环境、构图、焦点
* 风格要求如ASMR需要macro shot, satisfying texture, no human face
- keyframe: {
"color_tone": "色调",
"environment": "环境/背景",
"foreground": "前景元素",
"focus": "视觉焦点",
"subject": "主体描述",
"composition": "构图方式"
}
- camera_movement: 运镜描述slow zoom in, pan left, static
- story_beat: 这个分镜在整体故事中的作用
- voiceover: 旁白文字({style}风格如ASMR应简短或无旁白用音效代替
- sound_design: 音效设计(如:开盖声、水滴声、环境白噪音)
- rhythm: {"change": "保持/加快/放慢", "multiplier": 1.0}
旁白要求:
- 必须连贯,形成完整的叙事
- 符合{style}风格ASMR风格应极简或无旁白
- 每句旁白要能独立成句,但连起来是完整故事
输出严格JSON格式。
"""
def generate_script(
    brief: Dict[str, Any],
    image_urls: Optional[List[str]] = None,
    regenerate_feedback: str = ""
) -> Dict[str, Any]:
    """
    Generate the complete video script (hook, scenes, cta) from a brief.

    Args:
        brief: Creative brief produced by refine_brief().
        image_urls: Optional reference images the image_prompts must match.
        regenerate_feedback: Optional user feedback for a full regeneration.

    Returns:
        Parsed JSON dict following the SCRIPT_SYSTEM_PROMPT schema.

    Raises:
        Exception: API or JSON-decoding errors, re-raised after logging.
    """
    logger.info("Brain: Generating script...")
    style = brief.get("style", "现代广告")
    # str.replace, not str.format: the template also contains JSON braces.
    system_prompt = SCRIPT_SYSTEM_PROMPT.replace("{style}", style)
    content_parts = [{"type": "text", "text": f"创意简报: {json.dumps(brief, ensure_ascii=False)}"}]
    if regenerate_feedback:
        content_parts.append({"type": "text", "text": f"\n用户反馈(请据此调整): {regenerate_feedback}"})
    if image_urls:
        content_parts.append({"type": "text", "text": "\n用户上传的参考素材生成的image_prompt必须参考这些素材中的产品外观:"})
        for url in image_urls:
            content_parts.append({
                "type": "image_url",
                "image_url": {"url": url}
            })
    try:
        response = client.chat.completions.create(
            model=config.VISION_MODEL_ID,  # Use Vision model to see reference images if available
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": content_parts}
            ],
            temperature=0.8,
            max_tokens=8000
        )
        content = response.choices[0].message.content.strip()
        # Strip an optional ```json fence before parsing.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()
        return json.loads(content)
    except Exception as e:
        logger.error(f"Brain Script Error: {e}")
        raise
# ============================================================
# Stage 4: Regenerate Single Scene
# ============================================================
def regenerate_scene(
    full_script: Dict[str, Any],
    scene_id: int,
    feedback: str,
    brief: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Regenerate a single scene based on user feedback.

    Args:
        full_script: The complete current script (for style continuity).
        scene_id: Id of the scene to regenerate.
        feedback: The user's change request.
        brief: Optional creative brief (provides the style).

    Returns:
        Parsed JSON dict for the new scene object only.

    Raises:
        Exception: API or JSON-decoding errors, re-raised after logging.
    """
    logger.info(f"Brain: Regenerating scene {scene_id}...")
    style = brief.get("style", "现代广告") if brief else "现代广告"
    system_prompt = f"""你是短视频编导,专精{style}风格。根据用户反馈重新生成指定分镜。
保持与其他分镜的风格连贯性。
image_prompt必须继承产品的视觉描述。
只输出新的scene对象(JSON)。
"""
    user_content = f"""
完整脚本: {json.dumps(full_script, ensure_ascii=False)}
创意简报: {json.dumps(brief, ensure_ascii=False) if brief else ""}
需要重新生成的分镜ID: {scene_id}
用户反馈: {feedback}
"""
    try:
        response = client.chat.completions.create(
            model=config.BRAIN_MODEL_ID,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content}
            ],
            temperature=0.8,
            max_tokens=2000
        )
        content = response.choices[0].message.content.strip()
        # Strip an optional ```json fence before parsing.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()
        return json.loads(content)
    except Exception as e:
        logger.error(f"Brain Regenerate Scene Error: {e}")
        raise

717
modules/composer.py Normal file
View File

@@ -0,0 +1,717 @@
"""
视频合成器模块
整合视频拼接、花字叠加、旁白配音的完整流程
"""
import os
import time
import logging
from pathlib import Path
from typing import Dict, Any, List, Optional, Union
import config
from modules import ffmpeg_utils, fancy_text, factory, storage
from modules.text_renderer import renderer
logger = logging.getLogger(__name__)
class VideoComposer:
    """Video composer.

    Orchestrates the full post-production pipeline: concatenating scene
    clips, burning in subtitles, overlaying fancy-text images, generating
    and mixing TTS voiceover, and adding background music.
    """

    def __init__(
        self,
        output_dir: str = None,
        target_size: tuple = (1080, 1920),
        voice_type: str = "sweet_female"
    ):
        """
        Initialize the composer.

        Args:
            output_dir: Output directory (defaults to config.OUTPUT_DIR).
            target_size: Target resolution as (width, height).
            voice_type: Default voiceover voice.
        """
        self.output_dir = Path(output_dir) if output_dir else config.OUTPUT_DIR
        # parents=True: a nested custom output dir would otherwise make
        # mkdir(exist_ok=True) raise FileNotFoundError.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.target_size = target_size
        self.voice_type = voice_type
        # Intermediate files registered here are removed by cleanup().
        self._temp_files = []

    def _add_temp(self, path: str):
        """Register a temporary file for later cleanup."""
        if path:
            self._temp_files.append(path)

    def cleanup(self):
        """Delete all registered temporary files (best effort)."""
        for f in self._temp_files:
            try:
                if os.path.exists(f):
                    os.remove(f)
            except Exception as e:
                logger.warning(f"Failed to cleanup {f}: {e}")
        self._temp_files = []

    def compose(
        self,
        video_paths: List[str],
        subtitles: List[Dict[str, Any]] = None,
        fancy_texts: List[Dict[str, Any]] = None,
        voiceover_text: str = None,
        voiceover_segments: List[Dict[str, Any]] = None,
        bgm_path: str = None,
        bgm_volume: float = 0.15,
        output_name: str = None,
        upload_to_r2: bool = False
    ) -> str:
        """
        Full composition pipeline.

        Args:
            video_paths: Paths of the scene clips, in order.
            subtitles: Subtitle configs [{text, start, duration, style}].
            fancy_texts: Fancy-text configs [{text, style, x, y, start, duration}].
            voiceover_text: Full voiceover text (TTS generated and mixed in).
            voiceover_segments: Segmented voiceover [{text, start}];
                mutually exclusive with voiceover_text.
            bgm_path: Background-music file path.
            bgm_volume: BGM volume.
            output_name: Output file name (without extension).
            upload_to_r2: Whether to upload the result to R2 storage.

        Returns:
            Final video path, or the R2 URL when upload_to_r2 is True.

        Raises:
            ValueError: If video_paths is empty.
        """
        if not video_paths:
            raise ValueError("No video paths provided")
        timestamp = int(time.time())
        output_name = output_name or f"composed_{timestamp}"
        logger.info(f"Starting composition: {len(video_paths)} videos")
        try:
            # Step 1: concatenate the clips.
            merged_path = str(config.TEMP_DIR / f"{output_name}_merged.mp4")
            ffmpeg_utils.concat_videos(video_paths, merged_path, self.target_size)
            self._add_temp(merged_path)
            current_video = merged_path
            # Step 1.1: add a silent audio bed if there is no audio track,
            # so later filters referencing 0:a do not fail.
            silent_path = str(config.TEMP_DIR / f"{output_name}_silent.mp4")
            ffmpeg_utils.add_silence_audio(current_video, silent_path)
            self._add_temp(silent_path)
            current_video = silent_path
            # Step 2: burn in subtitles (white text, black outline, no box,
            # centered in the lower area).
            if subtitles:
                subtitled_path = str(config.TEMP_DIR / f"{output_name}_subtitled.mp4")
                subtitle_style = {
                    "font": ffmpeg_utils._get_font_path(),
                    "fontsize": 60,
                    "fontcolor": "white",
                    "borderw": 5,
                    "bordercolor": "black",
                    "box": 0,  # no background box
                    "y": "h-200",  # lower area, centered
                }
                ffmpeg_utils.add_multiple_subtitles(
                    current_video, subtitles, subtitled_path, default_style=subtitle_style
                )
                self._add_temp(subtitled_path)
                current_video = subtitled_path
            # Step 3: overlay fancy text (supports atomic style params).
            if fancy_texts:
                overlay_configs = []
                for ft in fancy_texts:
                    text = ft.get("text", "")
                    style = ft.get("style")
                    custom_style = ft.get("custom_style")
                    # A dict style means atomic params: render directly.
                    if isinstance(style, dict):
                        img_path = renderer.render(text, style, cache=False)
                    elif custom_style and isinstance(custom_style, dict):
                        # Legacy compatibility: try the atomic renderer when
                        # the custom style carries atomic keys.
                        if "font_size" in custom_style:
                            img_path = renderer.render(text, custom_style, cache=False)
                        else:
                            # Fall back to the old fancy_text module.
                            img_path = fancy_text.create_fancy_text(
                                text=text,
                                style=style if isinstance(style, str) else "subtitle",
                                custom_style={
                                    **(custom_style or {}),
                                    "font_name": "/System/Library/Fonts/PingFang.ttc",
                                },
                                cache=False
                            )
                    else:
                        # Legacy path.
                        img_path = fancy_text.create_fancy_text(
                            text=text,
                            style=style if isinstance(style, str) else "subtitle",
                            custom_style={
                                "font_name": "/System/Library/Fonts/PingFang.ttc",
                            },
                            cache=False
                        )
                    overlay_configs.append({
                        "path": img_path,
                        "x": ft.get("x", "(W-w)/2"),
                        "y": ft.get("y", "(H-h)/2"),
                        "start": ft.get("start", 0),
                        "duration": ft.get("duration", 999)
                    })
                fancy_path = str(config.TEMP_DIR / f"{output_name}_fancy.mp4")
                ffmpeg_utils.overlay_multiple_images(
                    current_video, overlay_configs, fancy_path
                )
                self._add_temp(fancy_path)
                current_video = fancy_path
            # Step 4: generate and mix the voiceover (Volcengine WS first;
            # factory falls back to Edge TTS on failure).
            if voiceover_text:
                vo_path = factory.generate_voiceover_volcengine(
                    text=voiceover_text,
                    voice_type=self.voice_type
                )
                self._add_temp(vo_path)
                voiced_path = str(config.TEMP_DIR / f"{output_name}_voiced.mp4")
                ffmpeg_utils.mix_audio(
                    current_video, vo_path, voiced_path,
                    audio_volume=1.5,
                    video_volume=0.2
                )
                self._add_temp(voiced_path)
                current_video = voiced_path
            elif voiceover_segments:
                current_video = self._add_segmented_voiceover(
                    current_video, voiceover_segments, output_name
                )
            # Step 5: add BGM with fades (if ducking fails ffmpeg_utils
            # falls back to plain low-volume mixing).
            if bgm_path:
                bgm_output = str(config.TEMP_DIR / f"{output_name}_bgm.mp4")
                ffmpeg_utils.add_bgm(
                    current_video, bgm_path, bgm_output,
                    bgm_volume=bgm_volume,
                    ducking=False,  # disabled for compatibility; keep BGM quiet instead
                    duck_gain_db=-6.0,
                    fade_in=1.0,
                    fade_out=1.0
                )
                self._add_temp(bgm_output)
                current_video = bgm_output
            # Step 6: copy the final file into the output directory.
            final_path = str(self.output_dir / f"{output_name}.mp4")
            import shutil
            shutil.copy(current_video, final_path)
            logger.info(f"Composition complete: {final_path}")
            # Optional upload to R2 object storage.
            if upload_to_r2:
                r2_url = storage.upload_file(final_path)
                logger.info(f"Uploaded to R2: {r2_url}")
                return r2_url
            return final_path
        finally:
            # Remove intermediates (the copied final output is kept).
            self.cleanup()

    def _add_segmented_voiceover(
        self,
        video_path: str,
        segments: List[Dict[str, Any]],
        output_name: str
    ) -> str:
        """Mix per-segment TTS voiceovers into the video at their offsets."""
        if not segments:
            return video_path
        # Generate one TTS clip per segment.
        audio_files = []
        for i, seg in enumerate(segments):
            text = seg.get("text", "")
            if not text:
                continue
            voice = seg.get("voice_type", self.voice_type)
            audio_path = factory.generate_voiceover_volcengine(
                text=text,
                voice_type=voice,
                output_path=str(config.TEMP_DIR / f"{output_name}_seg_{i}.mp3")
            )
            if audio_path:
                audio_files.append({
                    "path": audio_path,
                    "start": seg.get("start", 0)
                })
                self._add_temp(audio_path)
        if not audio_files:
            return video_path
        # Mix the clips in one at a time.
        current = video_path
        for i, af in enumerate(audio_files):
            output = str(config.TEMP_DIR / f"{output_name}_seg_mixed_{i}.mp4")
            ffmpeg_utils.mix_audio(
                current, af["path"], output,
                audio_volume=1.0,
                video_volume=0.2 if i == 0 else 1.0,  # duck the original audio only once
                audio_start=af["start"]
            )
            self._add_temp(output)
            current = output
        return current

    def compose_from_script(
        self,
        script: Dict[str, Any],
        video_map: Dict[int, str],
        bgm_path: str = None,
        output_name: str = None
    ) -> str:
        """
        Compose from a generated script plus a scene-id -> video-path map.

        Args:
            script: Normalized scene script (scenes, voiceover_timeline).
            video_map: Mapping of scene id to rendered clip path.
            bgm_path: Optional BGM path.
            output_name: Output file name (without extension).

        Returns:
            Path of the final composed video.

        Raises:
            ValueError: If the script has no scenes.
        """
        scenes = script.get("scenes", [])
        if not scenes:
            raise ValueError("Empty script")
        video_paths = []
        fancy_texts = []
        # 1. Collect clip paths and fancy-text overlays in scene order.
        total_duration = 0.0
        for scene in scenes:
            scene_id = scene["id"]
            video_path = video_map.get(scene_id)
            if not video_path or not os.path.exists(video_path):
                logger.warning(f"Missing video for scene {scene_id}, skipping")
                continue
            # Probe the real clip duration; fall back to 5s on probe failure.
            try:
                info = ffmpeg_utils.get_video_info(video_path)
                duration = float(info.get("duration", 5.0))
            except Exception:  # was a bare except; keep SystemExit/KeyboardInterrupt raisable
                duration = 5.0
            video_paths.append(video_path)
            # Fancy text: white, black outline, no box, fixed upper-center.
            if "fancy_text" in scene:
                ft = scene["fancy_text"]
                if isinstance(ft, dict):
                    text = ft.get("text", "")
                    if text:
                        # Fixed style: white text with black outline, no box.
                        fixed_style = {
                            "font_size": 72,
                            "font_color": "#FFFFFF",
                            "stroke": {"color": "#000000", "width": 5}
                            # no "background" key -> no box
                        }
                        fancy_texts.append({
                            "text": text,
                            "style": fixed_style,
                            "x": "(W-w)/2",  # centered
                            "y": "180",  # upper area
                            "start": total_duration + float(ft.get("start_time", 0)),
                            "duration": float(ft.get("duration", duration))
                        })
            total_duration += duration
        # 2. Concatenate the clips.
        timestamp = int(time.time())
        output_name = output_name or f"composed_{timestamp}"
        merged_path = str(config.TEMP_DIR / f"{output_name}_merged.mp4")
        ffmpeg_utils.concat_videos(video_paths, merged_path, self.target_size)
        self._add_temp(merged_path)
        current_video = merged_path
        # 3. Build the global voiceover timeline.
        voiceover_timeline = script.get("voiceover_timeline", [])
        mixed_audio_path = str(config.TEMP_DIR / f"{output_name}_mixed_vo.mp3")
        # Start from a silent bed track of length total_duration.
        ffmpeg_utils._run_ffmpeg([
            ffmpeg_utils.FFMPEG_PATH, "-y",
            "-f", "lavfi", "-i", "anullsrc=r=44100:cl=stereo",
            "-t", str(total_duration),
            "-c:a", "mp3",
            mixed_audio_path
        ])
        self._add_temp(mixed_audio_path)
        subtitles = []
        if voiceover_timeline:
            for i, item in enumerate(voiceover_timeline):
                text = item.get("text", "")
                sub_text = item.get("subtitle", text)
                # Two supported item formats:
                #   new: start_time / duration in absolute seconds
                #   old: start_ratio / duration_ratio in [0, 1] of total length
                if "start_time" in item:
                    target_start = float(item.get("start_time", 0))
                    target_duration = float(item.get("duration", 3))
                else:
                    start_ratio = float(item.get("start_ratio", 0))
                    duration_ratio = float(item.get("duration_ratio", 0))
                    target_start = start_ratio * total_duration
                    target_duration = duration_ratio * total_duration
                if not text:
                    continue
                # Generate TTS for this line.
                tts_path = factory.generate_voiceover_volcengine(
                    text=text,
                    voice_type=self.voice_type,
                    output_path=str(config.TEMP_DIR / f"{output_name}_vo_{i}.mp3")
                )
                self._add_temp(tts_path)
                # Stretch/compress to the target duration.
                adjusted_path = str(config.TEMP_DIR / f"{output_name}_vo_adj_{i}.mp3")
                ffmpeg_utils.adjust_audio_duration(tts_path, target_duration, adjusted_path)
                self._add_temp(adjusted_path)
                # Mix into the running master track at the target offset.
                new_mixed = str(config.TEMP_DIR / f"{output_name}_mixed_{i}.mp3")
                ffmpeg_utils.mix_audio_at_offset(mixed_audio_path, adjusted_path, target_start, new_mixed)
                mixed_audio_path = new_mixed  # update current mixed path
                self._add_temp(new_mixed)
                # Subtitle entry, fully synced with the voiceover.
                subtitles.append({
                    "text": ffmpeg_utils.wrap_text_smart(sub_text),
                    "start": target_start,
                    "duration": target_duration,
                    "style": {}  # default
                })
        # 4. Mix the assembled voiceover into the video.
        voiced_path = str(config.TEMP_DIR / f"{output_name}_voiced.mp4")
        ffmpeg_utils.mix_audio(
            current_video, mixed_audio_path, voiced_path,
            audio_volume=1.5,
            video_volume=0.2  # duck the original audio
        )
        self._add_temp(voiced_path)
        current_video = voiced_path
        # 5. Burn in the subtitles.
        if subtitles:
            subtitled_path = str(config.TEMP_DIR / f"{output_name}_subtitled.mp4")
            subtitle_style = {
                "font": ffmpeg_utils._get_font_path(),
                "fontsize": 60,
                "fontcolor": "white",
                "borderw": 5,
                "bordercolor": "black",
                "box": 0,  # no background box
                "y": "h-200",  # lower area, centered
            }
            ffmpeg_utils.add_multiple_subtitles(
                current_video, subtitles, subtitled_path, default_style=subtitle_style
            )
            self._add_temp(subtitled_path)
            current_video = subtitled_path
        # 6. Overlay fancy text.
        if fancy_texts:
            fancy_path = str(config.TEMP_DIR / f"{output_name}_fancy.mp4")
            overlay_configs = []
            for ft in fancy_texts:
                # Render the fancy-text image.
                img_path = renderer.render(ft["text"], ft["style"], cache=False)
                overlay_configs.append({
                    "path": img_path,
                    "x": ft["x"],
                    "y": ft["y"],
                    "start": ft["start"],
                    "duration": ft["duration"]
                })
            ffmpeg_utils.overlay_multiple_images(
                current_video, overlay_configs, fancy_path
            )
            self._add_temp(fancy_path)
            current_video = fancy_path
        # 7. Add BGM.
        if bgm_path:
            bgm_output = str(config.TEMP_DIR / f"{output_name}_bgm.mp4")
            ffmpeg_utils.add_bgm(
                current_video, bgm_path, bgm_output,
                bgm_volume=0.15
            )
            self._add_temp(bgm_output)
            current_video = bgm_output
        # 8. Copy the final file to the output directory.
        final_path = str(self.output_dir / f"{output_name}.mp4")
        import shutil
        shutil.copy(current_video, final_path)
        logger.info(f"Composition complete: {final_path}")
        self.cleanup()
        return final_path

    def compose_standard_task(self, task_config: Dict[str, Any]) -> str:
        """
        Execute a standard composition task config (legacy format).

        The timeline items may omit "type"; it is inferred from the keys
        present on each item.
        """
        settings = task_config.get("settings", {})
        self.voice_type = settings.get("voice_type", self.voice_type)
        # 1. Collect clip paths.
        video_paths = []
        for seg in task_config.get("segments", []):
            path = seg.get("path") or seg.get("video_path")
            if not path:
                continue
            video_paths.append(path)
        # 2. Parse the timeline, inferring the item type when missing.
        subtitles = []
        fancy_texts = []
        voiceover_segments = []
        for item in task_config.get("timeline", []):
            itype = item.get("type")
            if not itype:
                if "text" in item and ("style" in item or "x" in item or "y" in item):
                    itype = "fancy_text"
                elif "text" in item and "duration" in item and "start" in item:
                    itype = "subtitle"
                elif "text" in item and "start" in item:
                    itype = "voiceover"
                else:
                    continue
            if itype == "subtitle":
                subtitles.append(item)
            elif itype == "fancy_text":
                if "x" not in item and "position" in item:
                    item["x"] = item["position"].get("x")
                    item["y"] = item["position"].get("y")
                fancy_texts.append(item)
            elif itype == "voiceover":
                voiceover_segments.append(item)
        return self.compose(
            video_paths=video_paths,
            subtitles=subtitles,
            fancy_texts=fancy_texts,
            voiceover_segments=voiceover_segments,
            bgm_path=settings.get("bgm_path"),
            bgm_volume=settings.get("bgm_volume", 0.06),
            output_name=settings.get("output_name"),
            upload_to_r2=settings.get("upload_to_r2", False)
        )
def compose_product_video(
    video_paths: List[str],
    subtitle_configs: List[Dict[str, Any]] = None,
    fancy_text_configs: List[Dict[str, Any]] = None,
    voiceover_text: str = None,
    bgm_path: str = None,
    output_path: str = None,
    voice_type: str = "sweet_female"
) -> str:
    """Convenience wrapper: compose a product short video.

    Args mirror VideoComposer.compose(); when output_path is given, its
    directory and stem override the composer's defaults.

    Returns:
        Path of the composed video.
    """
    composer = VideoComposer(voice_type=voice_type)
    output_name = None
    if output_path:
        output_name = Path(output_path).stem
        composer.output_dir = Path(output_path).parent
        # Assigning output_dir after __init__ bypasses its mkdir; create the
        # directory here so the final copy does not fail on a missing folder.
        composer.output_dir.mkdir(parents=True, exist_ok=True)
    return composer.compose(
        video_paths=video_paths,
        subtitles=subtitle_configs,
        fancy_texts=fancy_text_configs,
        voiceover_text=voiceover_text,
        bgm_path=bgm_path,
        output_name=output_name
    )
def quick_compose(
    video_folder: str,
    script: List[Dict[str, Any]],
    output_path: str = None,
    voice_type: str = "sweet_female",
    bgm_path: str = None
) -> str:
    """Quick composition: read clips from a folder, pair them with a script.

    Each script item may name its clip via "video"; otherwise clips are
    taken from the folder in sorted order. Items contribute optional
    "subtitle", "fancy_text" and "voiceover" entries on a running timeline.

    Returns:
        Path of the composed video.
    """
    folder = Path(video_folder)
    video_files = sorted([
        f for f in folder.iterdir()
        if f.suffix.lower() in ['.mp4', '.mov', '.avi', '.mkv']
    ])
    video_paths = []
    subtitles = []
    fancy_texts = []
    voiceovers = []
    current_time = 0
    for i, item in enumerate(script):
        if "video" in item:
            vp = folder / item["video"]
        elif i < len(video_files):
            vp = video_files[i]
        else:
            logger.warning(f"No video for script item {i}")
            continue
        video_paths.append(str(vp))
        # Probe the real clip duration; ffprobe may report it as a string,
        # so coerce to float for the timeline arithmetic below. Fall back
        # to the scripted duration on any probe/parse failure.
        try:
            info = ffmpeg_utils.get_video_info(str(vp))
            duration = float(info.get("duration", 5))
        except Exception:  # was a bare except
            duration = float(item.get("duration", 5))
        if "subtitle" in item:
            subtitles.append({
                "text": item["subtitle"],
                "start": current_time,
                "duration": duration,
                "style": item.get("subtitle_style", {})
            })
        if "fancy_text" in item:
            ft = item["fancy_text"]
            if isinstance(ft, str):
                ft = {"text": ft}
            fancy_texts.append({
                "text": ft.get("text", ""),
                "style": ft.get("style", "highlight"),
                "custom_style": ft.get("custom_style"),
                "x": ft.get("x", "(W-w)/2"),
                "y": ft.get("y", 200),
                "start": current_time,
                "duration": duration
            })
        if "voiceover" in item:
            voiceovers.append(item["voiceover"])
        current_time += duration
    voiceover_text = "".join(voiceovers) if voiceovers else None
    return compose_product_video(
        video_paths=video_paths,
        subtitle_configs=subtitles if subtitles else None,
        fancy_text_configs=fancy_texts if fancy_texts else None,
        voiceover_text=voiceover_text,
        bgm_path=bgm_path,
        output_path=output_path,
        voice_type=voice_type
    )
# ============================================================
# Example usage
# ============================================================
def example_hairclip_video():
    """Example: compose the hair-clip product video end to end.

    quick_compose() picks up the five scene clips from the asset folder in
    sorted order, so no explicit clip list is needed here (the original
    built one but never used it).

    Returns:
        Path of the composed video.
    """
    assets_dir = Path("/Volumes/Tony/video-flow/素材/发夹/合成图拆分镜")
    script = [
        {
            "subtitle": "塌马尾 vs 高颅顶",
            "fancy_text": {
                "text": "塌马尾 vs 高颅顶",
                "style": "comparison",
                "y": 150
            },
            "voiceover": "普通马尾和高颅顶马尾的区别,你看出来了吗",
        },
        {
            "subtitle": "3秒出门无需皮筋",
            "fancy_text": {"text": "发量+50%", "style": "bubble", "y": 300},
            "voiceover": "只需要三秒钟,不需要皮筋,发量瞬间增加百分之五十",
        },
        {
            "subtitle": "发量+50%",
            "voiceover": "蓬松的高颅顶效果,让你瞬间变美",
        },
        {
            "subtitle": "狂甩不掉!",
            "fancy_text": {"text": "狂甩不掉!", "style": "warning", "y": 400},
            "voiceover": "而且超级牢固,怎么甩都不会掉",
        },
        {
            "subtitle": "¥3.99 立即抢购",
            "fancy_text": {"text": "3.99", "style": "price", "y": 500},
            "voiceover": "只要三块九毛九,点击下方链接立即购买",
        },
    ]
    output = quick_compose(
        video_folder=str(assets_dir),
        script=script,
        output_path="/Volumes/Tony/video-flow/output/发夹_合成视频.mp4",
        voice_type="sweet_female"
    )
    print(f"视频合成完成: {output}")
    return output
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
example_hairclip_video()

305
modules/db_manager.py Normal file
View File

@@ -0,0 +1,305 @@
"""
数据库管理模块 (SQLAlchemy)
负责项目数据、任务状态、素材路径的持久化存储
支持 SQLite 和 PostgreSQL
"""
import json
import logging
import time
from typing import Dict, List, Any, Optional
from sqlalchemy import create_engine, Column, String, Integer, Text, Float, UniqueConstraint, func
from sqlalchemy.orm import sessionmaker, scoped_session, declarative_base
from sqlalchemy.dialects.postgresql import JSONB
import config
logger = logging.getLogger(__name__)
# Declarative base shared by all ORM models in this module.
Base = declarative_base()
class Project(Base):
    """ORM model: one video-generation project and its lifecycle state."""
    __tablename__ = 'projects'
    id = Column(String, primary_key=True)
    name = Column(String)
    status = Column(String)  # created, script_generated, images_generated, videos_generated, completed
    product_info = Column(Text)  # JSON string (SQLite) or JSONB (PG - using Text for compat)
    script_data = Column(Text)  # JSON string
    created_at = Column(Float, default=time.time)
    updated_at = Column(Float, default=time.time, onupdate=time.time)
class SceneAsset(Base):
    """ORM model: a generated asset (image/video) for one scene of a project.

    Uniqueness per (project_id, scene_id, asset_type) enables UPSERT updates.
    """
    __tablename__ = 'scene_assets'
    id = Column(Integer, primary_key=True, autoincrement=True)
    project_id = Column(String, index=True)
    scene_id = Column(Integer)
    asset_type = Column(String)  # image, video
    status = Column(String)  # pending, processing, completed, failed
    local_path = Column(Text, nullable=True)
    remote_url = Column(Text, nullable=True)
    task_id = Column(String, nullable=True)  # task id issued by the external API
    metadata_json = Column("metadata", Text, nullable=True)  # JSON string (renamed to avoid conflict with metadata attr)
    created_at = Column(Float, default=time.time)
    updated_at = Column(Float, default=time.time, onupdate=time.time)
    __table_args__ = (UniqueConstraint('project_id', 'scene_id', 'asset_type', name='uix_project_scene_asset'),)
class AppConfig(Base):
    """ORM model: one key/value application setting (value stored as JSON)."""
    __tablename__ = 'app_config'
    key = Column(String, primary_key=True)
    value = Column(Text)  # JSON string
    description = Column(Text, nullable=True)
    updated_at = Column(Float, default=time.time, onupdate=time.time)
class DBManager:
    def __init__(self, connection_string: str = None):
        """Create the engine/session factory and ensure tables exist.

        Args:
            connection_string: SQLAlchemy URL; defaults to
                config.DB_CONNECTION_STRING (SQLite or PostgreSQL).
        """
        if not connection_string:
            connection_string = config.DB_CONNECTION_STRING
        # pool_recycle guards against stale pooled connections.
        self.engine = create_engine(connection_string, pool_recycle=3600)
        self.Session = scoped_session(sessionmaker(bind=self.engine))
        self._init_db()
    def _init_db(self):
        """Create all tables declared on Base (no-op if they already exist)."""
        Base.metadata.create_all(self.engine)
    def _get_session(self):
        # scoped_session: returns the thread-local session instance.
        return self.Session()
    # --- Project Operations ---
    def create_project(self, project_id: str, name: str, product_info: Dict[str, Any]):
        """Insert a new project row; no-op (with a warning) if the id exists.

        Args:
            project_id: Caller-supplied unique id.
            name: Display name.
            product_info: Arbitrary product dict, stored as JSON text.

        Raises:
            Exception: Re-raised after rollback on any DB error.
        """
        session = self._get_session()
        try:
            # Check if exists
            existing = session.query(Project).filter_by(id=project_id).first()
            if existing:
                logger.warning(f"Project {project_id} already exists.")
                return
            new_project = Project(
                id=project_id,
                name=name,
                status="created",
                product_info=json.dumps(product_info, ensure_ascii=False),
                created_at=time.time(),
                updated_at=time.time()
            )
            session.add(new_project)
            session.commit()
        except Exception as e:
            session.rollback()
            logger.error(f"Error creating project: {e}")
            raise
        finally:
            session.close()
    def update_project_script(self, project_id: str, script: Dict[str, Any]):
        """Store the generated script JSON and advance status to script_generated.

        Best-effort: DB errors are rolled back and logged, not raised.
        Silently does nothing when the project id is unknown.
        """
        session = self._get_session()
        try:
            project = session.query(Project).filter_by(id=project_id).first()
            if project:
                project.script_data = json.dumps(script, ensure_ascii=False)
                project.status = "script_generated"
                project.updated_at = time.time()
                session.commit()
        except Exception as e:
            session.rollback()
            logger.error(f"Error updating script: {e}")
        finally:
            session.close()
    def update_project_status(self, project_id: str, status: str):
        """Set a project's lifecycle status (see Project.status values).

        Best-effort: DB errors are rolled back and logged, not raised.
        """
        session = self._get_session()
        try:
            project = session.query(Project).filter_by(id=project_id).first()
            if project:
                project.status = status
                project.updated_at = time.time()
                session.commit()
        except Exception as e:
            session.rollback()
            logger.error(f"Error updating status: {e}")
        finally:
            session.close()
def get_project(self, project_id: str) -> Optional[Dict[str, Any]]:
session = self._get_session()
try:
project = session.query(Project).filter_by(id=project_id).first()
if project:
data = {
"id": project.id,
"name": project.name,
"status": project.status,
"product_info": json.loads(project.product_info) if project.product_info else {},
"script_data": json.loads(project.script_data) if project.script_data else None,
"created_at": project.created_at,
"updated_at": project.updated_at
}
return data
return None
finally:
session.close()
def list_projects(self) -> List[Dict[str, Any]]:
session = self._get_session()
try:
projects = session.query(Project).order_by(Project.updated_at.desc()).all()
results = []
for p in projects:
results.append({
"id": p.id,
"name": p.name,
"status": p.status,
"updated_at": p.updated_at
})
return results
finally:
session.close()
# --- Asset/Task Operations ---
def save_asset(self, project_id: str, scene_id: int, asset_type: str,
               status: str, local_path: str = None, remote_url: str = None,
               task_id: str = None, metadata: Dict = None):
    """Insert or update (UPSERT) the asset record for one scene/type pair.

    The record is keyed by (project_id, scene_id, asset_type). All mutable
    fields are overwritten on update; ``metadata`` is JSON-encoded and
    falls back to "{}" when empty. DB errors are rolled back and logged.
    """
    session = self._get_session()
    try:
        meta_json = json.dumps(metadata, ensure_ascii=False) if metadata else "{}"
        now = time.time()
        row = session.query(SceneAsset).filter_by(
            project_id=project_id,
            scene_id=scene_id,
            asset_type=asset_type
        ).first()
        if row is None:
            session.add(SceneAsset(
                project_id=project_id,
                scene_id=scene_id,
                asset_type=asset_type,
                status=status,
                local_path=local_path,
                remote_url=remote_url,
                task_id=task_id,
                metadata_json=meta_json,
                created_at=now,
                updated_at=now,
            ))
        else:
            row.status = status
            row.local_path = local_path
            row.remote_url = remote_url
            row.task_id = task_id
            row.metadata_json = meta_json
            row.updated_at = now
        session.commit()
    except Exception as e:
        session.rollback()
        logger.error(f"Error saving asset: {e}")
    finally:
        session.close()
def get_assets(self, project_id: str, asset_type: str = None) -> List[Dict[str, Any]]:
    """List asset records for a project, optionally filtered by type.

    ``metadata_json`` is decoded into a dict (``{}`` when empty).
    """
    session = self._get_session()
    try:
        query = session.query(SceneAsset).filter_by(project_id=project_id)
        if asset_type:
            query = query.filter_by(asset_type=asset_type)
        return [
            {
                "id": a.id,
                "project_id": a.project_id,
                "scene_id": a.scene_id,
                "asset_type": a.asset_type,
                "status": a.status,
                "local_path": a.local_path,
                "remote_url": a.remote_url,
                "task_id": a.task_id,
                "metadata": json.loads(a.metadata_json) if a.metadata_json else {},
                "updated_at": a.updated_at,
            }
            for a in query.all()
        ]
    finally:
        session.close()
def get_asset(self, project_id: str, scene_id: int, asset_type: str) -> Optional[Dict[str, Any]]:
    """Fetch one asset record by (project, scene, type), or None."""
    session = self._get_session()
    try:
        row = session.query(SceneAsset).filter_by(
            project_id=project_id,
            scene_id=scene_id,
            asset_type=asset_type
        ).first()
        if row is None:
            return None
        return {
            "id": row.id,
            "project_id": row.project_id,
            "scene_id": row.scene_id,
            "asset_type": row.asset_type,
            "status": row.status,
            "local_path": row.local_path,
            "remote_url": row.remote_url,
            "task_id": row.task_id,
            "metadata": json.loads(row.metadata_json) if row.metadata_json else {},
            "updated_at": row.updated_at,
        }
    finally:
        session.close()
# --- Config/Prompt Operations ---
def get_config(self, key: str, default: Any = None) -> Any:
    """Fetch a config value by key.

    Values are stored JSON-encoded; a stored string that is not valid JSON
    (legacy plain-text values) is returned verbatim. Returns ``default``
    when the key does not exist.
    """
    session = self._get_session()
    try:
        cfg = session.query(AppConfig).filter_by(key=key).first()
        if cfg is None:
            return default
        try:
            return json.loads(cfg.value)
        # json.loads raises JSONDecodeError (a ValueError) on bad JSON and
        # TypeError on non-string input; the old bare `except:` also caught
        # KeyboardInterrupt/SystemExit, which must propagate.
        except (ValueError, TypeError):
            return cfg.value
    finally:
        session.close()
def set_config(self, key: str, value: Any, description: str = None):
    """Create or update a config entry; the value is stored JSON-encoded.

    On update the description is only overwritten when a new one is
    provided. DB errors are rolled back and logged rather than raised.
    """
    session = self._get_session()
    try:
        encoded = json.dumps(value, ensure_ascii=False)
        row = session.query(AppConfig).filter_by(key=key).first()
        if row is None:
            session.add(AppConfig(
                key=key,
                value=encoded,
                description=description,
                updated_at=time.time()
            ))
        else:
            row.value = encoded
            if description:
                row.description = description
            row.updated_at = time.time()
        session.commit()
    except Exception as e:
        session.rollback()
        logger.error(f"Error setting config: {e}")
    finally:
        session.close()
# Module-level singleton instance: import `db` from this module instead of
# constructing additional DBManager objects.
db = DBManager()

269
modules/editor.py Normal file
View File

@@ -0,0 +1,269 @@
"""
MatchMe Studio - Editor Module (Assembly + BGM)
"""
import logging
import requests
from pathlib import Path
from typing import Dict, Any, List, Optional
from moviepy.editor import (
VideoFileClip, AudioFileClip, TextClip,
CompositeVideoClip, CompositeAudioClip,
concatenate_videoclips
)
import config
from modules import storage
logger = logging.getLogger(__name__)
# ============================================================
# Video Assembly
# ============================================================
def download_video(url: str) -> str:
    """Download a remote media file into TEMP_DIR and return its local path.

    Despite the name this works for any file type (callers also use it for
    audio). The filename is derived from the URL's last path segment.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    filename = f"dl_{Path(url).name}"
    local_path = config.TEMP_DIR / filename
    # Stream to disk instead of buffering the whole body in memory, and fail
    # loudly on HTTP errors instead of silently writing an error page to disk.
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
    return str(local_path)
def concatenate_scenes(video_urls: List[str]) -> str:
    """Download scene clips and concatenate them into one 1080x1920 video.

    Args:
        video_urls: Ordered list of remote clip URLs.

    Returns:
        Local path of the merged mp4 in TEMP_DIR.
    """
    import time  # local import: this module does not import time at top level

    logger.info(f"Concatenating {len(video_urls)} clips...")
    clips = []
    for url in video_urls:
        local_path = download_video(url)
        clip = VideoFileClip(local_path)
        # Normalize everything to portrait 9:16 so concatenation is seamless.
        if clip.w != 1080 or clip.h != 1920:
            clip = clip.resize(newsize=(1080, 1920))
        clips.append(clip)
    final = concatenate_videoclips(clips, method="compose")
    # Was `__import__('time').time()` — a workaround for the missing import.
    output_path = config.TEMP_DIR / f"merged_{int(time.time())}.mp4"
    final.write_videofile(
        str(output_path),
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4,
        logger=None
    )
    # Release the file handles held by moviepy readers.
    for clip in clips:
        clip.close()
    final.close()
    return str(output_path)
# ============================================================
# Subtitle Burning
# ============================================================
def burn_subtitles(
    video_path: str,
    scenes: List[Dict[str, Any]]
) -> str:
    """Burn per-scene subtitle text onto a video.

    Each scene's "voiceover" text is rendered as a bottom-positioned caption
    for that scene's duration; scenes are assumed to play back-to-back in
    order. A failed TextClip render (e.g. missing ImageMagick/font) skips
    that caption with a warning instead of aborting.

    Args:
        video_path: Local path of the assembled video.
        scenes: Scene dicts carrying "voiceover" and "duration".

    Returns:
        Local path of the subtitled mp4.
    """
    import time  # local import: this module does not import time at top level

    logger.info("Burning subtitles...")
    clip = VideoFileClip(video_path)
    subtitle_clips = []
    current_time = 0
    for scene in scenes:
        voiceover = scene.get("voiceover", "")
        duration = scene.get("duration", 5)
        if voiceover:
            try:
                txt = TextClip(
                    voiceover,
                    fontsize=48,
                    color='white',
                    stroke_color='black',
                    stroke_width=2,
                    font='DejaVu-Sans',
                    method='caption',
                    size=(900, None)
                ).set_position(('center', 1600)).set_start(current_time).set_duration(duration)
                subtitle_clips.append(txt)
            except Exception as e:
                logger.warning(f"Subtitle error: {e}")
        # Advance the timeline even when the caption was empty or failed.
        current_time += duration
    if subtitle_clips:
        final = CompositeVideoClip([clip] + subtitle_clips)
    else:
        final = clip
    # Was `__import__('time').time()` — a workaround for the missing import.
    output_path = config.TEMP_DIR / f"subtitled_{int(time.time())}.mp4"
    final.write_videofile(
        str(output_path),
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4,
        logger=None
    )
    clip.close()
    # NOTE: when no subtitles were added, `final` is `clip`, so this is a
    # second close on the same reader.
    final.close()
    return str(output_path)
# ============================================================
# Voiceover Mixing
# ============================================================
def mix_voiceover(video_path: str, voiceover_url: str) -> str:
    """Mix a voiceover track into a video.

    Any original audio is kept at 30% volume under the narration, and a
    voiceover longer than the video is trimmed. With no ``voiceover_url``
    the input path is returned unchanged.

    Args:
        video_path: Local path of the video to dub.
        voiceover_url: Remote URL of the voiceover audio.

    Returns:
        Local path of the mixed mp4 (or ``video_path`` when skipped).
    """
    import time  # local import: this module does not import time at top level

    if not voiceover_url:
        return video_path
    logger.info("Mixing voiceover...")
    # Fetch the narration track (download_video handles any file type).
    vo_local = download_video(voiceover_url)
    video = VideoFileClip(video_path)
    voiceover = AudioFileClip(vo_local)
    # Trim voiceover if longer than video
    if voiceover.duration > video.duration:
        voiceover = voiceover.subclip(0, video.duration)
    # Duck the original audio under the narration.
    if video.audio:
        mixed = CompositeAudioClip([
            video.audio.volumex(0.3),  # Lower original
            voiceover.volumex(1.0)
        ])
    else:
        mixed = voiceover
    final = video.set_audio(mixed)
    # Was `__import__('time').time()` — a workaround for the missing import.
    output_path = config.TEMP_DIR / f"voiced_{int(time.time())}.mp4"
    final.write_videofile(
        str(output_path),
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4,
        logger=None
    )
    video.close()
    voiceover.close()
    final.close()
    return str(output_path)
# ============================================================
# BGM Mixing
# ============================================================
def mix_bgm(
    video_path: str,
    bgm_path: str,
    bgm_volume: float = 0.2
) -> str:
    """Mix a background-music track under a video's existing audio.

    The BGM is looped to cover the full video, trimmed to its length and
    attenuated to ``bgm_volume`` before mixing.

    Args:
        video_path: Local path of the video.
        bgm_path: Local path of the music file.
        bgm_volume: Relative BGM gain, default 0.2.

    Returns:
        Local path of the mixed mp4.
    """
    import time  # local import: this module does not import time at top level

    logger.info("Mixing BGM...")
    video = VideoFileClip(video_path)
    bgm = AudioFileClip(bgm_path)
    # Loop BGM if shorter than the video.
    # NOTE(review): AudioFileClip.loop availability differs across moviepy
    # versions (some need the audio_loop fx) — confirm against the pinned
    # moviepy release.
    if bgm.duration < video.duration:
        loops_needed = int(video.duration / bgm.duration) + 1
        bgm = bgm.loop(loops_needed)
    # Trim to video length and attenuate.
    bgm = bgm.subclip(0, video.duration).volumex(bgm_volume)
    # Layer the BGM under whatever audio the video already carries.
    if video.audio:
        mixed = CompositeAudioClip([video.audio, bgm])
    else:
        mixed = bgm
    final = video.set_audio(mixed)
    # Was `__import__('time').time()` — a workaround for the missing import.
    output_path = config.TEMP_DIR / f"bgm_{int(time.time())}.mp4"
    final.write_videofile(
        str(output_path),
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4,
        logger=None
    )
    video.close()
    bgm.close()
    final.close()
    return str(output_path)
# ============================================================
# Full Pipeline
# ============================================================
def assemble_final_video(
    video_urls: List[str],
    scenes: List[Dict[str, Any]],
    voiceover_url: str = "",
    bgm_url: str = ""
) -> str:
    """Run the full post-production pipeline and upload the result.

    Steps: concatenate scene clips -> burn subtitles -> mix voiceover
    (optional) -> mix BGM (optional) -> upload to R2.

    Args:
        video_urls: Ordered remote URLs of the scene clips.
        scenes: Scene dicts used for subtitles.
        voiceover_url: Optional narration audio URL.
        bgm_url: Optional background-music URL.

    Returns:
        Public URL of the uploaded final video.
    """
    logger.info("Starting full assembly...")
    # Each stage consumes the previous stage's local file.
    current = concatenate_scenes(video_urls)
    current = burn_subtitles(current, scenes)
    if voiceover_url:
        current = mix_voiceover(current, voiceover_url)
    if bgm_url:
        current = mix_bgm(current, download_video(bgm_url))
    final_url = storage.upload_file(current)
    logger.info(f"Final video uploaded: {final_url}")
    return final_url

157
modules/export_utils.py Normal file
View File

@@ -0,0 +1,157 @@
import os
import zipfile
import logging
import shutil
import math
from pathlib import Path
from typing import List, Dict, Any
import config
logger = logging.getLogger(__name__)
def format_timestamp(seconds: float) -> str:
    """Format a non-negative duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    whole = int(seconds)
    millis = int((seconds - whole) * 1000)
    minutes, secs = divmod(whole % 3600, 60)
    hours = int(seconds // 3600)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def generate_srt(script_data: Dict[str, Any], video_map: Dict[int, str]) -> str:
    """Build SRT subtitle content from the script's scene list.

    Scene timing comes from real clip durations in ``video_map``
    (scene_id -> local video path); scenes without a probeable clip are
    assumed to last 5 seconds. Scenes with an empty "subtitle" still
    advance the clock but emit no cue.

    Args:
        script_data: Script dict containing a "scenes" list.
        video_map: Mapping of scene id to local video path.

    Returns:
        The SRT document as a string (possibly empty).
    """
    scenes = script_data.get("scenes", [])
    srt_content = ""
    current_time = 0.0
    cue_index = 0  # SRT cues must be numbered sequentially from 1
    for scene in scenes:
        scene_id = scene["id"]
        duration = 5.0
        if scene_id in video_map and os.path.exists(video_map[scene_id]):
            # Deferred project import: only needed when a real clip exists.
            from modules import ffmpeg_utils
            try:
                info = ffmpeg_utils.get_video_info(video_map[scene_id])
                duration = info.get("duration", 5.0)
            except Exception as e:
                # Previously a silent bare `except: pass` — keep the 5s
                # fallback but record why the probe failed.
                logger.warning(f"Duration probe failed for scene {scene_id}: {e}")
        start_time = current_time
        end_time = current_time + duration
        current_time = end_time
        text = scene.get("subtitle", "")
        if text:
            # Counter instead of enumerate index: the old code skipped
            # numbers for subtitle-less scenes, producing gaps in the cue
            # sequence, which the SRT format does not allow.
            cue_index += 1
            srt_content += f"{cue_index}\n"
            srt_content += f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            srt_content += f"{text}\n\n"
    return srt_content
def create_capcut_package(project_id: str, script_data: Dict[str, Any], assets: Dict[str, str]) -> str:
    """
    Create a ZIP package for CapCut (JianYing) import.

    Package layout:
    - videos/        scene clips, prefixed with a sequence number
    - audios/        regenerated full voiceover track (best effort)
    - images/        fancy-text transparent PNGs (re-rendered)
    - subtitles.srt  subtitle track

    Args:
        project_id: Project identifier, used in temp/zip file names.
        script_data: Script dict with a "scenes" list.
        assets: Asset map; only "scene_videos" (scene_id -> local path)
            is consumed here.

    Returns:
        Path of the created zip file.
    """
    package_dir = config.TEMP_DIR / f"capcut_pkg_{project_id}_{int(os.getpid())}"
    if package_dir.exists():
        shutil.rmtree(package_dir)
    package_dir.mkdir()
    (package_dir / "videos").mkdir()
    (package_dir / "audios").mkdir()
    (package_dir / "images").mkdir()
    # 1. Generate the SRT from the script plus real clip durations.
    scene_videos = assets.get("scene_videos", {})
    srt_content = generate_srt(script_data, scene_videos)
    with open(package_dir / "subtitles.srt", "w", encoding="utf-8") as f:
        f.write(srt_content)
    # 2. Copy scene videos, prefixed with a sequence number for easy sorting
    #    inside CapCut (e.g. 01_scene_3.mp4).
    scenes = script_data.get("scenes", [])
    for i, scene in enumerate(scenes):
        sid = scene["id"]
        if sid in scene_videos and os.path.exists(scene_videos[sid]):
            ext = Path(scene_videos[sid]).suffix
            dest_name = f"{i+1:02d}_scene_{sid}{ext}"
            shutil.copy(scene_videos[sid], package_dir / "videos" / dest_name)
    # 3. Voiceover: the mixed narration is not kept as a standalone asset,
    #    so re-synthesize the full track here (best effort, default voice).
    from modules import factory
    full_vo_text = " ".join([s.get("voiceover", "") for s in scenes if s.get("voiceover")])
    if full_vo_text:
        try:
            voice_type = config.VOLC_TTS_DEFAULT_VOICE
            vo_path = factory.generate_voiceover_volcengine(full_vo_text, voice_type)
            shutil.copy(vo_path, package_dir / "audios" / "full_voiceover.mp3")
        except Exception as e:
            logger.warning(f"Failed to generate export voiceover: {e}")
    # BGM is a composer-level setting not tracked per project, so it is not
    # bundled; users drop in their own track inside CapCut.
    # 4. Fancy-text overlays: the composer renders them to temp, so they may
    #    be gone by export time — re-render with a plain default style.
    from modules.text_renderer import renderer
    for i, scene in enumerate(scenes):
        ft = scene.get("fancy_text")
        if ft:
            text = ft.get("text", "") if isinstance(ft, dict) else ""
            style = ft.get("style", "highlight") if isinstance(ft, dict) else "highlight"
            if text:
                try:
                    # Full style resolution lives in the composer; export
                    # intentionally uses a simple default render instead.
                    if isinstance(style, str):
                        pass
                    img_path = renderer.render(text, {"font_size": 60, "font_color": "#FFFFFF"}, cache=False)
                    shutil.copy(img_path, package_dir / "images" / f"{i+1:02d}_text_{scene['id']}.png")
                except:
                    pass
    # 5. Zip the staging directory, preserving relative paths.
    zip_path = config.TEMP_DIR / f"capcut_export_{project_id}.zip"
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(package_dir):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, package_dir)
                zipf.write(file_path, arcname)
    # Cleanup: only the zip is kept.
    shutil.rmtree(package_dir)
    return str(zip_path)

801
modules/factory.py Normal file
View File

@@ -0,0 +1,801 @@
"""
MatchMe Studio - Factory Module (Concurrent Scene Generation)
Using Volcengine (Doubao) API for Image and Video
"""
import os
import time
import logging
import requests
import json
import re
import base64
import subprocess
from pathlib import Path
from typing import Dict, Any, List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from elevenlabs import ElevenLabs, VoiceSettings
from openai import OpenAI
import config
from modules import storage
logger = logging.getLogger(__name__)
# Initialize OpenAI Client for Volcengine Image Generation
client = OpenAI(
api_key=config.VOLC_API_KEY,
base_url=config.VOLC_BASE_URL
)
# ============================================================
# Helper Functions
# ============================================================
def _download_as_base64(url: str) -> str:
    """Download an image and return it Base64-encoded.

    Returns an empty string on any failure (network error or bad HTTP
    status); callers treat "" as "no image data".
    """
    try:
        # Explicit timeout so a stalled CDN cannot hang the whole pipeline.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return base64.b64encode(response.content).decode('utf-8')
    except Exception as e:
        logger.error(f"Failed to download/encode image: {e}")
        return ""
# ============================================================
# Image Generation (Doubao / Volcengine)
# ============================================================
def generate_scene_image(
    scene: Dict[str, Any],
    brief: Dict[str, Any] = None,
    reference_images: List[str] = None
) -> str:
    """
    Generate a keyframe image for one scene via the Volcengine (Doubao) API.

    Uses raw `requests` (not the OpenAI client) to mirror the vendor's curl
    example exactly.

    Args:
        scene: Scene dict; `image_prompt` is used when present, otherwise a
            prompt is assembled from the `keyframe` fields.
        brief: Optional creative brief; `product_visual_description` is
            prepended to the prompt for cross-scene product consistency.
        reference_images: Accepted for interface symmetry but not used by
            this implementation.

    Returns:
        Public R2 URL of the uploaded image.

    Raises:
        ValueError: on a non-200 API response or missing image data.
    """
    # Build the generation prompt.
    image_prompt = scene.get("image_prompt", "")
    if not image_prompt:
        # Fallback prompt construction from structured keyframe fields.
        keyframe = scene.get("keyframe", {})
        # Stronger style-consistency intro.
        parts = ["Cinematic shot, 8k, photorealistic"]
        if brief:
            if brief.get("product_visual_description"):
                parts.append(f"Product: {brief['product_visual_description']}")
        parts.extend([
            f"Subject: {keyframe.get('subject', 'product')}",
            f"Environment: {keyframe.get('environment', 'studio')}",
            f"Action: {keyframe.get('focus', '')}"
        ])
        image_prompt = ", ".join(parts)
    # Prepend the product description when the prompt lacks it, to keep the
    # product's look consistent across scenes.
    if brief and brief.get("product_visual_description"):
        if brief['product_visual_description'] not in image_prompt:
            image_prompt = f"{brief['product_visual_description']}, {image_prompt}"
    logger.info(f"Generating image (Volcengine): {image_prompt[:50]}...")
    url = f"{config.VOLC_BASE_URL}/images/generations"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }
    # Payload mirrors the vendor's curl example.
    payload = {
        "model": config.IMAGE_MODEL_ID,
        "prompt": image_prompt,
        "sequential_image_generation": "disabled",
        "response_format": "b64_json",  # base64 avoids temp-URL expiration issues
        "size": "2K",
        "stream": False,
        "watermark": True
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code != 200:
            logger.error(f"Image API Error: {response.text}")
            raise ValueError(f"Image API failed: {response.status_code} - {response.text}")
        data = response.json()
        # Extract the image payload: prefer inline base64, fall back to URL.
        image_data = None
        if "data" in data and len(data["data"]) > 0:
            image_data = data["data"][0].get("b64_json")
            if not image_data:
                # Fallback: download the returned URL so we hold the bytes
                # locally before the temp URL expires.
                img_url = data["data"][0].get("url")
                if img_url:
                    image_data = _download_as_base64(img_url)
        if not image_data:
            raise ValueError("No image data returned")
        # Decode and save locally before uploading.
        filename = f"scene_{scene.get('id', 0)}_{int(time.time())}.jpg"
        local_path = config.TEMP_DIR / filename
        with open(local_path, "wb") as f:
            f.write(base64.b64decode(image_data))
        # Upload to R2 for stable hosting.
        r2_url = storage.upload_file(str(local_path))
        logger.info(f"Scene {scene.get('id', '?')} image uploaded: {r2_url}")
        return r2_url
    except Exception as e:
        logger.error(f"Image Generation Failed: {e}")
        raise
def generate_all_scene_images_concurrent(
    scenes: List[Dict[str, Any]],
    brief: Dict[str, Any] = None,
    reference_images: List[str] = None,
    max_workers: int = 3
) -> List[str]:
    """Generate one image per scene in parallel.

    Failures are logged per scene rather than aborting the batch.

    Returns:
        A list aligned with ``scenes``; entries for failed scenes stay None.
    """
    logger.info(f"Generating {len(scenes)} images concurrently...")
    results = [None] * len(scenes)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {
            pool.submit(generate_scene_image, scene, brief, reference_images): idx
            for idx, scene in enumerate(scenes)
        }
        for fut in as_completed(pending):
            idx = pending[fut]
            try:
                results[idx] = fut.result()
            except Exception as e:
                logger.error(f"Scene {idx+1} failed: {e}")
    return results
# ============================================================
# Video Generation (Doubao Video / PixelDance)
# ============================================================
def generate_scene_video(
    start_frame_url: str,
    motion_prompt: str,
    duration: int = 5
) -> str:
    """
    Generate a video clip via the Volcengine async task API.

    Creates a generation task, polls its status every 5 seconds (up to
    ~5 minutes), then downloads the finished clip and re-uploads it to R2.

    Args:
        start_frame_url: Optional image URL used as the clip's first frame.
        motion_prompt: Text prompt describing motion/content.
        duration: Clip length in seconds (passed as an inline prompt flag).

    Returns:
        Public R2 URL of the generated clip.

    Raises:
        ValueError: on task-creation failure, generation failure, or a
            failed download of the result.
        TimeoutError: when polling exhausts its retries without a URL.
    """
    logger.info(f"Generating video (Volcengine): {motion_prompt[:50]}...")
    # 1. Create the async generation task.
    create_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }
    # Content list: text prompt with inline generation flags + optional image.
    content_list = [
        {
            "type": "text",
            "text": f"{motion_prompt} --resolution 1080p --duration {duration} --camerafixed false --watermark true"
        }
    ]
    if start_frame_url:
        content_list.append({
            "type": "image_url",
            "image_url": {"url": start_frame_url}
        })
    payload = {
        "model": config.VIDEO_MODEL_ID,
        "content": content_list
    }
    try:
        response = requests.post(create_url, headers=headers, json=payload, timeout=30)
        if response.status_code != 200:
            # 202 Accepted is also a valid response for async task creation.
            if response.status_code != 202:
                logger.error(f"Video Task Creation Error: {response.text}")
                raise ValueError(f"Video Task failed: {response.status_code} - {response.text}")
        data = response.json()
        task_id = data.get("id")
        if not task_id:
            # Some response variants nest the id under "data".
            task_id = data.get("data", {}).get("id")
        if not task_id:
            raise ValueError(f"No Task ID returned: {data}")
        logger.info(f"Video Task Created: {task_id}. Polling for result...")
        # 2. Poll GET /contents/generations/tasks/{id} until a terminal state.
        max_retries = 60  # 5 mins max (5s interval)
        video_url = None
        for _ in range(max_retries):
            time.sleep(5)
            status_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks/{task_id}"
            resp = requests.get(status_url, headers=headers, timeout=30)
            if resp.status_code == 200:
                res_data = resp.json()
                # Status may live at the top level or under "data".
                status = res_data.get("status")
                if not status and "data" in res_data:
                    status = res_data["data"].get("status")
                if status == "succeeded" or status == "SUCCEEDED":
                    # Extract the clip URL from the content list, which may
                    # also be at the top level or under "data".
                    content = res_data.get("data", {}).get("content", [])
                    if not content and "content" in res_data:
                        content = res_data["content"]
                    # Items carry either "video_url" or a plain "url".
                    for item in content:
                        if item.get("video_url"):
                            video_url = item["video_url"]
                            break
                        if item.get("url"):  # sometimes just url
                            video_url = item["url"]
                            break
                    if video_url:
                        break
                elif status == "failed" or status == "FAILED":
                    reason = res_data.get("data", {}).get("error", "Unknown error")
                    raise ValueError(f"Video Generation Failed: {reason}")
                # Running/queued: keep waiting.
        if not video_url:
            raise TimeoutError("Video generation timed out or failed to return URL.")
        # 3. Download the result and re-upload to R2 for stable hosting.
        logger.info(f"Video Generated. Downloading: {video_url}")
        filename = f"vid_doubao_{int(time.time())}.mp4"
        local_path = config.TEMP_DIR / filename
        resp = requests.get(video_url, stream=True)
        if resp.status_code != 200:
            raise ValueError(f"Failed to download generated video: {resp.status_code}")
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
        r2_url = storage.upload_file(str(local_path))
        return r2_url
    except Exception as e:
        logger.error(f"Video Generation Error: {e}")
        raise
def generate_all_scene_videos_concurrent(
    scenes: List[Dict[str, Any]],
    image_urls: List[str],
    max_workers: int = 2
) -> List[str]:
    """Animate each scene's keyframe image into a video clip, in parallel.

    The motion prompt is the scene's "camera_movement" (default
    "slow zoom"), prefixed with the scene's "image_prompt" when present.
    Failures are logged per scene rather than aborting the batch.

    Returns:
        A list aligned with ``scenes``; entries for failed scenes stay None.
    """
    logger.info(f"Generating {len(scenes)} videos concurrently...")
    results = [None] * len(scenes)

    def _run(idx: int) -> str:
        # One worker unit: build the motion prompt and generate the clip.
        scene = scenes[idx]
        motion = scene.get("camera_movement", "slow zoom")
        if scene.get("image_prompt"):
            motion = f"{scene['image_prompt']}. {motion}"
        return generate_scene_video(image_urls[idx], motion, scene.get("duration", 5))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(_run, i): i for i in range(len(scenes))}
        for fut in as_completed(pending):
            idx = pending[fut]
            try:
                results[idx] = fut.result()
            except Exception as e:
                logger.error(f"Scene {idx+1} video failed: {e}")
    return results
# ============================================================
# Audio Generation (ElevenLabs)
# ============================================================
def generate_voiceover(text: str, style: str = "") -> str:
    """Generate voiceover audio via ElevenLabs and upload it to R2.

    Args:
        text: Narration text; empty/whitespace input is a no-op.
        style: Free-form style hint; an "ASMR" substring selects softer
            voice settings (lower stability, higher similarity boost).

    Returns:
        Public R2 URL of the mp3, or "" on failure (errors are logged,
        never raised).
    """
    if not text or not text.strip():
        return ""
    # ASMR gets a breathier, more consistent delivery.
    stability = 0.3 if "ASMR" in style else 0.5
    similarity = 0.9 if "ASMR" in style else 0.8
    logger.info(f"Generating voiceover ({len(text)} chars, style={style})...")
    try:
        el_client = ElevenLabs(api_key=config.XI_KEY)
        audio_stream = el_client.text_to_speech.convert(
            voice_id=config.ELEVENLABS_VOICE_ID,
            text=text,
            model_id=config.ELEVENLABS_MODEL,
            voice_settings=VoiceSettings(stability=stability, similarity_boost=similarity)
        )
        filename = f"vo_{int(time.time())}.mp3"
        local_path = config.TEMP_DIR / filename
        # The SDK returns a byte-chunk iterator; stream it straight to disk.
        with open(local_path, "wb") as f:
            for chunk in audio_stream:
                f.write(chunk)
        r2_url = storage.upload_file(str(local_path))
        return r2_url
    except Exception as e:
        logger.error(f"Voiceover failed: {e}")
        return ""
def generate_full_voiceover(scenes: List[Dict[str, Any]], style: str = "") -> str:
    """Join all scene voiceover lines and synthesize them as one track.

    Lines that are empty, whitespace-only, or start with "(" (stage
    directions) are skipped. Returns "" when nothing remains to speak.
    """
    spoken = [
        line.strip()
        for line in (s.get("voiceover", "") for s in scenes)
        if line and line.strip() and not line.startswith("(")
    ]
    if not spoken:
        return ""
    return generate_voiceover(" ".join(spoken), style)
# ============================================================
# Audio Generation (Edge TTS - free Chinese speech synthesis)
# ============================================================
# Edge TTS Chinese voice presets (free, good quality).
EDGE_TTS_VOICES = {
    # Female voices
    "sweet_female": "zh-CN-XiaoxiaoNeural",      # Xiaoxiao - sweet & lively (recommended)
    "gentle_female": "zh-CN-XiaoyiNeural",       # Xiaoyi - gentle & refined
    "lively_female": "zh-CN-XiaochenNeural",     # Xiaochen - lively & cute
    "broadcast_female": "zh-CN-XiaoqiuNeural",   # Xiaoqiu - news broadcast
    # Male voices
    "general_male": "zh-CN-YunxiNeural",         # Yunxi - warm male voice
    "broadcast_male": "zh-CN-YunjianNeural",     # Yunjian - professional broadcast
}
# Volcengine TTS voice presets (service must be enabled) - chosen for a
# Douyin live-commerce-friendly tone.
VOLC_TTS_VOICES = {
    # Douyin-friendly female voices
    "sweet_female": "zh_female_vv_uranus_bigtts",            # viv 2.0 general female (sweet)
    "lively_female": "zh_female_jitangnv_saturn_bigtts",     # "Jitangnv" (energetic)
    "broadcast_female": "zh_male_ruyaichen_saturn_bigtts",   # "Ruyaichen" (news broadcast) - swap to zh_female_meilinyou_saturn_bigtts if a female broadcaster is needed
    "meilinvyou": "zh_female_meilinvyou_saturn_bigtts",
    # Male voices
    "general_male": "zh_male_dayi_saturn_bigtts",            # "Dayi" (steady male voice)
}
def generate_voiceover_edge(
    text: str,
    voice_type: str = "sweet_female",
    rate: str = "+0%",
    volume: str = "+0%",
    output_path: str = None
) -> str:
    """
    Generate a Chinese voiceover with Edge TTS (free, good quality).

    Args:
        text: Narration text.
        voice_type: Preset key (see EDGE_TTS_VOICES) or a raw voice name.
        rate: Speech-rate adjustment, e.g. "+10%", "-20%".
        volume: Volume adjustment, e.g. "+10%", "-20%".
        output_path: Optional output file path (auto-generated when omitted).

    Returns:
        Path of the generated mp3, or "" after all retries fail.
    """
    import asyncio
    import edge_tts
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""
    # Resolve the preset key to a concrete voice name; unknown keys are
    # passed through as literal voice names.
    voice = EDGE_TTS_VOICES.get(voice_type, voice_type)
    logger.info(f"Generating voiceover (Edge TTS): {len(text)} chars, voice={voice}")
    if not output_path:
        filename = f"vo_edge_{int(time.time())}.mp3"
        output_path = str(config.TEMP_DIR / filename)
    async def _generate():
        communicate = edge_tts.Communicate(text, voice, rate=rate, volume=volume)
        await communicate.save(output_path)
    # Edge TTS goes over the network and can be flaky; retry a few times.
    max_retries = 3
    for i in range(max_retries):
        try:
            asyncio.run(_generate())
            # Guard against zero-byte files from truncated downloads.
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                logger.info(f"Edge TTS voiceover generated: {output_path}")
                return output_path
        except Exception as e:
            logger.warning(f"Edge TTS attempt {i+1} failed: {e}")
            time.sleep(1.0)  # wait before retry
    logger.error("Edge TTS failed after retries.")
    return ""
def generate_voiceover_volcengine_ws(
    text: str,
    voice_type: str = "sweet_female",
    output_path: str = None,
    timeout: int = 120
) -> str:
    """
    Generate TTS audio via the Volcengine WebSocket binary demo.

    Shells out to the official demo script. The demo's location was
    hard-coded to one developer's machine; it is now overridable through
    ``config.VOLC_WS_DEMO_DIR`` (the original path remains the default, so
    existing setups keep working). Returns "" on any failure so callers can
    fall back to the HTTP / Edge TTS paths.

    Args:
        text: Narration text.
        voice_type: Preset key (see VOLC_TTS_VOICES) or a raw voice ID.
        output_path: Optional destination path for the mp3.
        timeout: Subprocess timeout in seconds.

    Returns:
        Path of the generated mp3, or "" on failure.
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS (ws)")
        return ""
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)
    # Demo location is machine-specific; allow overriding via config instead
    # of baking in a single developer's volume path.
    demo_dir = Path(getattr(config, "VOLC_WS_DEMO_DIR",
                            "/Volumes/Tony/video-flow/volcengine_binary_demo"))
    venv_python = demo_dir / ".venv" / "bin" / "python"
    demo_script = demo_dir / "examples" / "volcengine" / "binary.py"
    if not venv_python.exists() or not demo_script.exists():
        logger.error("Volcengine WS demo or venv not found. Please install under volcengine_binary_demo/.venv")
        return ""
    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_ws_{int(time.time())}.mp3")
    cmd = [
        str(venv_python),
        str(demo_script),
        "--appid", config.VOLC_TTS_APPID,
        "--access_token", config.VOLC_TTS_ACCESS_TOKEN,
        "--voice_type", voice_id,
        "--text", text,
        "--encoding", "mp3",
    ]
    logger.info(f"Calling Volcengine WS TTS: voice={voice_id}, len={len(text)}")
    try:
        result = subprocess.run(
            cmd,
            cwd=str(demo_dir),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if result.returncode != 0:
            logger.error(f"Volc WS TTS failed: {result.stderr}")
            return ""
        # The demo writes its output next to itself as <voice_type>.mp3.
        demo_out = demo_dir / f"{voice_id}.mp3"
        if not demo_out.exists():
            logger.error("Volc WS TTS output not found")
            return ""
        Path(output_path).write_bytes(demo_out.read_bytes())
        logger.info(f"Volc WS TTS saved to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volc WS TTS error: {e}")
        return ""
def generate_voiceover_volcengine(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    volume_ratio: float = 1.0,
    pitch_ratio: float = 1.0,
    output_path: str = None
) -> str:
    """
    Generate a Chinese voiceover with Volcengine TTS.

    Tries the WebSocket binary demo first; on failure falls back to the
    Volcengine HTTP API, and finally to Edge TTS.

    Args:
        text: Narration text.
        voice_type: Preset key (see VOLC_TTS_VOICES) or a raw voice ID.
        speed_ratio: Speech rate, 0.5-2.0, default 1.0.
        volume_ratio: Volume, 0.5-2.0, default 1.0.
        pitch_ratio: Pitch, 0.5-2.0, default 1.0.
        output_path: Optional output path (auto-generated when omitted).

    Returns:
        Path of the generated audio file, or "" when every backend fails.
    """
    import uuid
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""
    # Resolve the preset key to a concrete voice ID (Volcengine table, with
    # unknown keys passed through as custom IDs).
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)
    logger.info(f"Generating voiceover (Volcengine TTS): {len(text)} chars, voice={voice_id}")
    # First try the WebSocket binary path (verified working via the demo).
    ws_path = generate_voiceover_volcengine_ws(text, voice_type, output_path)
    if ws_path:
        return ws_path
    # WS failed - fall back to the HTTP endpoint.
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        # Volcengine expects the literal "Bearer;<token>" format here.
        "Authorization": f"Bearer;{config.VOLC_TTS_ACCESS_TOKEN}"
    }
    payload = {
        "app": {
            "appid": config.VOLC_TTS_APPID,
            "token": config.VOLC_TTS_ACCESS_TOKEN,
            "cluster": "volcano_tts"
        },
        "user": {
            "uid": "video_flow_user"
        },
        "audio": {
            "voice_type": voice_id,
            "encoding": "mp3",
            "speed_ratio": speed_ratio,
            "volume_ratio": volume_ratio,
            "pitch_ratio": pitch_ratio
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "text_type": "plain",
            "operation": "query",
            "with_timestamp": "1",
            "extra_param": json.dumps({
                "disable_markdown_filter": False
            })
        }
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code != 200:
            logger.error(f"Volcengine TTS Error: {response.status_code} - {response.text}")
            # Fall back to Edge TTS with a safe default voice.
            fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
            return generate_voiceover_edge(text, fallback_voice, output_path=output_path)
        data = response.json()
        ret_code = data.get("code")
        # 0 / 3000 / 20000000 are all treated as success codes here.
        if ret_code not in (0, 3000, 20000000):
            error_msg = data.get("message", "Unknown error")
            logger.error(f"Volcengine TTS Error: {error_msg}")
            # Fall back to Edge TTS with a safe default voice.
            fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
            return generate_voiceover_edge(text, fallback_voice, output_path=output_path)
        # Audio is returned base64-encoded in the "data" field.
        audio_data = data.get("data", "")
        if not audio_data:
            raise ValueError("No audio data returned")
        if not output_path:
            filename = f"vo_volc_{int(time.time())}.mp3"
            output_path = str(config.TEMP_DIR / filename)
        with open(output_path, "wb") as f:
            f.write(base64.b64decode(audio_data))
        logger.info(f"Voiceover generated (HTTP): {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volcengine TTS HTTP error: {e}")
        # Fall back to Edge TTS with a safe default voice.
        fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
        return generate_voiceover_edge(text, fallback_voice, output_path=output_path)
def generate_voiceover_volcengine_long(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    output_path: str = None,
    max_chunk_length: int = 300
) -> str:
    """
    Volcengine TTS for long text (automatic chunked synthesis).

    Text longer than ``max_chunk_length`` is split at sentence boundaries,
    synthesized chunk by chunk, then losslessly concatenated with ffmpeg.

    Args:
        text: Narration text.
        voice_type: Preset key (see VOLC_TTS_VOICES) or a raw voice ID.
        speed_ratio: Speech rate, 0.5-2.0.
        output_path: Optional destination path.
        max_chunk_length: Maximum characters per single TTS request.

    Returns:
        Path of the final (possibly merged) audio file.

    Raises:
        ValueError: when every chunk fails to synthesize.
    """
    # Short text: a single request suffices.
    if len(text) <= max_chunk_length:
        return generate_voiceover_volcengine(
            text=text,
            voice_type=voice_type,
            speed_ratio=speed_ratio,
            output_path=output_path
        )
    logger.info(f"Long text ({len(text)} chars), splitting into chunks...")
    # Split on sentence-ending punctuation (CJK and ASCII); the capture
    # group keeps each delimiter so it can be re-attached to its sentence.
    import re
    sentences = re.split(r'([。!?;.!?;])', text)
    chunks = []
    current_chunk = ""
    for i in range(0, len(sentences) - 1, 2):
        # Re-join each sentence with its trailing delimiter.
        sentence = sentences[i] + (sentences[i + 1] if i + 1 < len(sentences) else "")
        if len(current_chunk) + len(sentence) <= max_chunk_length:
            current_chunk += sentence
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    # Trailing text after the last delimiter (odd-length split result).
    if len(sentences) % 2 == 1 and sentences[-1]:
        if chunks:
            chunks[-1] += sentences[-1]
        else:
            chunks.append(sentences[-1])
    logger.info(f"Split into {len(chunks)} chunks")
    # Synthesize each chunk; failures are skipped so one bad chunk does not
    # sink the whole narration.
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_path = str(config.TEMP_DIR / f"vo_chunk_{i}_{int(time.time())}.mp3")
        try:
            path = generate_voiceover_volcengine(
                text=chunk,
                voice_type=voice_type,
                speed_ratio=speed_ratio,
                output_path=chunk_path
            )
            chunk_files.append(path)
        except Exception as e:
            logger.error(f"Chunk {i} failed: {e}")
            # Keep processing the remaining chunks.
    if not chunk_files:
        raise ValueError("All TTS chunks failed")
    # Single surviving chunk: no merge needed, just move/return it.
    if len(chunk_files) == 1:
        if output_path:
            import shutil
            shutil.move(chunk_files[0], output_path)
            return output_path
        return chunk_files[0]
    # Build the ffmpeg concat-demuxer list file.
    concat_list = config.TEMP_DIR / f"concat_audio_{os.getpid()}.txt"
    with open(concat_list, "w") as f:
        for cf in chunk_files:
            f.write(f"file '{cf}'\n")
    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_merged_{int(time.time())}.mp3")
    # Lossless concat via the ffmpeg concat demuxer (-c copy).
    import subprocess
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat",
        "-safe", "0",
        "-i", str(concat_list),
        "-c", "copy",
        output_path
    ]
    subprocess.run(cmd, capture_output=True, check=True)
    # Best-effort cleanup of per-chunk temp files.
    for cf in chunk_files:
        try:
            os.remove(cf)
        except:
            pass
    concat_list.unlink(missing_ok=True)
    logger.info(f"Merged voiceover: {output_path}")
    return output_path
def generate_scene_voiceovers_volcengine(
    scenes: List[Dict[str, Any]],
    voice_type: str = "sweet_female",
    output_dir: str = None
) -> List[str]:
    """
    Generate a separate narration audio file for each scene.

    Args:
        scenes: Scene list; each scene may carry a ``voiceover`` text field.
        voice_type: Voice preset for the TTS backend.
        output_dir: Output directory (defaults to ``config.TEMP_DIR``).

    Returns:
        One audio path per scene; empty string for scenes without narration
        or whose synthesis failed.
    """
    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True)
    else:
        output_dir = config.TEMP_DIR
    audio_paths = []
    for i, scene in enumerate(scenes):
        vo_text = scene.get("voiceover", "")
        # Skip empty narration and stage directions; annotations may open
        # with either an ASCII "(" or a full-width "（" (the old check only
        # caught the ASCII form, so Chinese annotations were synthesized).
        if not vo_text or not vo_text.strip() or vo_text.startswith(("(", "（")):
            audio_paths.append("")
            continue
        try:
            output_path = str(output_dir / f"scene_{i+1}_vo.mp3")
            path = generate_voiceover_volcengine(
                text=vo_text.strip(),
                voice_type=voice_type,
                output_path=output_path
            )
            audio_paths.append(path)
        except Exception as e:
            logger.error(f"Scene {i+1} voiceover failed: {e}")
            audio_paths.append("")
    return audio_paths

708
modules/fancy_text.py Normal file
View File

@@ -0,0 +1,708 @@
"""
抖音风格花字生成模块
使用 Pillow 生成透明 PNG 图片,支持描边、渐变、气泡框等效果
"""
import os
import hashlib
import logging
from pathlib import Path
from typing import Dict, Any, Tuple, List, Optional
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import config
logger = logging.getLogger(__name__)
# Cache directory for rendered fancy-text PNGs (created eagerly at import).
FANCY_TEXT_CACHE_DIR = config.TEMP_DIR / "fancy_text_cache"
FANCY_TEXT_CACHE_DIR.mkdir(exist_ok=True)
def _get_font(font_name: str = None, size: int = 48) -> ImageFont.FreeTypeFont:
    """Resolve a usable font, trying each candidate path in turn.

    An existing, explicitly supplied ``font_name`` wins; otherwise the
    project-bundled fonts are tried first, then common system fonts.
    Missing or suspiciously small project font files (likely un-pulled LFS
    pointers) are skipped; the PIL default font is the last resort.
    """
    candidates = []
    if font_name and os.path.exists(font_name):
        candidates.append(font_name)
    else:
        candidates.extend([
            config.FONTS_DIR / "AlibabaPuHuiTi-Bold.ttf",
            config.FONTS_DIR / "AlibabaPuHuiTi-Regular.ttf",
            config.FONTS_DIR / "NotoSansSC-Bold.otf",
            config.FONTS_DIR / "NotoSansSC-Regular.otf",
        ])
    candidates.extend([
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/STHeiti Medium.ttc",
        "/Library/Fonts/Arial Unicode.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
        "C:/Windows/Fonts/msyh.ttc",
        "C:/Windows/Fonts/simhei.ttf",
    ])
    for candidate in candidates:
        if not candidate:
            continue
        candidate_str = str(candidate)
        if not os.path.exists(candidate_str):
            continue
        # Project fonts (Path objects) under 10 KB are treated as invalid.
        if isinstance(candidate, Path) and candidate.stat().st_size < 10000:
            continue
        try:
            return ImageFont.truetype(candidate_str, size)
        except Exception as exc:
            logger.warning(f"Failed to load font {candidate_str}: {exc}")
            continue
    logger.warning("No suitable font found, using default")
    return ImageFont.load_default()
def _hex_to_rgb(hex_color: str) -> Tuple[int, int, int]:
"""十六进制颜色转 RGB"""
hex_color = hex_color.lstrip("#")
return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
def _get_text_size(text: str, font: ImageFont.FreeTypeFont) -> Tuple[int, int]:
    """Measure rendered text, returning (width, height) in pixels."""
    # A 1x1 scratch image suffices: textbbox only needs a Draw context.
    scratch = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
    left, top, right, bottom = scratch.textbbox((0, 0), text, font=font)
    return right - left, bottom - top
def _cache_key(text: str, style: Dict) -> str:
"""生成缓存键"""
content = f"{text}_{str(sorted(style.items()))}"
return hashlib.md5(content.encode()).hexdigest()
def create_text_with_stroke(
    text: str,
    font_size: int = 60,
    font_color: str = "#FFFFFF",
    stroke_color: str = "#000000",
    stroke_width: int = 4,
    font_name: str = None,
    padding: int = 20
) -> Image.Image:
    """
    Render text with an outline onto a transparent image.

    Args:
        text: Text content.
        font_size: Font size in points.
        font_color: Fill color (hex).
        stroke_color: Outline color (hex).
        stroke_width: Outline thickness in pixels.
        font_name: Optional font file path.
        padding: Inner padding in pixels.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    text_w, text_h = _get_text_size(text, font)
    # Canvas leaves room for the outline plus padding on every side.
    canvas_w = text_w + stroke_width * 2 + padding * 2
    canvas_h = text_h + stroke_width * 2 + padding * 2
    canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(canvas)
    origin_x = padding + stroke_width
    origin_y = padding + stroke_width
    # Outline: stamp the text at every offset inside a disc of radius
    # stroke_width around the origin.
    outline_fill = _hex_to_rgb(stroke_color) + (255,)
    for off_x in range(-stroke_width, stroke_width + 1):
        for off_y in range(-stroke_width, stroke_width + 1):
            if off_x * off_x + off_y * off_y <= stroke_width * stroke_width:
                draw.text((origin_x + off_x, origin_y + off_y), text, font=font, fill=outline_fill)
    # Main fill drawn on top of the outline.
    draw.text((origin_x, origin_y), text, font=font, fill=_hex_to_rgb(font_color) + (255,))
    return canvas
def create_text_with_shadow(
    text: str,
    font_size: int = 60,
    font_color: str = "#FFFFFF",
    shadow_color: str = "#000000",
    shadow_offset: Tuple[int, int] = (4, 4),
    shadow_blur: int = 5,
    font_name: str = None,
    padding: int = 30,
    stroke_color: str = None,
    stroke_width: int = 0
) -> Image.Image:
    """
    Render text with a blurred drop shadow and an optional outline
    (the outline enables a two-layer "safe stroke" look).

    Args:
        text: Text content.
        font_size: Font size in points.
        font_color: Fill color (hex).
        shadow_color: Shadow color (hex), drawn at alpha 180.
        shadow_offset: (dx, dy) displacement of the shadow in pixels.
        shadow_blur: Gaussian blur radius applied to the shadow layer.
        font_name: Optional font file path.
        padding: Inner padding in pixels.
        stroke_color: Optional outline color (hex); the outline is drawn
            only when both stroke_color and stroke_width are set.
        stroke_width: Outline thickness in pixels.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    text_w, text_h = _get_text_size(text, font)
    # Reserve room for whichever is larger: the blur halo or the outline.
    extra = max(shadow_blur, stroke_width * 2)
    img_w = text_w + abs(shadow_offset[0]) + extra * 2 + padding * 2
    img_h = text_h + abs(shadow_offset[1]) + extra * 2 + padding * 2
    shadow_img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    shadow_draw = ImageDraw.Draw(shadow_img)
    x = padding + extra
    y = padding + extra
    # Shadow pass: offset text at partial alpha, then blur the whole layer.
    shadow_rgb = _hex_to_rgb(shadow_color) + (180,)
    shadow_draw.text((x + shadow_offset[0], y + shadow_offset[1]), text, font=font, fill=shadow_rgb)
    shadow_img = shadow_img.filter(ImageFilter.GaussianBlur(shadow_blur))
    draw = ImageDraw.Draw(shadow_img)
    # Optional outline pass, drawn over the blurred shadow.
    if stroke_color and stroke_width > 0:
        stroke_rgb = _hex_to_rgb(stroke_color) + (255,)
        for dx in range(-stroke_width, stroke_width + 1):
            for dy in range(-stroke_width, stroke_width + 1):
                if dx * dx + dy * dy <= stroke_width * stroke_width:
                    draw.text((x + dx, y + dy), text, font=font, fill=stroke_rgb)
    # Main text pass on top.
    font_rgb = _hex_to_rgb(font_color) + (255,)
    draw.text((x, y), text, font=font, fill=font_rgb)
    return shadow_img
def create_text_with_gradient(
    text: str,
    font_size: int = 60,
    gradient_colors: List[str] = None,
    gradient_direction: str = "vertical",  # vertical, horizontal
    stroke_color: str = "#000000",
    stroke_width: int = 3,
    font_name: str = None,
    padding: int = 20
) -> Image.Image:
    """
    Render gradient-filled text with an outline.

    Args:
        text: Text content.
        font_size: Font size in points.
        gradient_colors: Gradient stops as hex colors, e.g.
            ["#FF6B6B", "#FFE66D"]; two stops interpolate linearly, three
            stops split at the midpoint.
        gradient_direction: "vertical" or "horizontal".
        stroke_color: Outline color (hex).
        stroke_width: Outline thickness in pixels.
        font_name: Optional font file path.
        padding: Inner padding in pixels.

    Returns:
        RGBA image with a transparent background.
    """
    if not gradient_colors:
        gradient_colors = ["#FF6B6B", "#FFE66D"]  # default red-to-yellow
    font = _get_font(font_name, font_size)
    text_w, text_h = _get_text_size(text, font)
    img_w = text_w + stroke_width * 2 + padding * 2
    img_h = text_h + stroke_width * 2 + padding * 2
    # Build the full-canvas gradient layer one scanline at a time.
    gradient = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    gradient_draw = ImageDraw.Draw(gradient)
    colors = [_hex_to_rgb(c) for c in gradient_colors]
    for i in range(img_h if gradient_direction == "vertical" else img_w):
        ratio = i / (img_h if gradient_direction == "vertical" else img_w)
        # Piecewise-linear interpolation: first half between stop 0 and 1,
        # second half between stop 1 and 2 (held at stop 1 when absent).
        if ratio < 0.5:
            r = ratio * 2
            c1, c2 = colors[0], colors[min(1, len(colors) - 1)]
        else:
            r = (ratio - 0.5) * 2
            c1 = colors[min(1, len(colors) - 1)]
            c2 = colors[min(2, len(colors) - 1)] if len(colors) > 2 else c1
        color = tuple(int(c1[j] + (c2[j] - c1[j]) * r) for j in range(3)) + (255,)
        if gradient_direction == "vertical":
            gradient_draw.line([(0, i), (img_w, i)], fill=color)
        else:
            gradient_draw.line([(i, 0), (i, img_h)], fill=color)
    # NOTE: the old version also built an unused grayscale "mask" layer
    # here; that dead code has been removed (the output is unchanged).
    x = padding + stroke_width
    y = padding + stroke_width
    result = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    # Outline layer: stamp the text at every offset within the stroke disc.
    stroke_img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    stroke_draw = ImageDraw.Draw(stroke_img)
    stroke_rgb = _hex_to_rgb(stroke_color) + (255,)
    for dx in range(-stroke_width, stroke_width + 1):
        for dy in range(-stroke_width, stroke_width + 1):
            if dx * dx + dy * dy <= stroke_width * stroke_width:
                stroke_draw.text((x + dx, y + dy), text, font=font, fill=stroke_rgb)
    result = Image.alpha_composite(result, stroke_img)
    # Clip the gradient layer to the glyph shapes and composite on top.
    text_mask = Image.new("L", (img_w, img_h), 0)
    ImageDraw.Draw(text_mask).text((x, y), text, font=font, fill=255)
    gradient_text = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    gradient_text.paste(gradient, mask=text_mask)
    result = Image.alpha_composite(result, gradient_text)
    return result
def create_bubble_text(
    text: str,
    font_size: int = 48,
    font_color: str = "#333333",
    bg_color: str = "#FFFFFF",
    border_color: str = "#CCCCCC",
    border_width: int = 2,
    corner_radius: int = 20,
    padding: Tuple[int, int] = (30, 15),
    font_name: str = None,
    tail_direction: str = None  # "left", "right", "bottom", None
) -> Image.Image:
    """
    Render a speech-bubble (dialog box) around the text.

    Args:
        text: Text content.
        font_size: Font size in points.
        font_color: Text color (hex).
        bg_color: Bubble fill color (hex).
        border_color: Bubble border color (hex).
        border_width: Border thickness in pixels.
        corner_radius: Corner rounding radius.
        padding: (horizontal, vertical) inner padding.
        font_name: Optional font file path.
        tail_direction: Bubble tail direction — "left", "right", "bottom",
            or None for no tail.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    text_w, text_h = _get_text_size(text, font)
    # Bubble body size.
    bubble_w = text_w + padding[0] * 2
    bubble_h = text_h + padding[1] * 2
    # Extra canvas space on the side the tail points to.
    tail_size = 20 if tail_direction else 0
    if tail_direction in ["left", "right"]:
        img_w = bubble_w + tail_size
        img_h = bubble_h
    elif tail_direction == "bottom":
        img_w = bubble_w
        img_h = bubble_h + tail_size
    else:
        img_w = bubble_w
        img_h = bubble_h
    img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    # Body position: shifted right only when the tail is on the left.
    if tail_direction == "left":
        bx = tail_size
    else:
        bx = 0
    by = 0
    # Rounded-rectangle body with border.
    bg_rgb = _hex_to_rgb(bg_color) + (255,)
    border_rgb = _hex_to_rgb(border_color) + (255,)
    draw.rounded_rectangle(
        [bx, by, bx + bubble_w, by + bubble_h],
        radius=corner_radius,
        fill=bg_rgb,
        outline=border_rgb,
        width=border_width
    )
    # Tail triangle. The second fill-only polygon repaints the interior so
    # the border does not show where the tail joins the body.
    if tail_direction == "left":
        points = [
            (bx, bubble_h // 2 - 10),
            (0, bubble_h // 2),
            (bx, bubble_h // 2 + 10)
        ]
        draw.polygon(points, fill=bg_rgb, outline=border_rgb)
        draw.polygon(points, fill=bg_rgb)
    elif tail_direction == "right":
        points = [
            (bx + bubble_w, bubble_h // 2 - 10),
            (img_w, bubble_h // 2),
            (bx + bubble_w, bubble_h // 2 + 10)
        ]
        draw.polygon(points, fill=bg_rgb, outline=border_rgb)
        draw.polygon(points, fill=bg_rgb)
    elif tail_direction == "bottom":
        points = [
            (bubble_w // 2 - 10, bubble_h),
            (bubble_w // 2, img_h),
            (bubble_w // 2 + 10, bubble_h)
        ]
        draw.polygon(points, fill=bg_rgb, outline=border_rgb)
        draw.polygon(points, fill=bg_rgb)
    # Draw the label inside the bubble body.
    font_rgb = _hex_to_rgb(font_color) + (255,)
    text_x = bx + padding[0]
    text_y = by + padding[1]
    draw.text((text_x, text_y), text, font=font, fill=font_rgb)
    return img
def create_price_tag(
    price: str,
    currency: str = "¥",
    font_size: int = 72,
    price_color: str = "#FF4444",
    currency_color: str = "#FF4444",
    stroke_color: str = "#FFFFFF",
    stroke_width: int = 4,
    font_name: str = None
) -> Image.Image:
    """
    Render an e-commerce style price tag: a small currency symbol next to
    a large outlined price, both vertically centered.

    Args:
        price: Price text (formatting is the caller's responsibility).
        currency: Currency symbol, drawn at half the price font size.
        font_size: Price font size in points.
        price_color: Price color (hex).
        currency_color: Currency symbol color (hex).
        stroke_color: Outline color (hex).
        stroke_width: Outline thickness in pixels.
        font_name: Optional font file path.

    Returns:
        RGBA image with a transparent background.
    """
    font_large = _get_font(font_name, font_size)
    font_small = _get_font(font_name, int(font_size * 0.5))
    # Measure both parts; a fixed 5 px gap separates symbol and price.
    currency_w, currency_h = _get_text_size(currency, font_small)
    price_w, price_h = _get_text_size(price, font_large)
    total_w = currency_w + price_w + 5
    total_h = max(currency_h, price_h)
    padding = stroke_width + 10
    img_w = total_w + padding * 2
    img_h = total_h + padding * 2
    img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    # Outline pass: stamp both parts at every offset within the stroke disc.
    stroke_rgb = _hex_to_rgb(stroke_color) + (255,)
    for dx in range(-stroke_width, stroke_width + 1):
        for dy in range(-stroke_width, stroke_width + 1):
            if dx * dx + dy * dy <= stroke_width * stroke_width:
                # Currency symbol
                draw.text(
                    (padding + dx, padding + (total_h - currency_h) // 2 + dy),
                    currency, font=font_small, fill=stroke_rgb
                )
                # Price
                draw.text(
                    (padding + currency_w + 5 + dx, padding + (total_h - price_h) // 2 + dy),
                    price, font=font_large, fill=stroke_rgb
                )
    # Fill pass on top of the outline.
    currency_rgb = _hex_to_rgb(currency_color) + (255,)
    price_rgb = _hex_to_rgb(price_color) + (255,)
    draw.text(
        (padding, padding + (total_h - currency_h) // 2),
        currency, font=font_small, fill=currency_rgb
    )
    draw.text(
        (padding + currency_w + 5, padding + (total_h - price_h) // 2),
        price, font=font_large, fill=price_rgb
    )
    return img
def create_button(
    text: str,
    font_size: int = 36,
    font_color: str = "#FFFFFF",
    bg_color: str = "#FF6B35",
    corner_radius: int = 25,
    padding: Tuple[int, int] = (40, 15),
    font_name: str = None,
    shadow: bool = True
) -> Image.Image:
    """
    Render a call-to-action button (rounded pill with a centered label),
    e.g. a "buy now" style badge.

    Args:
        text: Button label.
        font_size: Label font size in points.
        font_color: Label color (hex).
        bg_color: Button fill color (hex).
        corner_radius: Corner rounding radius in pixels.
        padding: (horizontal, vertical) inner padding.
        font_name: Optional font file path.
        shadow: Whether to draw a soft drop shadow under the button.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    label_w, label_h = _get_text_size(text, font)
    body_w = label_w + padding[0] * 2
    body_h = label_h + padding[1] * 2
    drop = 4 if shadow else 0
    canvas = Image.new("RGBA", (body_w + drop, body_h + drop), (0, 0, 0, 0))
    draw = ImageDraw.Draw(canvas)
    if shadow:
        # Translucent black rectangle offset down-right forms the shadow.
        draw.rounded_rectangle(
            [drop, drop, body_w + drop, body_h + drop],
            radius=corner_radius,
            fill=(0, 0, 0, 80)
        )
    # Button body over the shadow.
    draw.rounded_rectangle(
        [0, 0, body_w, body_h],
        radius=corner_radius,
        fill=_hex_to_rgb(bg_color) + (255,)
    )
    # Centered label (padding already accounts for the centering).
    draw.text((padding[0], padding[1]), text, font=font, fill=_hex_to_rgb(font_color) + (255,))
    return canvas
def create_comparison_text(
    left_text: str,
    right_text: str,
    vs_text: str = "vs",
    font_size: int = 48,
    left_color: str = "#666666",
    right_color: str = "#FF6B35",
    vs_color: str = "#FF0000",
    font_name: str = None
) -> Image.Image:
    """
    Render a comparison line such as "A vs B": each segment in its own
    color, all outlined in black and vertically centered on one line.

    Args:
        left_text: Left-hand phrase.
        right_text: Right-hand phrase.
        vs_text: Separator word, drawn at 80% of font_size.
        font_size: Font size for the two phrases.
        left_color: Left phrase color (hex).
        right_color: Right phrase color (hex).
        vs_color: Separator color (hex).
        font_name: Optional font file path.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    font_vs = _get_font(font_name, int(font_size * 0.8))
    left_w, left_h = _get_text_size(left_text, font)
    vs_w, vs_h = _get_text_size(vs_text, font_vs)
    right_w, right_h = _get_text_size(right_text, font)
    spacing = 15
    total_w = left_w + vs_w + right_w + spacing * 2
    total_h = max(left_h, vs_h, right_h)
    padding = 20
    stroke_width = 3
    img_w = total_w + padding * 2 + stroke_width * 2
    img_h = total_h + padding * 2 + stroke_width * 2
    img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    x = padding + stroke_width
    y = padding + stroke_width
    # Outline pass: all three segments stamped within the stroke disc.
    stroke_color = (0, 0, 0, 255)
    for dx in range(-stroke_width, stroke_width + 1):
        for dy in range(-stroke_width, stroke_width + 1):
            if dx * dx + dy * dy <= stroke_width * stroke_width:
                draw.text((x + dx, y + (total_h - left_h) // 2 + dy), left_text, font=font, fill=stroke_color)
                draw.text((x + left_w + spacing + dx, y + (total_h - vs_h) // 2 + dy), vs_text, font=font_vs, fill=stroke_color)
                draw.text((x + left_w + spacing + vs_w + spacing + dx, y + (total_h - right_h) // 2 + dy), right_text, font=font, fill=stroke_color)
    # Fill pass, one color per segment.
    left_rgb = _hex_to_rgb(left_color) + (255,)
    vs_rgb = _hex_to_rgb(vs_color) + (255,)
    right_rgb = _hex_to_rgb(right_color) + (255,)
    draw.text((x, y + (total_h - left_h) // 2), left_text, font=font, fill=left_rgb)
    draw.text((x + left_w + spacing, y + (total_h - vs_h) // 2), vs_text, font=font_vs, fill=vs_rgb)
    draw.text((x + left_w + spacing + vs_w + spacing, y + (total_h - right_h) // 2), right_text, font=font, fill=right_rgb)
    return img
# ============================================================
# Preset styles
# ============================================================
PRESET_STYLES = {
    "subtitle": {
        "font_size": 48,
        "font_color": "#FFFFFF",
        "stroke_color": "#000000",
        "stroke_width": 3,
        "version": "v2"
    },
    "highlight": {
        # Warm off-white fill + light stroke + dark shadow, tuned for
        # light-brown backgrounds
        "font_size": 90,
        "font_color": "#F7E7D3",
        "stroke_color": "#C9B59A",  # light stroke
        "stroke_width": 4,
        "type": "shadow",
        "shadow_color": "#3A2C1F",  # dark brown shadow
        "shadow_offset": (3, 3),
        "shadow_blur": 10,
        "padding": 32,
        "version": "gloda"
    },
    "warning": {
        # Low-saturation terracotta red + cream stroke + dark brown shadow
        "font_size": 80,
        "font_color": "#D96B4F",
        "stroke_color": "#F6E5D6",
        "stroke_width": 4,
        "type": "shadow",
        "shadow_color": "#3A2C1F",
        "shadow_offset": (3, 3),
        "shadow_blur": 10,
        "padding": 30,
        "version": "gloda"
    },
    "success": {
        "font_size": 52,
        "font_color": "#4CAF50",
        "stroke_color": "#FFFFFF",
        "stroke_width": 4,
        "version": "v2"
    },
    "price": {
        # Price tag: warm red price + cream currency symbol + dark stroke
        "font_size": 110,
        "price_color": "#E25B4F",
        "currency_color": "#F6E5D6",
        "stroke_color": "#3A2C1F",
        "stroke_width": 8,
        "type": "price",
        "version": "gloda"
    },
    "cta_button": {
        # Warm orange button with a light shadow
        "font_size": 46,
        "font_color": "#FFFFFF",
        "bg_color": "#E6763A",
        "corner_radius": 32,
        "type": "button",
        "shadow": True,
        "version": "gloda"
    }
}
def create_fancy_text(
    text: str,
    style: str = "subtitle",
    custom_style: Dict[str, Any] = None,
    cache: bool = True
) -> str:
    """
    Unified entry point for rendering a fancy-text PNG.

    Args:
        text: Text content.
        style: Preset style name (falls back to "subtitle" when unknown).
        custom_style: Overrides merged on top of the preset.
        cache: Whether to reuse/store the result in the cache directory.

    Returns:
        Path to the rendered PNG file.
    """
    # Merge preset + overrides.
    base_style = PRESET_STYLES.get(style, PRESET_STYLES["subtitle"]).copy()
    if custom_style:
        base_style.update(custom_style)
    # Cache lookup keyed on text + effective style.
    if cache:
        cache_name = _cache_key(text, base_style)
        cache_path = FANCY_TEXT_CACHE_DIR / f"{cache_name}.png"
        if cache_path.exists():
            return str(cache_path)
    # Dispatch on the style's renderer type; each branch forwards only the
    # kwargs that renderer accepts.
    style_type = base_style.pop("type", None)
    if style == "price" or style_type == "price":
        img = create_price_tag(text, **{k: v for k, v in base_style.items() if k in [
            "currency", "font_size", "price_color", "currency_color", "stroke_color", "stroke_width", "font_name"
        ]})
    elif style == "cta_button" or style_type == "button":
        img = create_button(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "font_color", "bg_color", "corner_radius", "padding", "font_name", "shadow"
        ]})
    elif style_type == "bubble":
        img = create_bubble_text(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "font_color", "bg_color", "border_color", "border_width",
            "corner_radius", "padding", "font_name", "tail_direction"
        ]})
    elif style_type == "gradient":
        img = create_text_with_gradient(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "gradient_colors", "gradient_direction", "stroke_color", "stroke_width", "font_name", "padding"
        ]})
    elif style_type == "shadow":
        # Forward stroke_color/stroke_width as well: the "highlight" and
        # "warning" presets define them for the two-layer safe stroke, but
        # they were previously filtered out and silently ignored.
        img = create_text_with_shadow(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "font_color", "shadow_color", "shadow_offset", "shadow_blur",
            "font_name", "padding", "stroke_color", "stroke_width"
        ]})
    else:
        # Default: plain outlined text.
        img = create_text_with_stroke(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "font_color", "stroke_color", "stroke_width", "font_name", "padding"
        ]})
    # Persist: into the cache, or a per-process temp file when uncached.
    if cache:
        output_path = str(cache_path)
    else:
        output_path = str(config.TEMP_DIR / f"fancy_{hash(text)}_{os.getpid()}.png")
    img.save(output_path, "PNG")
    logger.info(f"Created fancy text: '{text[:20]}...' -> {output_path}")
    return output_path
def batch_create_fancy_texts(
    configs: List[Dict[str, Any]]
) -> List[str]:
    """
    Render a batch of fancy-text images.

    Args:
        configs: One dict per image: {text, style, custom_style}.

    Returns:
        PNG file paths, in input order.
    """
    return [
        create_fancy_text(
            text=item.get("text", ""),
            style=item.get("style", "subtitle"),
            custom_style=item.get("custom_style")
        )
        for item in configs
    ]

960
modules/ffmpeg_utils.py Normal file
View File

@@ -0,0 +1,960 @@
"""
FFmpeg 视频处理工具模块
支持规模化批量视频处理:拼接、字幕、叠加、混音
"""
import os
import re
import subprocess
import tempfile
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import config
logger = logging.getLogger(__name__)
# FFmpeg/FFprobe binaries: prefer project-bundled copies, fall back to PATH.
FFMPEG_PATH = str(config.BASE_DIR / "bin" / "ffmpeg") if (config.BASE_DIR / "bin" / "ffmpeg").exists() else "ffmpeg"
FFPROBE_PATH = str(config.BASE_DIR / "bin" / "ffprobe") if (config.BASE_DIR / "bin" / "ffprobe").exists() else "ffprobe"
# Font search order: Linux system CJK fonts first, then project-bundled
# fonts, then macOS fonts as a local-debugging fallback.
DEFAULT_FONT_PATHS = [
    # Linux system-wide CJK fonts (most reliable in server environments)
    "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf",
    "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",
    # Project-bundled fonts (NOTE: must not be un-pulled LFS pointer files)
    str(config.FONTS_DIR / "HarmonyOS-Sans-SC-Regular.ttf"),
    str(config.FONTS_DIR / "AlibabaPuHuiTi-Regular.ttf"),
    # macOS fonts (local debugging only)
    "/System/Library/Fonts/PingFang.ttc",
    "/System/Library/Fonts/STHeiti Medium.ttc",
    "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
]
def _get_font_path() -> str:
    """Return the first usable font from DEFAULT_FONT_PATHS.

    Files of 1000 bytes or less are treated as invalid (e.g. LFS pointer
    stubs). Falls back to the plain name "Arial" so ffmpeg never receives
    an empty fontfile and crashes.
    """
    for candidate in DEFAULT_FONT_PATHS:
        if os.path.exists(candidate) and os.path.getsize(candidate) > 1000:
            return candidate
    return "Arial"
def _sanitize_text(text: str) -> str:
"""
去除可能导致 ffmpeg 命令行错误的特殊控制字符,但保留 Emoji、数字、标点和各国语言。
"""
if not text:
return ""
# 不再过滤任何字符,只确保不是 None
return text
def add_silence_audio(video_path: str, output_path: str) -> str:
    """
    Add a silent stereo 44.1 kHz AAC track to a video that has no audio,
    so downstream filter graphs can always map stream 0:a.

    Args:
        video_path: Input video file.
        output_path: Destination file (the video stream is copied as-is).

    Returns:
        The output path.
    """
    command = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-f", "lavfi",
        "-i", "anullsrc=channel_layout=stereo:sample_rate=44100",
        "-shortest",
        "-c:v", "copy",
        "-c:a", "aac",
        output_path,
    ]
    _run_ffmpeg(command)
    return output_path
def _run_ffmpeg(cmd: List[str], check: bool = True) -> subprocess.CompletedProcess:
    """Execute an FFmpeg command line and return the completed process.

    stderr is always echoed (fonts and filters emit warnings there). With
    check=True a non-zero exit raises CalledProcessError after logging.
    """
    logger.debug(f"FFmpeg command: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=check)
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg failed: {e.stderr}")
        raise
    # Surface stderr on success and failure alike, to diagnose font issues.
    if result.stderr:
        print(f"[FFmpeg stderr] {result.stderr}", flush=True)
    if result.returncode != 0:
        logger.error(f"FFmpeg stderr: {result.stderr}")
    return result
def get_video_info(video_path: str) -> Dict[str, Any]:
    """Probe a media file with ffprobe.

    Returns:
        Dict with "duration" (seconds, float), "width"/"height" (pixels,
        0 when there is no video stream) and "fps" (defaults to 30).

    Raises:
        ValueError: If ffprobe exits non-zero.
    """
    cmd = [
        FFPROBE_PATH,
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        "-show_streams",
        video_path
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise ValueError(f"Failed to probe video: {video_path}")
    import json
    data = json.loads(result.stdout)
    # Extract only the fields downstream code needs.
    info = {
        "duration": float(data.get("format", {}).get("duration", 0)),
        "width": 0,
        "height": 0,
        "fps": 30
    }
    for stream in data.get("streams", []):
        if stream.get("codec_type") == "video":
            info["width"] = stream.get("width", 0)
            info["height"] = stream.get("height", 0)
            # Frame rate arrives as a ratio ("30/1") or a number ("29.97").
            fps_str = stream.get("r_frame_rate", "30/1")
            if "/" in fps_str:
                num, den = fps_str.split("/")
                info["fps"] = float(num) / float(den) if float(den) != 0 else 30
            else:
                info["fps"] = float(fps_str)
            break
    return info
def concat_videos(
    video_paths: List[str],
    output_path: str,
    target_size: Tuple[int, int] = (1080, 1920)
) -> str:
    """
    Concatenate multiple video clips into one (video streams only).

    Each input is scaled to fit ``target_size`` while preserving aspect
    ratio, padded with centered black bars, then joined with the concat
    filter. Audio is dropped (concat a=0).

    Args:
        video_paths: Input video paths.
        output_path: Output file path.
        target_size: Target (width, height); defaults to portrait 1080x1920.

    Returns:
        The output file path.

    Raises:
        ValueError: If video_paths is empty.
    """
    if not video_paths:
        raise ValueError("No video paths provided")
    logger.info(f"Concatenating {len(video_paths)} videos...")
    # NOTE(review): this concat demuxer list file is written and deleted
    # but never passed to the ffmpeg command below (which uses
    # filter_complex inputs instead) — confirm whether it is still needed.
    concat_file = config.TEMP_DIR / f"concat_{os.getpid()}.txt"
    with open(concat_file, "w", encoding="utf-8") as f:
        for vp in video_paths:
            # Use absolute paths inside the list file.
            abs_path = os.path.abspath(vp)
            f.write(f"file '{abs_path}'\n")
    width, height = target_size
    # Normalize every input to the target resolution before concatenation:
    # scale preserves aspect ratio, pad centers the frame on black bars.
    filter_parts = []
    for i in range(len(video_paths)):
        filter_parts.append(
            f"[{i}:v]scale={width}:{height}:force_original_aspect_ratio=decrease,"
            f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:black,setsar=1[v{i}]"
        )
    # Join all normalized video streams.
    concat_inputs = "".join([f"[v{i}]" for i in range(len(video_paths))])
    filter_parts.append(f"{concat_inputs}concat=n={len(video_paths)}:v=1:a=0[outv]")
    filter_complex = ";".join(filter_parts)
    # Build the ffmpeg command with one -i per input.
    cmd = [FFMPEG_PATH, "-y"]
    for vp in video_paths:
        cmd.extend(["-i", vp])
    cmd.extend([
        "-filter_complex", filter_complex,
        "-map", "[outv]",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-pix_fmt", "yuv420p",
        output_path
    ])
    _run_ffmpeg(cmd)
    # Remove the temporary list file.
    if concat_file.exists():
        concat_file.unlink()
    logger.info(f"Concatenated video saved: {output_path}")
    return output_path
def concat_videos_with_audio(
    video_paths: List[str],
    output_path: str,
    target_size: Tuple[int, int] = (1080, 1920)
) -> str:
    """
    Concatenate video clips while keeping their audio tracks.

    Video streams are normalized like concat_videos; audio streams are
    forced to 44.1 kHz stereo and concatenated in parallel. If the audio
    graph fails (e.g. an input lacks an audio stream), falls back to the
    video-only concat_videos.

    Args:
        video_paths: Input video paths.
        output_path: Output file path.
        target_size: Target (width, height); defaults to portrait 1080x1920.

    Returns:
        The output file path.

    Raises:
        ValueError: If video_paths is empty.
    """
    if not video_paths:
        raise ValueError("No video paths provided")
    logger.info(f"Concatenating {len(video_paths)} videos with audio...")
    width, height = target_size
    n = len(video_paths)
    # Build the filter graph.
    filter_parts = []
    # Video: scale + pad every input to the target size.
    for i in range(n):
        filter_parts.append(
            f"[{i}:v]scale={width}:{height}:force_original_aspect_ratio=decrease,"
            f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:black,setsar=1[v{i}]"
        )
    # Audio: force a uniform sample rate / channel layout so concat works.
    for i in range(n):
        filter_parts.append(f"[{i}:a]aformat=sample_rates=44100:channel_layouts=stereo[a{i}]")
    # Join video and audio streams separately.
    v_concat = "".join([f"[v{i}]" for i in range(n)])
    a_concat = "".join([f"[a{i}]" for i in range(n)])
    filter_parts.append(f"{v_concat}concat=n={n}:v=1:a=0[outv]")
    filter_parts.append(f"{a_concat}concat=n={n}:v=0:a=1[outa]")
    filter_complex = ";".join(filter_parts)
    cmd = [FFMPEG_PATH, "-y"]
    for vp in video_paths:
        cmd.extend(["-i", vp])
    cmd.extend([
        "-filter_complex", filter_complex,
        "-map", "[outv]",
        "-map", "[outa]",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "aac",
        "-b:a", "128k",
        "-pix_fmt", "yuv420p",
        output_path
    ])
    try:
        _run_ffmpeg(cmd)
    except subprocess.CalledProcessError:
        # Audio concat failed: degrade to video-only output instead of
        # aborting the whole pipeline.
        logger.warning("Audio concat failed, falling back to video only")
        return concat_videos(video_paths, output_path, target_size)
    logger.info(f"Concatenated video with audio saved: {output_path}")
    return output_path
def add_subtitle(
    video_path: str,
    text: str,
    start: float,
    duration: float,
    output_path: str,
    style: Dict[str, Any] = None
) -> str:
    """
    Burn a single timed subtitle into a video with the drawtext filter.

    Args:
        video_path: Input video path.
        text: Subtitle text.
        start: Start time in seconds.
        duration: Display duration in seconds.
        output_path: Output file path.
        style: Optional style dict {
            fontsize: font size,
            fontcolor: font color,
            borderw: border width,
            bordercolor: border color,
            x: x position (drawtext expression, e.g. "(w-text_w)/2"),
            y: y position,
            font: font file path or name
        }

    Returns:
        The output file path.
    """
    style = style or {}
    # Style defaults: white text, black border, bottom-center placement.
    fontsize = style.get("fontsize", 48)
    fontcolor = style.get("fontcolor", "white")
    borderw = style.get("borderw", 3)
    bordercolor = style.get("bordercolor", "black")
    x = style.get("x", "(w-text_w)/2")  # horizontally centered
    y = style.get("y", "h-200")  # near the bottom
    # Prefer the dynamically detected working font over a possibly broken
    # hard-coded path.
    default_font_path = _get_font_path()
    font = style.get("font", default_font_path)
    # Escape drawtext-special characters, backslash first so the later
    # escapes are not double-escaped. This matches add_multiple_subtitles;
    # the previous version missed backslash and percent, which broke the
    # filter for texts containing them.
    escaped_text = text.replace("\\", "\\\\").replace("'", "\\'").replace(":", "\\:").replace("%", "\\%")
    # drawtext filter with a time-window enable expression.
    drawtext = (
        f"drawtext=text='{escaped_text}':"
        f"fontfile='{font}':"
        f"fontsize={fontsize}:"
        f"fontcolor={fontcolor}:"
        f"borderw={borderw}:"
        f"bordercolor={bordercolor}:"
        f"x={x}:y={y}:"
        f"enable='between(t,{start},{start + duration})'"
    )
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-vf", drawtext,
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        "-pix_fmt", "yuv420p",
        output_path
    ]
    _run_ffmpeg(cmd)
    logger.info(f"Added subtitle: '{text[:20]}...' at {start}s")
    return output_path
def wrap_text(text: str, max_chars: int = 18) -> str:
    """
    Naive hard wrap: insert a newline every ``max_chars`` characters.

    Text that already contains a newline is assumed to be manually wrapped
    and is returned unchanged. Every character (CJK or Latin) counts as one
    column — a deliberate simplification for mixed-width subtitle text.
    """
    if not text:
        return ""
    if "\n" in text:
        return text
    pieces = []
    column = 0
    for ch in text:
        if column >= max_chars:
            pieces.append("\n")
            column = 0
        pieces.append(ch)
        column += 1
    return "".join(pieces)
def mix_audio_at_offset(
    base_audio: str,
    overlay_audio: str,
    offset: float,
    output_path: str,
    base_volume: float = 1.0,
    overlay_volume: float = 1.0
) -> str:
    """
    Mix an overlay audio file into a base track at a given offset.

    The overlay is delayed by ``offset`` seconds (adelay on both channels),
    each input gets independent volume scaling, and the result keeps the
    base's duration (amix duration=first). If the base file is missing, the
    overlay path is returned unchanged as a best-effort fallback.

    Args:
        base_audio: Path of the base track.
        overlay_audio: Path of the audio to mix in.
        offset: Start position of the overlay, in seconds.
        output_path: Destination file.
        base_volume: Gain applied to the base track.
        overlay_volume: Gain applied to the overlay track.

    Returns:
        output_path, or overlay_audio when the base does not exist.
    """
    # Missing base: fall back to the overlay alone instead of failing.
    if not os.path.exists(base_audio):
        logger.warning(f"Base audio not found: {base_audio}")
        return overlay_audio
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", base_audio,
        "-i", overlay_audio,
        "-filter_complex",
        f"[0:a]volume={base_volume}[a0];[1:a]volume={overlay_volume},adelay={int(offset*1000)}|{int(offset*1000)}[a1];[a0][a1]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[out]",
        "-map", "[out]",
        "-c:a", "mp3",  # Use MP3 for audio only mixing
        output_path
    ]
    _run_ffmpeg(cmd)
    return output_path
def adjust_audio_duration(
    input_path: str,
    target_duration: float,
    output_path: str
) -> str:
    """
    Fit an audio file into a target duration by speeding it up only.

    Policy (per product requirement):
      - audio longer than target  -> speed up with atempo (capped at 2.0x)
      - audio shorter or equal    -> keep original speed (never slow down)

    Args:
        input_path: Source audio path.
        target_duration: Desired duration in seconds.
        output_path: Destination path.

    Returns:
        output_path on success; input_path when the duration cannot be
        probed; None when the input file does not exist.
    """
    if not os.path.exists(input_path):
        return None
    current_duration = float(get_audio_info(input_path).get("duration", 0))
    if current_duration <= 0:
        return input_path
    # Shorter-or-equal audio is copied through untouched (no slow-down).
    if current_duration <= target_duration:
        import shutil
        shutil.copy(input_path, output_path)
        logger.info(f"Audio ({current_duration:.2f}s) <= target ({target_duration:.2f}s), keeping original speed")
        return output_path
    # Too long: speed up, capped at 2x to limit pitch artifacts.
    speed_ratio = current_duration / target_duration
    speed_ratio = min(speed_ratio, 2.0)
    logger.info(f"Audio ({current_duration:.2f}s) > target ({target_duration:.2f}s), speeding up {speed_ratio:.2f}x")
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", input_path,
        "-filter:a", f"atempo={speed_ratio}",
        output_path
    ]
    _run_ffmpeg(cmd)
    return output_path
def get_audio_info(file_path: str) -> Dict[str, Any]:
    """Probe an audio file; delegates to get_video_info (ffprobe handles both)."""
    return get_video_info(file_path)
def wrap_text_smart(text: str, max_chars: int = 15) -> str:
    """
    Two-line subtitle wrap with a "short top, long bottom" preference.

    Tries to break at punctuation or a space near the middle of the text;
    when no break point exists, force-splits at 40% of the length so the
    first line is the shorter one.

    Args:
        text: Subtitle text (single logical line).
        max_chars: Maximum length that may stay unwrapped.

    Returns:
        The text with at most one newline inserted.
    """
    if not text or len(text) <= max_chars:
        return text
    # Candidate break characters: CJK punctuation, space, ASCII punctuation.
    # (The CJK entries were previously corrupted into empty strings, which
    # can never equal a single character — so CJK text never split at
    # punctuation; restored here.)
    split_chars = ["。", "，", "！", "？", " ", ",", ".", "!", "?"]
    best_split = -1
    # Pick the break point closest to the middle of the text.
    mid = len(text) // 2
    for i in range(len(text)):
        if text[i] in split_chars:
            if abs(i - mid) < abs(best_split - mid):
                best_split = i
    if best_split != -1 and best_split < len(text) - 1:
        return text[:best_split+1] + "\n" + text[best_split+1:]
    # No punctuation found: force a 40/60 split (short top, long bottom).
    split_idx = int(len(text) * 0.4)
    return text[:split_idx] + "\n" + text[split_idx:]
def add_multiple_subtitles(
    video_path: str,
    subtitles: List[Dict[str, Any]],
    output_path: str,
    default_style: Dict[str, Any] = None
) -> str:
    """
    Burn several timed subtitles into a video in one ffmpeg pass.

    Args:
        video_path: Input video path.
        subtitles: List of {text, start, duration, style} dicts; each
            item's style overrides default_style.
        output_path: Output file path.
        default_style: Base drawtext style shared by all subtitles.

    Returns:
        The output file path (a plain copy of the input when subtitles is
        empty).
    """
    if not subtitles:
        # Nothing to draw: copy the input through unchanged.
        import shutil
        shutil.copy(video_path, output_path)
        return output_path
    default_style = default_style or {}
    # Force a known-complete CJK font: project NotoSansSC first, then the
    # Droid fallback, finally whatever _get_font_path() detects.
    # NOTE(review): the absolute /root/video-flow path is deployment-specific.
    font = "/root/video-flow/assets/fonts/NotoSansSC-Regular.otf"
    if not (os.path.exists(font) and os.path.getsize(font) > 1024 * 100):  # files over ~100 KB are considered real fonts
        font = "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf"
    if not (os.path.exists(font) and os.path.getsize(font) > 1024 * 100):
        font = _get_font_path()
    print(f"[SubDebug] Using font for subtitles: {font}", flush=True)
    # Build one drawtext filter per subtitle.
    filters = []
    for sub in subtitles:
        raw_text = sub.get("text", "")
        # Debug dump of the raw text (repr + hex) to diagnose odd glyphs.
        print(f"[SubDebug] Subtitle text repr: {repr(raw_text)}", flush=True)
        print(f"[SubDebug] Subtitle text hex: {' '.join(hex(ord(c)) for c in raw_text)}", flush=True)
        text = _sanitize_text(raw_text)
        # Hard-wrap long lines before rendering.
        text = wrap_text(text)
        start = sub.get("start", 0)
        duration = sub.get("duration", 3)
        style = {**default_style, **sub.get("style", {})}
        fontsize = style.get("fontsize", 48)
        fontcolor = style.get("fontcolor", "white")
        borderw = style.get("borderw", 3)
        bordercolor = style.get("bordercolor", "black")
        x = style.get("x", "(w-text_w)/2")
        y = style.get("y", "h-200")
        # Background box enabled by default for readability.
        box = style.get("box", 1)
        boxcolor = style.get("boxcolor", "black@0.5")
        boxborderw = style.get("boxborderw", 10)
        # Escape drawtext specials: backslash, single quote, colon, percent.
        escaped_text = text.replace("\\", "\\\\").replace("'", "\\'").replace(":", "\\:").replace("%", "\\%")
        drawtext = (
            f"drawtext=text='{escaped_text}':"
            f"fontfile='{font}':"
            f"fontsize={fontsize}:"
            f"fontcolor={fontcolor}:"
            f"borderw={borderw}:"
            f"bordercolor={bordercolor}:"
            f"box={box}:boxcolor={boxcolor}:boxborderw={boxborderw}:"
            f"x={x}:y={y}:"
            f"enable='between(t,{start},{start + duration})'"
        )
        filters.append(drawtext)
    # Chain the drawtext filters with commas into one -vf expression.
    vf = ",".join(filters)
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-vf", vf,
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        "-pix_fmt", "yuv420p",
        output_path
    ]
    _run_ffmpeg(cmd)
    logger.info(f"Added {len(subtitles)} subtitles")
    return output_path
def overlay_image(
    video_path: str,
    image_path: str,
    output_path: str,
    position: Tuple[int, int] = None,
    start: float = 0,
    duration: float = None,
    fade_in: float = 0,
    fade_out: float = 0
) -> str:
    """
    Overlay a transparent PNG (fancy text, watermark, etc.) onto a video.

    Args:
        video_path: input video path
        image_path: PNG image path (alpha channel supported)
        output_path: output file path
        position: (x, y) placement; centered when None
        start: overlay start time in seconds
        duration: display duration in seconds; until end of video when None
        fade_in: fade-in length in seconds
        fade_out: fade-out length in seconds

    Returns:
        The output file path.
    """
    clip_info = get_video_info(video_path)
    if duration is None:
        duration = clip_info["duration"] - start

    # Placement expression: explicit coordinates, otherwise centered.
    if position:
        pos_str = f"x={position[0]}:y={position[1]}"
    else:
        pos_str = "x=(W-w)/2:y=(H-h)/2"

    # Time window during which the overlay is visible.
    enable = f"enable='between(t,{start},{start + duration})'"
    overlay_filter = f"overlay={pos_str}:{enable}"

    # With fades, pre-process the image stream's alpha before overlaying.
    if fade_in > 0 or fade_out > 0:
        fade_steps = []
        if fade_in > 0:
            fade_steps.append(f"fade=t=in:st={start}:d={fade_in}:alpha=1")
        if fade_out > 0:
            fade_steps.append(
                f"fade=t=out:st={start + duration - fade_out}:d={fade_out}:alpha=1"
            )
        img_filter = ",".join(fade_steps)
        filter_complex = f"[1:v]{img_filter}[img];[0:v][img]{overlay_filter}[outv]"
    else:
        filter_complex = f"[0:v][1:v]{overlay_filter}[outv]"

    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-i", image_path,
        "-filter_complex", filter_complex,
        "-map", "[outv]",
        "-map", "0:a?",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        "-pix_fmt", "yuv420p",
        output_path
    ]
    _run_ffmpeg(cmd)
    logger.info(f"Overlaid image at {position or 'center'}, {start}s-{start+duration}s")
    return output_path
def overlay_multiple_images(
    video_path: str,
    images: List[Dict[str, Any]],
    output_path: str
) -> str:
    """
    Overlay several transparent PNG images onto a video in a single pass.

    Args:
        video_path: input video path
        images: overlay specs, each {path, x, y, start, duration}
        output_path: output file path

    Returns:
        The output file path.
    """
    # Nothing to overlay: copy the source straight through.
    if not images:
        import shutil
        shutil.copy(video_path, output_path)
        return output_path

    inputs = ["-i", video_path]
    for spec in images:
        inputs += ["-i", spec["path"]]

    # Chain the overlays: each step consumes the previous step's label.
    filter_parts = []
    current = "0:v"
    last_index = len(images) - 1
    for idx, spec in enumerate(images):
        x = spec.get("x", "(W-w)/2")
        y = spec.get("y", "(H-h)/2")
        start = spec.get("start", 0)
        duration = spec.get("duration", 999)
        enable = f"enable='between(t,{start},{start + duration})'"
        out_label = "outv" if idx == last_index else f"tmp{idx}"
        filter_parts.append(
            f"[{current}][{idx+1}:v]overlay=x={x}:y={y}:{enable}[{out_label}]"
        )
        current = out_label

    cmd = [FFMPEG_PATH, "-y"] + inputs + [
        "-filter_complex", ";".join(filter_parts),
        "-map", "[outv]",
        "-map", "0:a?",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        "-pix_fmt", "yuv420p",
        output_path
    ]
    _run_ffmpeg(cmd)
    logger.info(f"Overlaid {len(images)} images")
    return output_path
def mix_audio(
    video_path: str,
    audio_path: str,
    output_path: str,
    audio_volume: float = 1.0,
    video_volume: float = 0.1,
    audio_start: float = 0
) -> str:
    """
    Mix an extra audio track (voiceover, BGM, ...) into a video.

    Args:
        video_path: input video path
        audio_path: audio file to mix in
        output_path: output file path
        audio_volume: volume of the new audio (0-1)
        video_volume: volume of the original video audio (0-1)
        audio_start: offset in seconds before the new audio starts

    Returns:
        The output file path.

    Notes:
        If the source video has no audio stream, the amix graph fails;
        the fallback then attaches the new audio as the only track.
        (Removed a dead get_video_info() call that spawned an unused
        ffprobe: its result was never read.)
    """
    logger.info(f"Mixing audio: {audio_path}")
    # adelay takes milliseconds; delay both channels equally.
    delay_ms = int(audio_start * 1000)
    filter_complex = (
        f"[0:a]volume={video_volume}[va];"
        f"[1:a]adelay={delay_ms}|{delay_ms},volume={audio_volume}[aa];"
        f"[va][aa]amix=inputs=2:duration=longest:dropout_transition=0:normalize=0[outa]"
    )
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-i", audio_path,
        "-filter_complex", filter_complex,
        "-map", "0:v",
        "-map", "[outa]",
        "-c:v", "copy",
        "-c:a", "aac",
        "-b:a", "192k",
        output_path
    ]
    try:
        _run_ffmpeg(cmd)
    except subprocess.CalledProcessError:
        # Source video has no audio track: attach the new audio directly.
        logger.warning("Video has no audio track, adding audio directly")
        cmd = [
            FFMPEG_PATH, "-y",
            "-i", video_path,
            "-i", audio_path,
            "-map", "0:v",
            "-map", "1:a",
            "-c:v", "copy",
            "-c:a", "aac",
            "-b:a", "192k",
            output_path
        ]
        _run_ffmpeg(cmd)
    logger.info(f"Audio mixed: {output_path}")
    return output_path
def add_bgm(
    video_path: str,
    bgm_path: str,
    output_path: str,
    bgm_volume: float = 0.06,
    loop: bool = True,
    ducking: bool = True,
    duck_gain_db: float = -6.0,
    fade_in: float = 1.0,
    fade_out: float = 1.0
) -> str:
    """
    Add background music (looped automatically to match the video length).

    Args:
        video_path: input video path
        bgm_path: BGM file path
        output_path: output file path
        bgm_volume: BGM volume applied after the fades
        loop: loop the BGM to cover the whole video
        ducking: duck the BGM under the main audio via sidechaincompress
        duck_gain_db: intended duck amount in dB
            (NOTE(review): currently unused by the filter graph -- confirm)
        fade_in: BGM fade-in length in seconds
        fade_out: BGM fade-out length in seconds

    Returns:
        The output file path.
    """
    info = get_video_info(video_path)
    video_duration = info["duration"]
    if loop:
        # Loop the BGM indefinitely, then trim it to the video duration
        # and apply fades before setting the final volume.
        bgm_chain = (
            f"[1:a]aloop=-1:size=2e+09,asetpts=N/SR/TB,"
            f"atrim=0:{video_duration},"
            f"afade=t=in:st=0:d={fade_in},"
            f"afade=t=out:st={max(video_duration - fade_out, 0)}:d={fade_out},"
            f"volume={bgm_volume}[bgm]"
        )
    else:
        bgm_chain = (
            f"[1:a]"
            f"afade=t=in:st=0:d={fade_in},"
            f"afade=t=out:st={max(video_duration - fade_out, 0)}:d={fade_out},"
            f"volume={bgm_volume}[bgm]"
        )
    if ducking:
        # Use conservative sidechaincompress parameters to avoid
        # "unsupported option" failures on older ffmpeg builds.
        filter_complex = (
            f"{bgm_chain};"
            f"[0:a][bgm]sidechaincompress=threshold=0.1:ratio=4:attack=5:release=250:makeup=1:mix=1:level_in=1:level_sc=1[outa]"
        )
    else:
        filter_complex = f"{bgm_chain};[0:a][bgm]amix=inputs=2:duration=first[outa]"
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-stream_loop", "-1" if loop else "0",
        "-i", bgm_path,
        "-filter_complex", filter_complex,
        "-map", "0:v",
        "-map", "[outa]",
        "-c:v", "copy",
        "-c:a", "aac",
        "-b:a", "192k",
        "-t", str(video_duration),
        output_path
    ]
    try:
        _run_ffmpeg(cmd)
    except subprocess.CalledProcessError:
        # If sidechain fails, fall back to amix: keeps the original audio
        # plus low-volume BGM, just without ducking.
        logger.warning("Sidechain failed, fallback to simple amix for BGM")
        filter_complex = f"{bgm_chain};[0:a][bgm]amix=inputs=2:duration=first[outa]"
        cmd = [
            FFMPEG_PATH, "-y",
            "-i", video_path,
            "-stream_loop", "-1" if loop else "0",
            "-i", bgm_path,
            "-filter_complex", filter_complex,
            "-map", "0:v",
            "-map", "[outa]",
            "-c:v", "copy",
            "-c:a", "aac",
            "-b:a", "192k",
            "-t", str(video_duration),
            output_path
        ]
        _run_ffmpeg(cmd)
    logger.info(f"BGM added: {output_path}")
    return output_path
def trim_video(
    video_path: str,
    output_path: str,
    start: float = 0,
    duration: float = None,
    end: float = None
) -> str:
    """
    Trim a video.

    Args:
        video_path: input video path
        output_path: output file path
        start: start time in seconds
        duration: clip length in seconds (takes precedence over ``end``)
        end: absolute end time in seconds; alternative to ``duration``

    Returns:
        The output file path.
    """
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-ss", str(start)
    ]
    # Explicit None checks: the original truthiness test also skipped 0,
    # and its log line computed `start + duration` with duration=None,
    # raising TypeError when neither duration nor end was given.
    if duration is not None:
        cmd.extend(["-t", str(duration)])
    elif end is not None:
        cmd.extend(["-to", str(end)])
    cmd.extend([
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        output_path
    ])
    _run_ffmpeg(cmd)
    if duration is not None:
        stop_label = start + duration
    elif end is not None:
        stop_label = end
    else:
        stop_label = "EOF"  # no bound given: ffmpeg keeps everything after start
    logger.info(f"Trimmed video: {start}s - {stop_label}s")
    return output_path
def speed_up_video(
video_path: str,
output_path: str,
speed: float = 1.5
) -> str:
"""
加速/减速视频
Args:
video_path: 输入视频路径
output_path: 输出路径
speed: 速度倍率(>1 加速,<1 减速)
"""
# setpts 控制视频速度atempo 控制音频速度
video_filter = f"setpts={1/speed}*PTS"
# atempo 只支持 0.5-2.0,超出需要链式处理
if speed > 2.0:
audio_filter = "atempo=2.0,atempo=" + str(speed / 2.0)
elif speed < 0.5:
audio_filter = "atempo=0.5,atempo=" + str(speed / 0.5)
else:
audio_filter = f"atempo={speed}"
cmd = [
FFMPEG_PATH, "-y",
"-i", video_path,
"-vf", video_filter,
"-af", audio_filter,
"-c:v", "libx264",
"-preset", "fast",
"-crf", "23",
"-c:a", "aac",
output_path
]
_run_ffmpeg(cmd)
logger.info(f"Speed changed to {speed}x: {output_path}")
return output_path

491
modules/image_gen.py Normal file
View File

@@ -0,0 +1,491 @@
"""
连贯生图模块 (Volcengine Doubao)
负责根据分镜脚本和原始素材生成一系列连贯的分镜图片
"""
import base64
import logging
import os
import time
import requests
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from PIL import Image
import io
from modules import storage
import config
logger = logging.getLogger(__name__)
class ImageGenerator:
    """Coherent storyboard image generator (Volcengine provider).

    Turns per-scene visual prompts plus reference images into a series of
    visually consistent storyboard stills.  Three backends are supported:
    Volcengine Doubao, Gemini (Wuyin Keji relay), and Shubiaobiao.
    """

    def __init__(self):
        # NOTE(review): self.api_key and self.model are stored, but the
        # request code below reads config.VOLC_API_KEY directly -- confirm.
        self.api_key = config.VOLC_API_KEY
        # Endpoint: https://ark.cn-beijing.volces.com/api/v3/images/generations
        self.endpoint = f"https://ark.cn-beijing.volces.com/api/v3/images/generations"
        self.model = config.IMAGE_MODEL_ID

    def _encode_image(self, image_path: str) -> str:
        """Read an image, downscale to <=1024px, and return base64 JPEG.

        Returns an empty string when the file cannot be processed.
        """
        try:
            with Image.open(image_path) as img:
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                max_size = 1024
                if max(img.size) > max_size:
                    img.thumbnail((max_size, max_size), Image.LANCZOS)
                buffer = io.BytesIO()
                img.save(buffer, format="JPEG", quality=80)
                return base64.b64encode(buffer.getvalue()).decode('utf-8')
        except Exception as e:
            logger.error(f"Error processing image {image_path}: {e}")
            return ""

    def generate_single_scene_image(
        self,
        scene: Dict[str, Any],
        original_image_path: Any,
        previous_image_path: Optional[str] = None,
        model_provider: str = "shubiaobiao",  # "shubiaobiao", "gemini", "doubao"
        visual_anchor: str = ""  # visual anchor, force-prepended to the prompt
    ) -> Optional[str]:
        """
        Generate a single storyboard image (public entry point).

        Args:
            scene: scene dict; reads "id" and "visual_prompt"
            original_image_path: product reference image path(s) -- str or list
            previous_image_path: previous scene's image, for continuity
            model_provider: which backend to dispatch to
            visual_anchor: product-appearance anchor text for consistency

        Returns:
            Local path of the generated image.

        Raises:
            Re-raises any backend error; PermissionError is logged as critical.
        """
        scene_id = scene["id"]
        visual_prompt = scene.get("visual_prompt", "")
        # Force-prepend the visual anchor so the product's look stays
        # consistent across generated scenes.
        if visual_anchor and visual_anchor not in visual_prompt:
            visual_prompt = f"[{visual_anchor}] {visual_prompt}"
            logger.info(f"Scene {scene_id}: Prepended visual_anchor to prompt")
        logger.info(f"Generating image for Scene {scene_id} (Provider: {model_provider})...")
        input_images = []
        # Handle original_image_path (can be str or list)
        if isinstance(original_image_path, list):
            input_images.extend(original_image_path)
        elif isinstance(original_image_path, str) and original_image_path:
            input_images.append(original_image_path)
        if previous_image_path:
            input_images.append(previous_image_path)
        try:
            output_path = self._generate_single_image(
                prompt=visual_prompt,
                reference_images=input_images,
                output_filename=f"scene_{scene_id}_{int(time.time())}.png",
                provider=model_provider
            )
            if output_path:
                return output_path
            else:
                raise RuntimeError(f"Image generation returned empty for Scene {scene_id}")
        except PermissionError as e:
            logger.error(f"Critical API Error for Scene {scene_id}: {e}")
            raise e
        except Exception as e:
            logger.error(f"Image generation failed for Scene {scene_id}: {e}")
            raise e

    def generate_group_images_doubao(
        self,
        scenes: List[Dict[str, Any]],
        reference_images: List[str],
        visual_anchor: str = ""  # visual anchor for cross-scene consistency
    ) -> Dict[int, str]:
        """
        Doubao group (batch) image generation: concatenates all scene
        prompts into one request and generates the whole set at once.

        Args:
            scenes: scene dicts (reads "id" and "visual_prompt")
            reference_images: local reference image paths (uploaded first)
            visual_anchor: anchor text placed in the global prompt section

        Returns:
            Mapping of scene id -> local image path.  Scenes the API did
            not return an image for are simply missing from the mapping.
        """
        logger.info("Starting Doubao Group Image Generation...")
        # 1. Join the per-scene prompts.
        # Format: "Global: [Visual Anchor] ... | S1: ... | S2: ..."
        scene_prompts = []
        for scene in scenes:
            # Per-scene visual prompt
            p = scene.get("visual_prompt", "")
            scene_prompts.append(f"S{scene['id']}:{p}")
        combined_scenes_text = " | ".join(scene_prompts)
        # Build the combined prompt -- visual_anchor goes in the Global part.
        global_context = f"[{visual_anchor}] Consistent product appearance & style." if visual_anchor else "Consistent product appearance & style."
        combined_prompt = (
            f"Global: {global_context}\n"
            f"{combined_scenes_text}\n"
            "Req: 1 img per scene. Follow specific angles."
        )
        logger.info(f"Visual Anchor applied to group prompt: {visual_anchor[:50]}..." if visual_anchor else "No visual_anchor")
        # Record the prompt length for reference.
        logger.info(f"Doubao Group Prompt Length: {len(combined_prompt)} chars")
        # 2. Build the payload.
        payload = {
            "model": config.DOUBAO_IMG_MODEL,
            "prompt": combined_prompt,
            "sequential_image_generation": "auto",  # enable group generation
            "sequential_image_generation_options": {
                "max_images": len(scenes)  # cap the number of images
            },
            "response_format": "url",
            "size": "1440x2560",
            "stream": False,
            "watermark": False
        }
        # 3. Upload reference images and attach their URLs.
        img_urls = []
        if reference_images:
            for ref_path in reference_images:
                if os.path.exists(ref_path):
                    try:
                        url = storage.upload_file(ref_path)
                        if url: img_urls.append(url)
                    except Exception as e:
                        logger.warning(f"Failed to upload ref image {ref_path}: {e}")
        if img_urls:
            payload["image_urls"] = img_urls
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {config.VOLC_API_KEY}"
        }
        try:
            logger.info(f"Submitting Doubao Group Request (Scenes: {len(scenes)})...")
            resp = requests.post(self.endpoint, json=payload, headers=headers, timeout=240)
            resp.raise_for_status()
            data = resp.json()
            results = {}
            if "data" in data:
                items = data["data"]
                logger.info(f"Doubao returned {len(items)} images.")
                # Map the returned images back onto the scenes,
                # assuming the API preserves scene order.
                for i, item in enumerate(items):
                    if i < len(scenes):
                        scene_id = scenes[i]["id"]
                        image_url = item.get("url")
                        if image_url:
                            # Download
                            img_resp = requests.get(image_url, timeout=60)
                            output_path = config.TEMP_DIR / f"scene_{scene_id}_{int(time.time())}.png"
                            with open(output_path, "wb") as f:
                                f.write(img_resp.content)
                            results[scene_id] = str(output_path)
            return results
        except Exception as e:
            logger.error(f"Doubao Group Generation Failed: {e}")
            raise e

    def _generate_single_image(
        self,
        prompt: str,
        reference_images: List[str],
        output_filename: str,
        provider: str = "shubiaobiao"
    ) -> Optional[str]:
        """Unified dispatch to the provider-specific implementations."""
        if provider == "doubao":
            return self._generate_single_image_doubao(prompt, reference_images, output_filename)
        elif provider == "gemini":
            return self._generate_single_image_gemini(prompt, reference_images, output_filename)
        else:
            return self._generate_single_image_shubiao(prompt, reference_images, output_filename)

    def _generate_single_image_doubao(
        self,
        prompt: str,
        reference_images: List[str],
        output_filename: str
    ) -> Optional[str]:
        """Generate one image via Volcengine Doubao (Image API).

        Reference images are uploaded to R2 first and passed as URLs.
        Returns the local path of the downloaded image.
        """
        # 1. Upload all reference images to R2
        img_urls = []
        if reference_images:
            for ref_path in reference_images:
                if os.path.exists(ref_path):
                    try:
                        url = storage.upload_file(ref_path)
                        if url:
                            img_urls.append(url)
                            logger.info(f"Uploaded Doubao ref image: {url}")
                    except Exception as e:
                        logger.warning(f"Failed to upload Doubao ref image {ref_path}: {e}")
        payload = {
            "model": config.DOUBAO_IMG_MODEL,
            "prompt": prompt,
            "sequential_image_generation": "disabled",
            "response_format": "url",
            "size": "1440x2560",
            "stream": False,
            "watermark": False
        }
        if img_urls:
            payload["image_urls"] = img_urls
            logger.info(f"Doubao Image Payload: prompt='{prompt[:20]}...', image_urls={len(img_urls)}")
        else:
            logger.info(f"Doubao Image Payload: prompt='{prompt[:20]}...', no reference images")
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {config.VOLC_API_KEY}"
        }
        try:
            logger.info(f"Submitting to Doubao Image: {self.endpoint}")
            resp = requests.post(self.endpoint, json=payload, headers=headers, timeout=180)
            if resp.status_code != 200:
                msg = f"Doubao Image Failed ({resp.status_code}): {resp.text}"
                logger.error(msg)
                raise RuntimeError(msg)
            data = resp.json()
            if "data" in data and len(data["data"]) > 0:
                image_url = data["data"][0].get("url")
                if image_url:
                    # Download the generated image to the temp dir.
                    img_resp = requests.get(image_url, timeout=60)
                    img_resp.raise_for_status()
                    output_path = config.TEMP_DIR / output_filename
                    with open(output_path, "wb") as f:
                        f.write(img_resp.content)
                    return str(output_path)
            raise RuntimeError(f"No image URL in Doubao response: {data}")
        except Exception as e:
            logger.error(f"Doubao Gen Failed: {e}")
            raise e

    def _generate_single_image_shubiao(
        self,
        prompt: str,
        reference_images: List[str],
        output_filename: str
    ) -> Optional[str]:
        """Generate one image via api2img.shubiaobiao.com
        (synchronous response carrying base64 image data)."""
        # Prepare reference images as inline base64 parts.
        parts = [{"text": prompt}]
        # Strictly de-duplicate reference images, preserving order.
        valid_refs = []
        if reference_images:
            for p in reference_images:
                if p and os.path.exists(p) and p not in valid_refs:
                    valid_refs.append(p)
        logger.info(f"[Shubiaobiao] Input reference images ({len(valid_refs)}): {valid_refs}")
        if valid_refs:
            for ref_path in valid_refs:
                try:
                    encoded = self._encode_image(ref_path)
                    if encoded:
                        parts.append({
                            "inlineData": {
                                "mimeType": "image/jpeg",
                                "data": encoded
                            }
                        })
                except Exception as e:
                    logger.error(f"Failed to encode image {ref_path}: {e}")
        logger.info(f"[Shubiaobiao] Final payload parts count: {len(parts)} (1 prompt + {len(parts)-1} images)")
        payload = {
            "contents": [{
                "role": "user",
                "parts": parts
            }],
            "generationConfig": {
                "responseModalities": ["IMAGE"],
                "imageConfig": {
                    "aspectRatio": "9:16",
                    "imageSize": "2K"
                }
            }
        }
        endpoint = f"{config.SHUBIAOBIAO_IMG_BASE_URL}/v1beta/models/{config.SHUBIAOBIAO_IMG_MODEL_NAME}:generateContent"
        headers = {
            "x-goog-api-key": config.SHUBIAOBIAO_IMG_KEY,
            "Content-Type": "application/json"
        }
        try:
            logger.info(f"Submitting to Shubiaobiao Img: {endpoint}")
            resp = requests.post(endpoint, json=payload, headers=headers, timeout=120)
            if resp.status_code != 200:
                msg = f"Shubiaobiao 提交失败 ({resp.status_code}): {resp.text}"
                logger.error(msg)
                raise RuntimeError(msg)
            data = resp.json()
            # Locate the base64 image in the response parts.
            img_b64 = None
            candidates = data.get("candidates") or []
            if candidates:
                content_parts = candidates[0].get("content", {}).get("parts", [])
                for part in content_parts:
                    inline = part.get("inlineData") if isinstance(part, dict) else None
                    if inline and inline.get("data"):
                        img_b64 = inline["data"]
                        break
            if not img_b64:
                msg = f"Shubiaobiao 响应缺少图片数据: {data}"
                logger.error(msg)
                raise RuntimeError(msg)
            output_path = config.TEMP_DIR / output_filename
            with open(output_path, "wb") as f:
                f.write(base64.b64decode(img_b64))
            logger.info(f"Shubiaobiao Generation Success: {output_path}")
            return str(output_path)
        except Exception as e:
            logger.error(f"Shubiaobiao Generation Exception: {e}")
            raise

    def _generate_single_image_gemini(
        self,
        prompt: str,
        reference_images: List[str],
        output_filename: str
    ) -> Optional[str]:
        """Generate one image via Gemini (Wuyin Keji / NanoBanana-Pro):
        submit a task, then poll until it succeeds, fails, or times out."""
        # 1. Build the payload.
        payload = {
            "prompt": prompt,
            "aspectRatio": "9:16",
            "imageSize": "2K"
        }
        # Reference images enable image-to-image generation.
        if reference_images:
            valid_paths = []
            seen = set()
            for p in reference_images:
                if p and os.path.exists(p) and p not in seen:
                    valid_paths.append(p)
                    seen.add(p)
            if valid_paths:
                img_urls = []
                for ref_path in valid_paths:
                    try:
                        url = storage.upload_file(ref_path)
                        if url:
                            img_urls.append(url)
                            logger.info(f"Uploaded ref image: {url}")
                    except Exception as e:
                        logger.warning(f"Error uploading ref image {ref_path}: {e}")
                if img_urls:
                    payload["img_url"] = img_urls
                    logger.info(f"Using {len(img_urls)} reference images for Gemini Img2Img")
        headers = {
            "Authorization": config.GEMINI_IMG_KEY,
            # NOTE(review): "charset:utf-8" looks like a typo for
            # "charset=utf-8" -- confirm the server tolerates it.
            "Content-Type": "application/json;charset:utf-8"
        }
        # 2. Submit the task.
        try:
            logger.info(f"Submitting to Gemini: {config.GEMINI_IMG_API_URL}")
            resp = requests.post(config.GEMINI_IMG_API_URL, json=payload, headers=headers, timeout=30)
            if resp.status_code != 200:
                msg = f"Gemini 提交失败 ({resp.status_code}): {resp.text}"
                logger.error(msg)
                raise RuntimeError(msg)
            data = resp.json()
            if data.get("code") != 200:
                msg = f"Gemini 返回错误: {data}"
                logger.error(msg)
                raise RuntimeError(msg)
            task_id = data.get("data", {}).get("id")
            if not task_id:
                raise RuntimeError(f"Gemini 响应缺少 task id: {data}")
            logger.info(f"Gemini Task Submitted, ID: {task_id}")
            # 3. Poll task status (up to 60 tries, 2s apart).
            max_retries = 60
            for i in range(max_retries):
                time.sleep(2)
                poll_url = f"{config.GEMINI_IMG_DETAIL_URL}?key={config.GEMINI_IMG_KEY}&id={task_id}"
                try:
                    poll_resp = requests.get(poll_url, headers=headers, timeout=30)
                except requests.Timeout:
                    continue
                except Exception as e:
                    # Transient polling error: retry on the next iteration.
                    continue
                if poll_resp.status_code != 200:
                    continue
                poll_data = poll_resp.json()
                if poll_data.get("code") != 200:
                    raise RuntimeError(f"Gemini 轮询返回错误: {poll_data}")
                result_data = poll_data.get("data", {}) or {}
                status = result_data.get("status")  # 0: queued, 1: generating, 2: success, 3: failed
                if status == 2:
                    image_url = result_data.get("image_url")
                    if not image_url:
                        raise RuntimeError("Gemini 成功但缺少 image_url")
                    logger.info(f"Gemini Generation Success: {image_url}")
                    img_resp = requests.get(image_url, timeout=60)
                    img_resp.raise_for_status()
                    output_path = config.TEMP_DIR / output_filename
                    with open(output_path, "wb") as f:
                        f.write(img_resp.content)
                    return str(output_path)
                if status == 3:
                    fail_reason = result_data.get("fail_reason", "Unknown")
                    raise RuntimeError(f"Gemini 生成失败: {fail_reason}")
            raise RuntimeError("Gemini 生成超时")
        except Exception as e:
            logger.error(f"Gemini Generation Exception: {e}")
            raise

60
modules/ingest.py Normal file
View File

@@ -0,0 +1,60 @@
"""
MatchMe Studio - Ingest Module (Video Processing)
"""
import cv2
import os
import logging
from pathlib import Path
from typing import List, Tuple
import config
from modules import storage
logger = logging.getLogger(__name__)
def process_uploaded_video(video_path: str) -> Tuple[List[str], str]:
    """
    Process an uploaded video:
    1. Upload the raw video to R2.
    2. Extract 3 keyframes (at 10%, 50%, 90% of the frame count).
    3. Upload each keyframe to R2.

    Args:
        video_path: local path of the uploaded video

    Returns:
        (frame_urls, video_url): R2 URLs of the extracted keyframes and
        the R2 URL of the raw video.  (Fixed: the old docstring claimed
        local frame paths were returned, but the function has always
        returned the uploaded frame URLs.)

    Raises:
        FileNotFoundError: if the video does not exist
        RuntimeError: if the raw video upload fails
        IOError: if OpenCV cannot open the video
    """
    if not Path(video_path).exists():
        raise FileNotFoundError(f"Video not found: {video_path}")
    logger.info(f"Processing video: {video_path}")

    # 1. Upload the raw video first so downstream steps can reference it.
    video_url = storage.upload_file(video_path)
    if not video_url:
        raise RuntimeError("Failed to upload video to R2")

    # 2. Extract and upload keyframes.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise IOError(f"Cannot open video: {video_path}")
    frame_urls = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_indices = [
            int(total_frames * 0.1),
            int(total_frames * 0.5),
            int(total_frames * 0.9)
        ]
        for i, idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                # Seek/decode failure: skip this frame rather than abort.
                continue
            frame_name = f"frame_{Path(video_path).stem}_{i}.jpg"
            frame_path = config.TEMP_DIR / frame_name
            cv2.imwrite(str(frame_path), frame)
            # Upload frame to R2 immediately
            frame_url = storage.upload_file(str(frame_path))
            if frame_url:
                frame_urls.append(frame_url)
    finally:
        # Release the capture even if extraction or upload raises
        # (the original leaked it on any exception).
        cap.release()
    logger.info(f"Extracted and uploaded {len(frame_urls)} frames")
    return frame_urls, video_url

151
modules/project.py Normal file
View File

@@ -0,0 +1,151 @@
"""
MatchMe Studio - Project State Management (R2 Persistence)
"""
import json
import logging
import uuid
from datetime import datetime
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, asdict, field
import config
from modules import storage
logger = logging.getLogger(__name__)
@dataclass
class Scene:
    # One storyboard scene of a project.
    # NOTE(review): field semantics below are inferred from names -- the
    # fields are not read elsewhere in this file; confirm against the
    # script-generation pipeline.
    id: int  # scene index
    duration: int = 5  # scene length in seconds
    timeline: str = ""  # position of the scene on the overall timeline
    keyframe: Dict[str, str] = field(default_factory=dict)  # keyframe description(s)
    camera_movement: str = ""
    story_beat: str = ""
    voiceover: str = ""
    rhythm: Dict[str, Any] = field(default_factory=dict)
    image_url: str = ""  # generated still for this scene
    video_url: str = ""  # generated clip for this scene
@dataclass
class Project:
    # Full pipeline state for one video-generation project.
    # Persisted to R2 as JSON via save_project()/load_project().
    id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])  # short unique id
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())
    status: str = "draft"  # draft | analyzing | scripting | imaging | video | rendering | done
    # Step 0: Input
    input_mode: str = ""  # text | images | video
    prompt: str = ""
    image_urls: List[str] = field(default_factory=list)
    video_url: str = ""
    asr_text: str = ""  # transcript of the uploaded video, if any
    # Step 1: Analysis
    analysis: str = ""
    questions: List[Dict[str, Any]] = field(default_factory=list)
    answers: Dict[str, str] = field(default_factory=dict)
    # Step 2: Script
    hook: str = ""
    scenes: List[Dict[str, Any]] = field(default_factory=list)  # scene dicts
    cta: str = ""
    # Step 6: Final
    final_video_url: str = ""
    bgm_url: str = ""
def save_project(project: Project) -> str:
    """
    Persist the project state to R2 as a JSON document.

    The JSON is first written to a temp file, then uploaded to
    ``projects/<id>.json`` in the configured bucket.

    Returns:
        The project id.

    Raises:
        Re-raises any upload error after logging it.
    """
    payload = json.dumps(asdict(project), ensure_ascii=False, indent=2)

    # Stage the document on disk for boto3's upload_file.
    temp_path = config.TEMP_DIR / f"project_{project.id}.json"
    with open(temp_path, "w", encoding="utf-8") as f:
        f.write(payload)

    object_name = f"projects/{project.id}.json"
    s3 = storage.get_s3_client()
    try:
        s3.upload_file(
            str(temp_path),
            config.R2_BUCKET_NAME,
            object_name,
            ExtraArgs={'ContentType': 'application/json'}
        )
    except Exception as e:
        logger.error(f"Failed to save project: {e}")
        raise
    logger.info(f"Project {project.id} saved to R2")
    return project.id
def load_project(project_id: str) -> Optional[Project]:
    """
    Load a project state JSON from R2.

    Returns:
        The reconstructed Project, or None when the download, parse,
        or reconstruction fails (failure is logged as a warning).
    """
    object_name = f"projects/{project_id}.json"
    temp_path = config.TEMP_DIR / f"project_{project_id}.json"
    s3 = storage.get_s3_client()
    try:
        s3.download_file(config.R2_BUCKET_NAME, object_name, str(temp_path))
        with open(temp_path, "r", encoding="utf-8") as f:
            raw = json.load(f)
        # Rebuild field by field so unknown keys in older/newer JSON
        # documents are ignored.
        project = Project(
            id=raw.get("id", project_id),
            created_at=raw.get("created_at", ""),
            status=raw.get("status", "draft"),
            input_mode=raw.get("input_mode", ""),
            prompt=raw.get("prompt", ""),
            image_urls=raw.get("image_urls", []),
            video_url=raw.get("video_url", ""),
            asr_text=raw.get("asr_text", ""),
            analysis=raw.get("analysis", ""),
            questions=raw.get("questions", []),
            answers=raw.get("answers", {}),
            hook=raw.get("hook", ""),
            scenes=raw.get("scenes", []),
            cta=raw.get("cta", ""),
            final_video_url=raw.get("final_video_url", ""),
            bgm_url=raw.get("bgm_url", "")
        )
    except Exception as e:
        logger.warning(f"Failed to load project {project_id}: {e}")
        return None
    logger.info(f"Project {project_id} loaded from R2")
    return project
def create_project() -> Project:
    """Create and return a fresh Project with a unique id."""
    new_project = Project()
    logger.info(f"Created new project: {new_project.id}")
    return new_project

390
modules/script_gen.py Normal file
View File

@@ -0,0 +1,390 @@
"""
脚本生成模块 (Gemini-3-Pro)
负责解析商品信息,生成分镜脚本
"""
import base64
import json
import logging
import os
import requests
from typing import Dict, Any, List, Optional
from pathlib import Path
import config
from modules.db_manager import db
logger = logging.getLogger(__name__)
class ScriptGenerator:
    """Storyboard script generator.

    Builds a multimodal prompt (product info + reference images) and asks
    an LLM (Gemini-3-Pro via Shubiaobiao, or Volcengine Doubao) for a
    structured JSON shooting script.
    """

    def __init__(self):
        self.api_key = config.SHUBIAOBIAO_KEY
        # NOTE: the API path may need adjusting for gemini-3-pro-preview.
        # Per the demo: https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent
        # We assume the base URL is v1beta/models/ for now.
        self.endpoint = "https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent"
        # Default System Prompt, used unless a custom one exists in the DB.
        self.default_system_prompt = """
你是一个专业的抖音电商短视频导演。请根据提供的商品信息和图片,设计一个高转化率的商品详情页首图视频脚本。
## 目标
- 提升商品详情页的 GPM 和下单转化率
- 视频时长 9-12 秒 (由 3-4 个分镜组成)
- **每个分镜时长固定为 3 秒** (duration: 3),不要超过 3 秒
- 必须包含:目标人群分析、卖点提炼、分镜设计
## 分镜设计原则
1. **单分镜单主体**:每个分镜聚焦一个视觉主体或动作,避免复杂运镜,因为 AI 生视频在长时间(>3秒容易出现画面异常。
2. **旁白跨分镜**:一段完整的旁白/卖点可以跨越多个分镜。在 voiceover_timeline 中,通过 start_time 和 duration (秒) 控制旁白的绝对时间位置,无需与分镜一一对应。
3. **节奏感**:分镜之间保持视觉连贯,通过景别变化(特写 -> 中景 -> 全景)制造节奏。
4. **语速控制**:旁白语速约 4 字/秒12字旁白约需 3 秒。
## 输出格式要求 (JSON)
必须严格遵守以下 JSON 结构:
{
  "product_name": "商品名称",
  "visual_anchor": "商品视觉锚点:材质+颜色+形状+包装特征(用于保持生图一致性)",
  "selling_points": ["卖点1", "卖点2"],
  "target_audience": "目标人群描述",
  "video_style": "视频风格关键词",
  "bgm_style": "BGM风格关键词",
  "voiceover_timeline": [
    {
      "id": 1,
      "text": "旁白文案片段1可横跨多个分镜",
      "subtitle": "字幕文案1 (简短有力)",
      "start_time": 0.0,
      "duration": 3.0
    },
    {
      "id": 2,
      "text": "旁白文案片段2",
      "subtitle": "字幕文案2",
      "start_time": 3.5,
      "duration": 2.5
    }
  ],
  "scenes": [
    {
      "id": 1,
      "duration": 3,
      "visual_prompt": "详细的画面描述用于AI生图包含主体、背景、构图、光影。英文描述。",
      "video_prompt": "详细的动效描述用于AI图生视频。英文描述。",
      "fancy_text": {
        "text": "花字文案 (最多6字)",
        "style": "highlight",
        "position": "center",
        "start_time": 0.5,
        "duration": 2.0
      }
    }
  ]
}
## 注意事项
1. **visual_prompt**:
   - 必须是英文。
   - 描述要具体,例如 "Close-up shot of a hair clip, soft lighting, minimalist background".
   - **CRITICAL**: 禁止 AI 额外生成装饰性文字、标语、水印。但必须保留商品包装自带的文字和 Logo这是商品真实外观的一部分
   - 正确写法: "Product front view, keep original packaging design --no added text --no watermarks"
   - **EMPHASIS**: Strictly follow the appearance of the product in the reference images.
2. **video_prompt**: 必须是英文,描述动作,例如 "Slow zoom in, the hair clip rotates slightly"。注意保持动作简单,避免复杂运镜和人体动作。
3. **voiceover_timeline**:
   - 这是整个视频的旁白和字幕时间轴,独立于分镜。
   - `start_time` 是旁白开始的绝对时间 (秒)`duration` 是旁白持续时长 (秒)。
   - **一段旁白可以横跨多个分镜**,例如:总时长 9 秒 (3 个分镜),一段旁白从 start_time=0duration=5则覆盖前两个分镜。
   - 两段旁白之间留 0.3-0.5 秒间隙(气口)。
4. **fancy_text**:
   - 花字要精简(最多 6 字),突出卖点。
   - **Style Selection**:
     - `highlight`: 默认样式,适合通用卖点 (Yellow/Black)。
     - `warning`: 强调痛点或食欲 (Red/White)。
     - `price`: 价格显示 (Big Red)。
     - `bubble`: 旁白补充或用户评价 (Bubble)。
     - `minimal`: 高级感,适合时尚类 (Thin/White)。
     - `tech`: 数码类 (Cyan/Glow)。
   - `position` 默认 `center`,可选 top/bottom/top-left/bottom-right 等。
5. **场景连贯性**: 确保分镜之间的逻辑和视觉风格连贯。每个分镜 duration 必须为 3。
"""

    def _encode_image(self, image_path: str) -> str:
        """Read an image file and return its raw bytes base64-encoded."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def generate_script(
        self,
        product_name: str,
        product_info: Dict[str, Any],
        image_paths: List[str] = None,
        model_provider: str = "shubiaobiao"  # "shubiaobiao" or "doubao"
    ) -> Dict[str, Any]:
        """
        Generate a storyboard script.

        Args:
            product_name: product display name
            product_info: product attributes; "style_hint" and
                "uploaded_images" keys get special handling
            image_paths: optional product reference images (multimodal input)
            model_provider: which LLM backend to use

        Returns:
            The parsed script dict (with a "_debug" key carrying prompts
            and raw output), or None on any failure.
        """
        logger.info(f"Generating script for: {product_name} (Provider: {model_provider})")
        # 1. Build the prompts (a DB-stored prompt overrides the default).
        system_prompt = db.get_config("prompt_script_gen", self.default_system_prompt)
        user_prompt = self._build_user_prompt(product_name, product_info)
        # Branch for Doubao
        if model_provider == "doubao":
            return self._generate_script_doubao(system_prompt, user_prompt, image_paths)
        # ... Existing Shubiaobiao Logic ...
        # Debug: report whether a custom prompt is in effect.
        if system_prompt != self.default_system_prompt:
            logger.info("Using CUSTOM system prompt from database")
        else:
            logger.info("Using DEFAULT system prompt")
        # 2. Build the request payload (Gemini/Shubiaobiao format).
        contents = []
        # User message parts
        user_parts = [{"text": user_prompt}]
        # Attach images (multimodal input).
        if image_paths:
            for path in image_paths[:10]:  # cap at 10; Gemini-3-Pro supports multiple images
                if Path(path).exists():
                    try:
                        b64_img = self._encode_image(path)
                        user_parts.append({
                            "inline_data": {
                                "mime_type": "image/jpeg",  # assumed JPG/PNG
                                "data": b64_img
                            }
                        })
                    except Exception as e:
                        logger.warning(f"Failed to encode image {path}: {e}")
        contents.append({
            "role": "user",
            "parts": user_parts
        })
        # System instruction: Gemini accepts either a system instruction or
        # the system text placed ahead of the user prompt.  Note that
        # `contents` already references this same list object, so inserting
        # here still puts the system prompt first in the payload.
        user_parts.insert(0, {"text": system_prompt})
        payload = {
            "contents": contents,
            "generationConfig": {
                "response_mime_type": "application/json",
                "temperature": 0.7
            }
        }
        headers = {
            "x-goog-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        # 3. Call the API.
        try:
            response = requests.post(self.endpoint, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            result = response.json()
            # 4. Parse the result.
            if "candidates" in result and result["candidates"]:
                content_text = result["candidates"][0]["content"]["parts"][0]["text"]
                # Extract the JSON part (handles Markdown code fences or plain text).
                script_json = self._extract_json_from_response(content_text)
                if script_json is None:
                    logger.error(f"Failed to extract JSON from response: {content_text[:500]}...")
                    return None
                final_script = self._validate_and_fix_script(script_json)
                # Attach debug info (includes the raw model output).
                final_script["_debug"] = {
                    "system_prompt": system_prompt,
                    "user_prompt": user_prompt,
                    "raw_output": content_text,
                    "provider": "shubiaobiao"
                }
                return final_script
            else:
                logger.error(f"No candidates in response: {result}")
                return None
        except Exception as e:
            logger.error(f"Script generation failed: {e}")
            if 'response' in locals():
                logger.error(f"Response content: {response.text}")
            return None

    def _generate_script_doubao(
        self,
        system_prompt: str,
        user_prompt: str,
        image_paths: List[str]
    ) -> Dict[str, Any]:
        """Doubao script generation (multimodal, OpenAI-compatible API)."""
        # User Provided: https://ark.cn-beijing.volces.com/api/v3/responses
        # But for 'responses' API, structure is specific. Let's try to match user's curl format exactly but adapting content.
        # User curl uses "input": [{"role": "user", "content": [{"type": "input_image"...}, {"type": "input_text"...}]}]
        endpoint = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"  # Recommend standard Chat API first as 'responses' is usually non-standard or older
        # However, user explicitly provided /responses curl. Let's try to stick to standard Chat Completions first because Doubao Pro 1.5 is OpenAI compatible.
        # If that fails or if user insists on the specific structure, we can adapt.
        # Volcengine 'ep-...' models are usually served via standard /chat/completions.
        # Let's try standard OpenAI format which Doubao supports perfectly.
        messages = [
            {"role": "system", "content": system_prompt}
        ]
        user_content = []
        # Add Images (Doubao Vision supports image_url)
        if image_paths:
            for path in image_paths[:5]:  # Limit
                if os.path.exists(path):
                    # Standard OpenAI format supports base64 data URLs:
                    # "image_url": {"url": "data:image/jpeg;base64,..."}
                    try:
                        b64_img = self._encode_image(path)
                        user_content.append({
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{b64_img}"
                            }
                        })
                    except Exception as e:
                        logger.warning(f"Failed to encode image for Doubao: {e}")
        # Add Text
        user_content.append({"type": "text", "text": user_prompt})
        messages.append({
            "role": "user",
            "content": user_content
        })
        payload = {
            "model": config.DOUBAO_SCRIPT_MODEL,
            "messages": messages,
            "stream": False,
            # "response_format": {"type": "json_object"}  # Try enabling JSON mode if supported
        }
        headers = {
            "Authorization": f"Bearer {config.VOLC_API_KEY}",
            "Content-Type": "application/json"
        }
        try:
            # Try standard chat/completions first
            resp = requests.post(endpoint, headers=headers, json=payload, timeout=120)
            if resp.status_code != 200:
                # If 404, maybe endpoint is wrong, try the user's 'responses' endpoint?
                # But 'responses' usually implies a different payload structure.
                logger.warning(f"Doubao Chat API failed ({resp.status_code}), trying legacy/custom endpoint...")
                # Fallback to user provided structure if needed (implement later if this fails)
                resp.raise_for_status()
            result = resp.json()
            content_text = result["choices"][0]["message"]["content"]
            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error(f"Failed to extract JSON from Doubao response: {content_text[:500]}...")
                return None
            final_script = self._validate_and_fix_script(script_json)
            final_script["_debug"] = {
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "raw_output": content_text,
                "provider": "doubao"
            }
            return final_script
        except Exception as e:
            logger.error(f"Doubao script generation failed: {e}")
            if 'resp' in locals():
                logger.error(f"Response: {resp.text}")
            return None

    def _extract_json_from_response(self, text: str) -> Optional[Dict]:
        """
        Extract a JSON object from an API response.

        Supports:
        1. A pure JSON response
        2. JSON wrapped in a Markdown code block (```json ... ```)
        3. JSON embedded in free text (from the first '{' to the last '}')
        """
        import re
        # Method 1: try parsing directly (pure JSON case).
        try:
            return json.loads(text.strip())
        except json.JSONDecodeError:
            pass
        # Method 2: extract a ```json ... ``` code block.
        json_block_match = re.search(r'```json\s*([\s\S]*?)\s*```', text)
        if json_block_match:
            try:
                return json.loads(json_block_match.group(1))
            except json.JSONDecodeError as e:
                logger.warning(f"JSON block found but parse failed: {e}")
        # Method 3: extract a plain ``` ... ``` code block (no json tag).
        code_block_match = re.search(r'```\s*([\s\S]*?)\s*```', text)
        if code_block_match:
            try:
                return json.loads(code_block_match.group(1))
            except json.JSONDecodeError:
                pass
        # Method 4: take everything between the first '{' and the last '}'.
        first_brace = text.find('{')
        last_brace = text.rfind('}')
        if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
            try:
                return json.loads(text[first_brace:last_brace + 1])
            except json.JSONDecodeError as e:
                logger.warning(f"Brace extraction failed: {e}")
        return None

    def _build_user_prompt(self, product_name: str, product_info: Dict[str, Any]) -> str:
        """Render product info (plus optional merchant style hint) into
        the user prompt text."""
        # Pull out the merchant's style preference.
        style_hint = product_info.get("style_hint", "")
        # Drop fields that should not appear in the prompt.
        filtered_info = {k: v for k, v in product_info.items() if k not in ["uploaded_images", "style_hint"]}
        info_str = "\n".join([f"- {k}: {v}" for k, v in filtered_info.items()])
        prompt = f"""
商品名称:{product_name}
商品信息:
{info_str}
"""
        if style_hint:
            prompt += f"""
## 商家特别要求
{style_hint}
"""
        prompt += "\n请根据以上信息设计视频脚本。"
        return prompt

    def _validate_and_fix_script(self, script: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and repair the script structure (minimal check: make
        sure the required "scenes" key exists)."""
        if "scenes" not in script:
            script["scenes"] = []
        return script

84
modules/storage.py Normal file
View File

@@ -0,0 +1,84 @@
"""
MatchMe Studio - Storage Module (R2)
"""
import os
import logging
import time
import uuid
import boto3
from botocore.exceptions import NoCredentialsError
from pathlib import Path
from typing import Optional
import config
logger = logging.getLogger(__name__)
def get_s3_client():
    """Create a boto3 S3 client bound to the Cloudflare R2 endpoint.

    Logs and re-raises any construction failure (bad credentials/config).
    """
    try:
        r2_settings = {
            "endpoint_url": config.R2_ENDPOINT,
            "aws_access_key_id": config.R2_ACCESS_KEY,
            "aws_secret_access_key": config.R2_SECRET_KEY,
            "region_name": "auto",
        }
        return boto3.client("s3", **r2_settings)
    except Exception as e:
        logger.error(f"Failed to create R2 client: {e}")
        raise
def upload_file(file_path: str) -> Optional[str]:
    """Upload a local file to R2 and return its public URL.

    The object is stored under a random UUID-based name because the
    original filename may contain Chinese/special characters that break
    URLs. Returns None when the file is missing or the upload fails.
    """
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return None
    original_name = Path(file_path).name
    ext = Path(file_path).suffix.lower() or ".bin"
    object_name = f"{uuid.uuid4().hex}{ext}"
    s3 = get_s3_client()
    try:
        logger.info(f"Uploading {original_name} to R2 as {object_name}...")
        # Derive Content-Type from the extension via the stdlib table
        # (covers png/jpeg/mp4/mp3 and many more); fall back to the
        # generic binary type for unknown extensions.
        import mimetypes
        content_type, _ = mimetypes.guess_type(object_name)
        if content_type is None:
            content_type = "application/octet-stream"
        s3.upload_file(
            file_path,
            config.R2_BUCKET_NAME,
            object_name,
            ExtraArgs={'ContentType': content_type}
        )
        public_url = f"{config.R2_PUBLIC_URL}/{object_name}"
        logger.info(f"Upload successful: {public_url}")
        return public_url
    except Exception as e:
        logger.error(f"R2 Upload Failed: {e}")
        return None
def cleanup_temp(max_age_seconds: int = 3600):
    """Best-effort removal of files in TEMP_DIR older than *max_age_seconds*."""
    logger.info("Running cleanup_temp...")
    if not config.TEMP_DIR.exists():
        return
    # Anything last modified before this instant is considered stale.
    cutoff = time.time() - max_age_seconds
    for entry in config.TEMP_DIR.iterdir():
        try:
            if entry.is_file() and entry.stat().st_mtime < cutoff:
                entry.unlink()
        except Exception as e:
            logger.warning(f"Failed to delete {entry}: {e}")

76
modules/styles.py Normal file
View File

@@ -0,0 +1,76 @@
"""
花字样式预设库
供 Design Agent 和 Renderer 使用
"""
STYLES = {
    # 1. Eye-catching emphasis (bright yellow highlight).
    "highlight": {
        "font_size": 60,
        "font_color": "#FFE66D",  # bright yellow
        "stroke": {"color": "#000000", "width": 4},
        "shadow": {"color": "#000000", "blur": 8, "offset": [4, 4], "opacity": 0.6}
    },
    # 2. Warning / pain point (white text on a red box).
    "warning": {
        "font_size": 55,
        "font_color": "#FFFFFF",
        "stroke": {"color": "#FF0000", "width": 0},  # no stroke
        "background": {
            "type": "box",
            "color": "#FF4D4F",  # red box
            "corner_radius": 12,
            "padding": [15, 25, 15, 25]  # t, r, b, l
        },
        "shadow": {"color": "#990000", "blur": 0, "offset": [0, 6], "opacity": 0.4}  # hard drop shadow for depth
    },
    # 3. Price / promotion (oversized red digits with glow).
    "price": {
        "font_size": 90,
        "font_color": "#FF2E2E",  # vivid red
        "stroke": {"color": "#FFFFFF", "width": 6},  # white outline
        "shadow": {"color": "#FF9999", "blur": 15, "offset": [0, 0], "opacity": 0.8}  # glow effect
    },
    # 4. Dialogue bubble (dark text on a rounded white box).
    "bubble": {
        "font_size": 48,
        "font_color": "#333333",
        "background": {
            "type": "box",
            "color": "#FFFFFF",
            "corner_radius": 40,  # large radius
            "padding": [20, 40, 20, 40]
        },
        "shadow": {"color": "#000000", "blur": 10, "offset": [2, 5], "opacity": 0.2}
    },
    # 5. Fashion / minimal (thin white text, light outline).
    "minimal": {
        "font_size": 65,
        "font_color": "#FFFFFF",
        "stroke": {"color": "#000000", "width": 2},
        "shadow": {"color": "#000000", "blur": 2, "offset": [2, 2], "opacity": 0.8},
        "font_family": "NotoSansSC-Regular.otf"  # may be absent; renderer falls back
    },
    # 6. Tech / futuristic (cyan with strong glow).
    "tech": {
        "font_size": 60,
        "font_color": "#00FFFF",
        "stroke": {"color": "#003333", "width": 3},
        "shadow": {"color": "#00FFFF", "blur": 20, "offset": [0, 0], "opacity": 0.9}
    }
}


def get_style(style_name: str) -> dict:
    """Look up a preset by name; unknown names fall back to "highlight"."""
    try:
        return STYLES[style_name]
    except KeyError:
        return STYLES["highlight"]

251
modules/text_renderer.py Normal file
View File

@@ -0,0 +1,251 @@
"""
通用文本渲染引擎
支持原子化设计参数,供上游 Design Agent 灵活调用
"""
import os
import hashlib
import logging
from pathlib import Path
from typing import Dict, Any, List, Tuple, Union, Optional
from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageColor
import config
from modules.styles import get_style
logger = logging.getLogger(__name__)
# On-disk cache for rendered text PNGs, keyed by md5(text + style).
CACHE_DIR = config.TEMP_DIR / "text_renderer_cache"
CACHE_DIR.mkdir(exist_ok=True)
class TextRenderer:
    """
    Generic text renderer.

    Renders a text string to a transparent PNG from atomic style
    parameters (font, stroke, shadow, background), for compositing
    as "fancy text" overlays.
    """
    def __init__(self):
        # Resolve a default font up-front so renders without an explicit
        # font_family reuse this path instead of re-probing the filesystem.
        self.default_font_path = self._resolve_font_path(None)
    def _resolve_font_path(self, font_family: Optional[str]) -> Optional[str]:
        """Resolve a usable font file path with multi-level fallback.

        Order: the given value as a path → assets/fonts lookups (with and
        without .ttf/.otf suffixes appended) → bundled project fonts →
        common macOS/Windows system fonts. Returns None when nothing
        usable is found (caller then falls back to Pillow's default font).
        """
        candidates = []
        if font_family:
            # 1. Try the value as a path on its own.
            candidates.append(font_family)
            # 2. Look it up under assets/fonts.
            candidates.append(str(config.FONTS_DIR / font_family))
            if not font_family.endswith(".ttf") and not font_family.endswith(".otf"):
                candidates.append(str(config.FONTS_DIR / f"{font_family}.ttf"))
                candidates.append(str(config.FONTS_DIR / f"{font_family}.otf"))
        # 3. Bundled project fonts.
        candidates.extend([
            str(config.FONTS_DIR / "SmileySans-Oblique.ttf"),
            str(config.FONTS_DIR / "AlibabaPuHuiTi-Bold.ttf"),
            str(config.FONTS_DIR / "AlibabaPuHuiTi-Regular.ttf"),
            str(config.FONTS_DIR / "NotoSansSC-Bold.otf"),  # if present
        ])
        # 4. System font fallbacks.
        candidates.extend([
            "/System/Library/Fonts/PingFang.ttc",
            "/System/Library/Fonts/STHeiti Medium.ttc",
            "C:/Windows/Fonts/msyh.ttc",
            "C:/Windows/Fonts/simhei.ttf",
        ])
        for path in candidates:
            if path and os.path.exists(path):
                # Sanity check: a real font file should be well over 10KB.
                try:
                    if os.path.getsize(path) > 10000:
                        return path
                except:
                    continue
        logger.warning("No valid font found, using default load_default()")
        return None
    def _get_font(self, font_path: Optional[str], size: int) -> ImageFont.FreeTypeFont:
        # Load the TrueType font at the requested size; fall back to
        # Pillow's built-in bitmap font (which ignores `size`) on failure.
        try:
            if font_path:
                return ImageFont.truetype(font_path, size)
        except Exception as e:
            logger.warning(f"Failed to load font {font_path}: {e}")
        return ImageFont.load_default()
    def _parse_color(self, color: Union[str, Tuple]) -> Tuple[int, int, int, int]:
        """Normalize a color spec to an RGBA 4-tuple.

        Accepts "#RRGGBB" strings and 3/4-tuples. Anything else — including
        named-color or 'rgba(...)' strings — falls through to opaque black.
        """
        if isinstance(color, str):
            if color.startswith("#"):
                rgb = ImageColor.getrgb(color)
                return rgb + (255,)
            # TODO: support the 'rgba(r,g,b,a)' string format
        if isinstance(color, tuple):
            if len(color) == 3:
                return color + (255,)
            return color
        return (0, 0, 0, 255)
    def render(self, text: str, style: Union[Dict[str, Any], str], cache: bool = True) -> str:
        """
        Render *text* with *style* and return the output PNG path.

        *style* may be a preset name (resolved via get_style) or a dict:
        {
            "font_family": str,
            "font_size": int,
            "font_color": str,
            "stroke": [{"color": str, "width": int}, ...],
            "shadow": {"color": str, "blur": int, "offset": [x, y], "opacity": float},
            "background": {
                "type": "box", "color": str/list, "corner_radius": int, "padding": [t, r, b, l]
            }
        }
        Results are cached on disk keyed by md5(text + style) when *cache* is True.
        """
        # 0. Resolve style presets given by name.
        if isinstance(style, str):
            style = get_style(style)
        # 1. Cache lookup.
        cache_key = hashlib.md5(f"{text}_{str(style)}".encode()).hexdigest()
        if cache:
            cache_path = CACHE_DIR / f"{cache_key}.png"
            if cache_path.exists():
                return str(cache_path)
        # 2. Basic font parameters.
        font_path = self._resolve_font_path(style.get("font_family"))
        font_size = style.get("font_size", 60)
        font = self._get_font(font_path, font_size)
        font_color = self._parse_color(style.get("font_color", "#FFFFFF"))
        # 3. Measure the text.
        dummy_draw = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
        bbox = dummy_draw.textbbox((0, 0), text, font=font)
        text_w = bbox[2] - bbox[0]
        text_h = bbox[3] - bbox[1]
        # 4. Compute total canvas size (padding + stroke + shadow margins).
        strokes = style.get("stroke", [])
        if isinstance(strokes, dict): strokes = [strokes]  # legacy single-stroke format
        max_stroke = 0
        for s in strokes:
            max_stroke = max(max_stroke, s.get("width", 0))
        shadow = style.get("shadow", {})
        shadow_blur = shadow.get("blur", 0)
        shadow_offset = shadow.get("offset", [0, 0])
        bg = style.get("background", {})
        padding = bg.get("padding", [0, 0, 0, 0])
        if isinstance(padding, int): padding = [padding] * 4
        if len(padding) == 2: padding = [padding[0], padding[1], padding[0], padding[1]]  # v, h -> t, r, b, l
        # Content box = text + padding.
        content_w = text_w + padding[1] + padding[3]
        content_h = text_h + padding[0] + padding[2]
        # Extra margin so stroke and (offset, blurred) shadow are never clipped.
        extra_margin = max_stroke + shadow_blur + max(abs(shadow_offset[0]), abs(shadow_offset[1])) + 10
        canvas_w = content_w + extra_margin * 2
        canvas_h = content_h + extra_margin * 2
        # 5. Create the transparent canvas.
        img = Image.new("RGBA", (int(canvas_w), int(canvas_h)), (0, 0, 0, 0))
        draw = ImageDraw.Draw(img)
        # Anchor: center of the canvas.
        center_x = canvas_w // 2
        center_y = canvas_h // 2
        # 6. Draw order: shadow -> background -> stroke -> text.
        # --- Shadow (for the whole block) ---
        if shadow:
            shadow_color = self._parse_color(shadow.get("color", "#000000"))
            opacity = shadow.get("opacity", 0.5)
            shadow_color = (shadow_color[0], shadow_color[1], shadow_color[2], int(255 * opacity))
            # Scratch layer: draw the silhouette that will become the shadow.
            shadow_layer = Image.new("RGBA", (int(canvas_w), int(canvas_h)), (0, 0, 0, 0))
            shadow_draw = ImageDraw.Draw(shadow_layer)
            # With a background the shadow follows its shape; otherwise the text's.
            if bg and bg.get("type") != "none":
                self._draw_background(shadow_draw, bg, center_x, center_y, content_w, content_h, shadow_color)
            else:
                # Text-shaped shadow.
                txt_x = center_x - text_w / 2
                txt_y = center_y - text_h / 2
                shadow_draw.text((txt_x, txt_y), text, font=font, fill=shadow_color)
                # Stroke shadow: a faithful stroked shadow would need many
                # offset draws; skipped for cost — only the text is shadowed.
                for s in strokes:
                    width = s.get("width", 0)
            # Blur the shadow layer.
            if shadow_blur > 0:
                shadow_layer = shadow_layer.filter(ImageFilter.GaussianBlur(shadow_blur))
            # Apply the offset, then composite beneath the main image.
            final_shadow = Image.new("RGBA", (int(canvas_w), int(canvas_h)), (0, 0, 0, 0))
            final_shadow.paste(shadow_layer, (int(shadow_offset[0]), int(shadow_offset[1])), mask=shadow_layer)
            img = Image.alpha_composite(final_shadow, img)
            draw = ImageDraw.Draw(img)  # re-bind draw to the composited image
        # --- Background ---
        if bg and bg.get("type") in ["box", "circle"]:
            bg_color = self._parse_color(bg.get("color", "#000000"))
            # TODO: support gradient backgrounds
            self._draw_background(draw, bg, center_x, center_y, content_w, content_h, bg_color)
        # --- Stroke (text only), drawn from the outside in ---
        txt_x = center_x - text_w / 2
        txt_y = center_y - text_h / 2
        for s in reversed(strokes):
            color = self._parse_color(s.get("color", "#000000"))
            width = s.get("width", 0)
            if width > 0:
                # Uses Pillow's native stroke_width rather than offset-draw simulation.
                draw.text((txt_x, txt_y), text, font=font, fill=color, stroke_width=width, stroke_fill=color)
        # --- Text fill ---
        draw.text((txt_x, txt_y), text, font=font, fill=font_color)
        # 7. Trim surplus transparent margins.
        bbox = img.getbbox()
        if bbox:
            img = img.crop(bbox)
        # 8. Save into the cache directory (even when cache lookup was disabled).
        output_path = str(CACHE_DIR / f"{cache_key}.png")
        img.save(output_path)
        logger.info(f"Rendered text: {text} -> {output_path}")
        return output_path
    def _draw_background(self, draw, bg, cx, cy, w, h, color):
        """Draw the background shape (rounded box or ellipse) centered at (cx, cy)."""
        corner_radius = bg.get("corner_radius", 0)
        x0 = cx - w / 2
        y0 = cy - h / 2
        x1 = cx + w / 2
        y1 = cy + h / 2
        if bg.get("type") == "box":
            draw.rounded_rectangle([x0, y0, x1, y1], radius=corner_radius, fill=color)
        elif bg.get("type") == "circle":
            draw.ellipse([x0, y0, x1, y1], fill=color)
# Module-level singleton: importers share one renderer (and its resolved default font).
renderer = TextRenderer()

177
modules/utils.py Normal file
View File

@@ -0,0 +1,177 @@
"""
Gloda Video Factory - Utility Functions
Handles font management, Auto-QC, and helper effects.
"""
import os
import logging
from pathlib import Path
from typing import Optional, Tuple
import urllib.request
import math
import numpy as np
from PIL import Image
from moviepy.editor import ImageClip, VideoFileClip, AudioFileClip
import config
logger = logging.getLogger(__name__)
# Font download sources (GitHub raw URLs for the Roboto / Noto Sans SC projects).
ROBOTO_BOLD_URL = "https://github.com/googlefonts/roboto/raw/main/src/hinted/Roboto-Bold.ttf"
NOTO_SC_BOLD_URL = "https://raw.githubusercontent.com/google/fonts/main/ofl/notosanssc/NotoSansSC-Bold.ttf"
# Local cache locations under the project fonts directory.
FONT_PATH_EN = config.FONTS_DIR / "Roboto-Bold.ttf"
FONT_PATH_CN = config.FONTS_DIR / "NotoSansSC-Bold.ttf"
def ensure_fonts() -> Path:
    """Make sure the EN and CN fonts exist locally, downloading if missing.

    Download failures are logged but not raised. Returns the CN font path
    when it exists (it covers mixed CN/EN text), otherwise the EN path.
    """
    config.FONTS_DIR.mkdir(parents=True, exist_ok=True)
    wanted = (
        (FONT_PATH_EN, ROBOTO_BOLD_URL, "Roboto-Bold", "EN"),
        (FONT_PATH_CN, NOTO_SC_BOLD_URL, "NotoSansSC-Bold", "CN"),
    )
    for path, url, name, tag in wanted:
        if path.exists():
            continue
        logger.info(f"Downloading {name} font...")
        try:
            urllib.request.urlretrieve(url, path)
        except Exception as e:
            logger.error(f"Failed to download {tag} font: {e}")
    # Prefer the CN font as the default for mixed-language text.
    if FONT_PATH_CN.exists():
        return FONT_PATH_CN
    return FONT_PATH_EN
def check_imagemagick() -> bool:
    """Return True if an ImageMagick binary is available on PATH.

    ImageMagick 6 installs `convert`; ImageMagick 7 replaces it with the
    unified `magick` binary, so accept either. Logs a warning when neither
    is found, since text overlays depend on it.
    """
    import shutil
    if shutil.which("convert") or shutil.which("magick"):
        return True
    logger.warning("ImageMagick not found. Text overlays may fail.")
    return False
def verify_assets(video_path: str, audio_path: str) -> Tuple[bool, str]:
    """
    Auto-QC: Verify generated assets quality.
    Checks:
    1. File size sanity check
    2. Duration matching (+/- 2s tolerance)
    3. Audio silence check
    Returns:
        (Passed: bool, Reason: str)
    """
    logger.info(f"Running Auto-QC on:\nVideo: {video_path}\nAudio: {audio_path}")
    try:
        # 1. File size check — tiny outputs almost always mean a failed
        # render (black screen / empty stream).
        vid_size = os.path.getsize(video_path)
        if vid_size < 50 * 1024:  # < 50KB
            return False, f"Video file too small ({vid_size/1024:.1f}KB). Likely error/black screen."
        aud_size = os.path.getsize(audio_path)
        if aud_size < 5 * 1024:  # < 5KB
            return False, f"Audio file too small ({aud_size/1024:.1f}KB)."
        # 2. Duration + silence checks (requires decoding both media files).
        try:
            v_clip = VideoFileClip(video_path)
            a_clip = AudioFileClip(audio_path)
            v_dur = v_clip.duration
            a_dur = a_clip.duration
            # Silence check via RMS over the decoded samples.
            # NOTE(review): this appears to decode the whole track, not just
            # the first 2 seconds — confirm against moviepy's to_soundarray.
            chunk = a_clip.to_soundarray(fps=44100, nbytes=2, buffersize=1000)
            if chunk is not None:
                rms = np.sqrt(np.mean(chunk**2))
                if rms < 0.001:
                    v_clip.close()
                    a_clip.close()
                    return False, "Audio appears to be silent (RMS < 0.001)"
            v_clip.close()
            a_clip.close()
            # Durations must agree within a 2-second tolerance.
            if abs(v_dur - a_dur) > 2.0:
                return False, f"Duration mismatch: Video={v_dur:.1f}s, Audio={a_dur:.1f}s"
        except Exception as e:
            return False, f"Media analysis failed: {str(e)}"
        return True, "QC Passed"
    except Exception as e:
        logger.error(f"Auto-QC Error: {e}")
        return False, f"QC System Error: {e}"
def apply_ken_burns(
    image_path: str,
    duration: float = 5.0,
    zoom_ratio: float = 1.2,
    output_path: Optional[str] = None
) -> str:
    """Render a slow ease-in-out zoom (Ken Burns effect) from a still image.

    Args:
        image_path: Source still image.
        duration: Clip length in seconds.
        zoom_ratio: Final zoom factor relative to the first frame.
        output_path: Target mp4 path; defaults to OUTPUT_DIR/<stem>_ken_burns.mp4.

    Returns:
        Path of the written video file (no audio track).
    """
    # Fix: ImageClip expects a static image (array or file path); a
    # time-dependent frame function must be wrapped in VideoClip instead.
    from moviepy.editor import VideoClip

    if output_path is None:
        base_name = Path(image_path).stem
        output_path = str(config.OUTPUT_DIR / f"{base_name}_ken_burns.mp4")
    logger.info(f"Applying Ken Burns effect to {image_path}")
    img = Image.open(image_path)
    img_width, img_height = img.size
    target_width = config.VIDEO_SETTINGS["width"]
    target_height = config.VIDEO_SETTINGS["height"]
    fps = config.VIDEO_SETTINGS["fps"]
    # Pre-scale the source so that even at maximum zoom the crop window
    # never exceeds the source bounds.
    scale_w = (target_width * zoom_ratio) / img_width
    scale_h = (target_height * zoom_ratio) / img_height
    base_scale = max(scale_w, scale_h)
    new_width = int(img_width * base_scale)
    new_height = int(img_height * base_scale)
    img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
    img_array = np.array(img_resized)

    def make_frame(t):
        # Cosine easing: slow at the ends, fastest mid-way through.
        progress = t / duration
        eased_progress = 0.5 - 0.5 * np.cos(np.pi * progress)
        current_zoom = 1 + (zoom_ratio - 1) * eased_progress
        crop_width = int(target_width / current_zoom * (new_width / target_width))
        crop_height = int(target_height / current_zoom * (new_height / target_height))
        crop_width = min(crop_width, new_width)
        crop_height = min(crop_height, new_height)
        # Center crop, then resample to the output resolution.
        x_start = (new_width - crop_width) // 2
        y_start = (new_height - crop_height) // 2
        cropped = img_array[y_start:y_start + crop_height, x_start:x_start + crop_width]
        cropped_pil = Image.fromarray(cropped)
        resized = cropped_pil.resize((target_width, target_height), Image.Resampling.LANCZOS)
        return np.array(resized)

    clip = VideoClip(make_frame, duration=duration)
    clip = clip.set_fps(fps)
    clip.write_videofile(output_path, fps=fps, codec=config.VIDEO_SETTINGS["codec"], audio=False, logger=None)
    clip.close()
    return output_path

269
modules/video_gen.py Normal file
View File

@@ -0,0 +1,269 @@
"""
图生视频模块 (Volcengine Doubao-SeedDance)
负责将分镜图片转换为视频片段
"""
import logging
import time
import requests
import os
from typing import Dict, Any, List, Optional
from pathlib import Path
import config
from modules import storage
from modules.db_manager import db
logger = logging.getLogger(__name__)
class VideoGenerator:
"""图生视频生成器"""
def __init__(self):
self.api_key = config.VOLC_API_KEY
self.base_url = config.VOLC_BASE_URL
self.model_id = config.VIDEO_MODEL_ID
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}"
}
def submit_scene_video_task(
self,
project_id: str,
scene_id: int,
image_path: str,
prompt: str
) -> str:
"""
提交单场景视频生成任务
Returns: task_id or None
"""
if not image_path or not os.path.exists(image_path):
logger.warning(f"Skipping video generation for Scene {scene_id}: Image not found")
return None
# 上传图片到 R2 获取 URL
logger.info(f"Uploading image for Scene {scene_id}...")
image_url = storage.upload_file(image_path)
if not image_url:
logger.error(f"Failed to upload image for Scene {scene_id}")
return None
logger.info(f"Submitting video task for Scene {scene_id}...")
task_id = self._submit_task(image_url, prompt)
if task_id:
# 立即保存 task_id 到数据库,状态为 processing
db.save_asset(
project_id=project_id,
scene_id=scene_id,
asset_type="video",
status="processing",
task_id=task_id,
local_path=None
)
return task_id
def recover_video_from_task(self, task_id: str, output_path: str) -> bool:
"""
尝试从已有的 task_id 恢复视频 (查询状态并下载)
"""
try:
status, video_url = self._check_task(task_id)
logger.info(f"Recovering task {task_id}: status={status}")
if status == "succeeded" and video_url:
downloaded_path = self._download_video(video_url, os.path.basename(output_path))
if downloaded_path:
# 如果下载的文件名和目标路径不一致 (download_video 使用 filename 参数拼接到 TEMP_DIR)
# 需要移动或确认。 _download_video 返回完整路径。
# 如果 output_path 是绝对路径且不同,则移动。
if os.path.abspath(downloaded_path) != os.path.abspath(output_path):
import shutil
shutil.move(downloaded_path, output_path)
return True
return False
except Exception as e:
logger.error(f"Failed to recover video task {task_id}: {e}")
return False
    def check_task_status(self, task_id: str) -> tuple[str, str]:
        """
        Query the status of a generation task (public wrapper around _check_task).
        Returns: (status, video_url)
        """
        return self._check_task(task_id)
    def generate_scene_videos(
        self,
        project_id: str,
        script: Dict[str, Any],
        scene_images: Dict[int, str]
    ) -> Dict[int, str]:
        """
        Batch-generate per-scene videos (legacy blocking/polling flow).

        Submits one task per scene, then polls them all (5s interval,
        10-minute overall budget), downloading finished clips and recording
        success/failure in the DB as results arrive. Scenes still pending
        at timeout are simply dropped from the result.

        Returns:
            Mapping of scene_id -> local video path for completed scenes.
        """
        generated_videos = {}
        tasks = {}  # scene_id -> task_id
        scenes = script.get("scenes", [])
        # 1. Submit one task per scene.
        for scene in scenes:
            scene_id = scene["id"]
            image_path = scene_images.get(scene_id)
            prompt = scene.get("video_prompt", "High quality video")
            # Use new method signature with project_id
            task_id = self.submit_scene_video_task(project_id, scene_id, image_path, prompt)
            if task_id:
                tasks[scene_id] = task_id
                logger.info(f"Task submitted: {task_id}")
            else:
                logger.error(f"Failed to submit task for Scene {scene_id}")
        # 2. Poll task status until all finish or the budget runs out.
        pending_tasks = list(tasks.keys())
        # Overall polling budget (10 minutes).
        start_time = time.time()
        timeout = 600
        while pending_tasks and (time.time() - start_time < timeout):
            logger.info(f"Polling status for {len(pending_tasks)} tasks...")
            still_pending = []
            for scene_id in pending_tasks:
                task_id = tasks[scene_id]
                status, result_url = self._check_task(task_id)
                if status == "succeeded":
                    logger.info(f"Scene {scene_id} video generated successfully")
                    # Download the finished clip.
                    video_path = self._download_video(result_url, f"scene_{scene_id}_video.mp4")
                    if video_path:
                        generated_videos[scene_id] = video_path
                        # Update DB
                        db.save_asset(
                            project_id=project_id,
                            scene_id=scene_id,
                            asset_type="video",
                            status="completed",
                            local_path=video_path,
                            task_id=task_id
                        )
                elif status == "failed" or status == "cancelled":
                    logger.error(f"Scene {scene_id} task failed/cancelled")
                    db.save_asset(
                        project_id=project_id,
                        scene_id=scene_id,
                        asset_type="video",
                        status="failed",
                        task_id=task_id
                    )
                else:
                    # Still queued/running: keep polling this scene.
                    still_pending.append(scene_id)
            pending_tasks = still_pending
            if pending_tasks:
                time.sleep(5)  # poll interval
        return generated_videos
def _submit_task(self, image_url: str, prompt: str) -> str:
"""提交生成任务"""
url = f"{self.base_url}/contents/generations/tasks"
payload = {
"model": self.model_id,
"content": [
{
"type": "text",
"text": f"{prompt} --resolution 1080p --duration 3 --camerafixed false --watermark false"
},
{
"type": "image_url",
"image_url": {"url": image_url}
}
]
}
try:
response = requests.post(url, headers=self.headers, json=payload, timeout=30)
response.raise_for_status()
data = response.json()
# ID might be at top level or in data object depending on exact API version response
# Document says: { "id": "...", "status": "..." } or similar
task_id = data.get("id")
if not task_id and "data" in data:
task_id = data.get("data", {}).get("id")
return task_id
except Exception as e:
logger.error(f"Task submission failed: {e}")
if 'response' in locals():
logger.error(f"Response: {response.text}")
return None
def _check_task(self, task_id: str) -> tuple[str, str]:
"""
检查任务状态
Returns: (status, content_url)
Status: queued, running, succeeded, failed, cancelled
"""
url = f"{self.base_url}/contents/generations/tasks/{task_id}"
try:
response = requests.get(url, headers=self.headers, timeout=30)
response.raise_for_status()
data = response.json()
# API Response structure:
# { "id": "...", "status": "succeeded", "content": [ { "url": "...", "video_url": "..." } ] }
# Or nested in "data" key
result = data
if "data" in data and "status" not in data: # Check if wrapped in data
result = data["data"]
status = result.get("status")
content_url = None
if status == "succeeded":
if "content" in result:
content = result["content"]
if isinstance(content, list) and len(content) > 0:
item = content[0]
content_url = item.get("video_url") or item.get("url")
elif isinstance(content, dict):
content_url = content.get("video_url") or content.get("url")
return status, content_url
except Exception as e:
logger.error(f"Check task failed: {e}")
return "unknown", None
def _download_video(self, url: str, filename: str) -> str:
"""下载视频到临时目录"""
if not url:
return None
try:
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
output_path = config.TEMP_DIR / filename
with open(output_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
return str(output_path)
except Exception as e:
logger.error(f"Download video failed: {e}")
return None