feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions
--- a/modules/brain.py
+++ b/modules/brain.py
@@ -0,0 +1,346 @@
+"""
+MatchMe Studio - Brain Module (Multi-stage Analysis & Script Generation)
+"""
+import json
+import logging
+from typing import Dict, Any, List, Optional
+from openai import OpenAI
+
+import config
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Use Volcengine (Doubao) via OpenAI Compatible Interface
+# One client is created at import time and shared by every stage function in
+# this module; the API key and base URL are read from config.
+client = OpenAI(
+    api_key=config.VOLC_API_KEY,
+    base_url=config.VOLC_BASE_URL
+)
+
+# ============================================================
+# Stage 1: Analyze Materials
+# ============================================================
+
+# System prompt for Stage 1 (sent by analyze_materials). Written in Chinese.
+# It casts the model as a short-video creative director, lists four analysis
+# dimensions, asks for 2-5 follow-up questions when key information is missing,
+# and requires a strict JSON reply with the keys "analysis", "detected_info",
+# "missing_info", "questions" and "ready" (ready=true with an empty questions
+# array when the information is sufficient).
+ANALYZE_SYSTEM_PROMPT = """你是一位资深短视频创作总监，专精TikTok/抖音爆款内容。
+
+任务：深度分析用户提供的素材和需求，识别产品特性、使用场景、目标人群。
+
+分析维度：
+1. 产品/服务核心卖点（从素材中提取视觉特征）
+2. 视觉风格特征（颜色、质感、包装）
+3. 潜在目标受众
+4. 内容调性建议
+
+然后检查是否缺少关键信息，如果缺少，生成2-5个问题帮助完善需求。
+每个问题必须与短视频创作直接相关。
+
+输出严格JSON格式：
+{
+  "analysis": "详细分析结果，包括从素材中识别到的视觉元素...",
+  "detected_info": {
+    "product": "识别到的产品名称和类型",
+    "visual_features": ["视觉特征1", "视觉特征2"],
+    "audience": "推测的目标人群",
+    "style": "推测的风格"
+  },
+  "missing_info": ["缺少的信息1", "缺少的信息2"],
+  "questions": [
+    {
+      "id": "q1",
+      "text": "问题文字（说明为什么这个问题重要）",
+      "options": ["选项A", "选项B", "选项C"],
+      "allow_multiple": true,
+      "allow_custom": true
+    }
+  ],
+  "ready": false
+}
+
+如果信息足够，ready=true，questions为空数组。
+"""
+
+def analyze_materials(
+    prompt: str,
+    image_urls: List[str] = None,
+    asr_text: str = ""
+) -> Dict[str, Any]:
+    """
+    Deep analysis of user materials.
+    Returns analysis text and questions if info is missing.
+    """
+    logger.info("Brain: Analyzing materials...")
+    
+    # Using Vision Model format (Doubao Vision)
+    # Input format: messages with content list (text + image_url)
+    
+    content_parts = [{"type": "text", "text": f"用户需求: {prompt}"}]
+    
+    if asr_text:
+        content_parts.append({"type": "text", "text": f"\n视频原声(ASR转写): {asr_text}"})
+    
+    if image_urls:
+        content_parts.append({"type": "text", "text": "\n用户上传的素材图片（请仔细分析这些图片中的产品特征）:"})
+        for url in image_urls:
+            content_parts.append({
+                "type": "image_url",
+                "image_url": {"url": url}
+            })
+
+    messages = [
+        # Note: Some vision models might not support 'system' role with images well, 
+        # but Doubao usually follows standard chat structure.
+        # If system prompt fails, prepend it to user content.
+        {"role": "system", "content": ANALYZE_SYSTEM_PROMPT},
+        {"role": "user", "content": content_parts}
+    ]
+
+    try:
+        # Use Vision Model for Analysis
+        response = client.chat.completions.create(
+            model=config.VISION_MODEL_ID, 
+            messages=messages,
+            temperature=0.7,
+            max_tokens=4000
+        )
+        
+        content = response.choices[0].message.content.strip()
+        if content.startswith("```"):
+            parts = content.split("```")
+            if len(parts) > 1:
+                content = parts[1]
+            if content.startswith("json"): content = content[4:]
+            
+        return json.loads(content)
+        
+    except Exception as e:
+        logger.error(f"Brain Analyze Error: {e}")
+        raise
+
+
+# ============================================================
+# Stage 2: Refine Brief with Answers
+# ============================================================
+
+# System prompt for Stage 2 (sent by refine_brief). Written in Chinese.
+# It asks the model to merge the original request, the analysis result and the
+# user's answers into a creative brief, keeping the user's chosen style as the
+# core direction, and to reply with JSON containing "brief", "creative_summary"
+# and "ready".
+REFINE_SYSTEM_PROMPT = """你是短视频创作总监。
+根据原始需求、AI分析结果、用户补充回答，整合为完整的创意简报。
+
+注意：用户选择的风格偏好（如ASMR、剧情、视觉流等）必须作为核心创作方向贯穿整个简报。
+
+输出JSON:
+{
+  "brief": {
+    "product": "产品名称",
+    "product_visual_description": "产品视觉描述（颜色、形状、包装、质感等，用于后续图片生成）",
+    "selling_points": ["卖点1", "卖点2"],
+    "target_audience": "目标人群",
+    "platform": "投放平台",
+    "style": "视频风格（必须明确，如ASMR/剧情/视觉流等）",
+    "style_requirements": "该风格的具体创作要求（如ASMR需要：开盖声、质感特写、无人脸等）",
+    "creativity_level": "创意程度",
+    "reference": "对标账号/竞品",
+    "user_assets_description": "用户上传素材的描述（用于后续继承）"
+  },
+  "creative_summary": "整体创意概述（50字以内，描述这个视频的核心创意方向）",
+  "ready": true
+}
+"""
+
+def refine_brief(
+    original_prompt: str,
+    analysis: Dict[str, Any],
+    answers: Dict[str, Any],
+    image_urls: List[str] = None
+) -> Dict[str, Any]:
+    """
+    Integrate user answers into a complete creative brief.
+    """
+    logger.info("Brain: Refining brief with answers...")
+    
+    user_content = f"""
+原始需求: {original_prompt}
+
+AI分析结果: {json.dumps(analysis, ensure_ascii=False)}
+
+用户补充回答: {json.dumps(answers, ensure_ascii=False)}
+
+用户上传的素材URL: {json.dumps(image_urls or [], ensure_ascii=False)}
+"""
+
+    try:
+        # Use Text LLM for reasoning/refining if no new images involved
+        # But to keep it simple, we can stick to BRAIN_MODEL_ID (Doubao Pro)
+        response = client.chat.completions.create(
+            model=config.BRAIN_MODEL_ID,
+            messages=[
+                {"role": "system", "content": REFINE_SYSTEM_PROMPT},
+                {"role": "user", "content": user_content}
+            ],
+            temperature=0.5,
+            max_tokens=3000
+        )
+        
+        content = response.choices[0].message.content.strip()
+        if content.startswith("```"):
+            parts = content.split("```")
+            if len(parts) > 1:
+                content = parts[1]
+            if content.startswith("json"): content = content[4:]
+            
+        return json.loads(content)
+        
+    except Exception as e:
+        logger.error(f"Brain Refine Error: {e}")
+        raise
+
+
+# ============================================================
+# Stage 3: Generate Script
+# ============================================================
+
+# System prompt template for Stage 3 (used by generate_script). Written in
+# Chinese. The literal token {style} is a placeholder that generate_script
+# substitutes with str.replace; the template also contains literal JSON braces.
+# It specifies the script structure (creative_summary, hook, scenes, cta), the
+# fields every scene must carry, the voiceover rules, and a strict JSON reply.
+SCRIPT_SYSTEM_PROMPT = """你是顶级短视频编导，专精{style}风格内容创作。
+
+根据创意简报生成爆款脚本。必须严格遵循用户选择的风格要求。
+
+脚本结构要求：
+1. creative_summary: 整体创意概述（这条视频的核心创意是什么）
+2. hook: 前3秒钩子设计（必须抓眼球，符合{style}风格）
+3. scenes: 3-8个分镜
+4. cta: 结尾行动号召（纯文本字符串）
+
+每个分镜(scene)必须包含：
+- id: 分镜编号
+- duration: 时长(5/10/15秒，符合视频模型参数)
+- timeline: 时间轴 (如 "0:00-0:05")
+- image_prompt: 【关键】用于AI生图的详细英文prompt，必须包含：
+    * 产品的具体视觉描述（继承自brief中的product_visual_description）
+    * 8k, hyper-realistic, cinematic lighting
+    * 色调、环境、构图、焦点
+    * 风格要求（如ASMR需要：macro shot, satisfying texture, no human face）
+- keyframe: {
+    "color_tone": "色调",
+    "environment": "环境/背景",
+    "foreground": "前景元素",
+    "focus": "视觉焦点",
+    "subject": "主体描述",
+    "composition": "构图方式"
+  }
+- camera_movement: 运镜描述（如：slow zoom in, pan left, static）
+- story_beat: 这个分镜在整体故事中的作用
+- voiceover: 旁白文字（{style}风格，如ASMR应简短或无旁白，用音效代替）
+- sound_design: 音效设计（如：开盖声、水滴声、环境白噪音）
+- rhythm: {"change": "保持/加快/放慢", "multiplier": 1.0}
+
+旁白要求：
+- 必须连贯，形成完整的叙事
+- 符合{style}风格（ASMR风格应极简或无旁白）
+- 每句旁白要能独立成句，但连起来是完整故事
+
+输出严格JSON格式。
+"""
+
+def generate_script(
+    brief: Dict[str, Any],
+    image_urls: List[str] = None,
+    regenerate_feedback: str = ""
+) -> Dict[str, Any]:
+    """
+    Generate complete video script with scenes.
+    """
+    logger.info("Brain: Generating script...")
+    
+    style = brief.get("style", "现代广告")
+    system_prompt = SCRIPT_SYSTEM_PROMPT.replace("{style}", style)
+    
+    content_parts = [{"type": "text", "text": f"创意简报: {json.dumps(brief, ensure_ascii=False)}"}]
+    
+    if regenerate_feedback:
+        content_parts.append({"type": "text", "text": f"\n用户反馈(请据此调整): {regenerate_feedback}"})
+    
+    if image_urls:
+        content_parts.append({"type": "text", "text": "\n用户上传的参考素材（生成的image_prompt必须参考这些素材中的产品外观）:"})
+        for url in image_urls:
+            content_parts.append({
+                "type": "image_url",
+                "image_url": {"url": url}
+            })
+
+    try:
+        response = client.chat.completions.create(
+            model=config.VISION_MODEL_ID, # Use Vision model to see reference images if available
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": content_parts}
+            ],
+            temperature=0.8,
+            max_tokens=8000
+        )
+        
+        content = response.choices[0].message.content.strip()
+        if content.startswith("```"):
+            parts = content.split("```")
+            if len(parts) > 1:
+                content = parts[1]
+            if content.startswith("json"): content = content[4:]
+            
+        return json.loads(content)
+        
+    except Exception as e:
+        logger.error(f"Brain Script Error: {e}")
+        raise
+
+
+# ============================================================
+# Stage 4: Regenerate Single Scene
+# ============================================================
+
+def regenerate_scene(
+    full_script: Dict[str, Any],
+    scene_id: int,
+    feedback: str,
+    brief: Dict[str, Any] = None
+) -> Dict[str, Any]:
+    """
+    Regenerate a single scene based on feedback.
+    """
+    logger.info(f"Brain: Regenerating scene {scene_id}...")
+    
+    style = brief.get("style", "现代广告") if brief else "现代广告"
+    
+    system_prompt = f"""你是短视频编导，专精{style}风格。根据用户反馈重新生成指定分镜。
+保持与其他分镜的风格连贯性。
+image_prompt必须继承产品的视觉描述。
+只输出新的scene对象(JSON)。
+"""
+
+    user_content = f"""
+完整脚本: {json.dumps(full_script, ensure_ascii=False)}
+
+创意简报: {json.dumps(brief, ensure_ascii=False) if brief else "无"}
+
+需要重新生成的分镜ID: {scene_id}
+
+用户反馈: {feedback}
+"""
+
+    try:
+        response = client.chat.completions.create(
+            model=config.BRAIN_MODEL_ID,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_content}
+            ],
+            temperature=0.8,
+            max_tokens=2000
+        )
+        
+        content = response.choices[0].message.content.strip()
+        if content.startswith("```"):
+            parts = content.split("```")
+            if len(parts) > 1:
+                content = parts[1]
+            if content.startswith("json"): content = content[4:]
+            
+        return json.loads(content)
+        
+    except Exception as e:
+        logger.error(f"Brain Regenerate Scene Error: {e}")
+        raise