feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions
--- a/modules/script_gen.py
+++ b/modules/script_gen.py
@@ -0,0 +1,390 @@
+"""
+脚本生成模块 (Gemini-3-Pro)
+负责解析商品信息，生成分镜脚本
+"""
+import base64
+import json
+import logging
+import os
+import requests
+from typing import Dict, Any, List, Optional
+from pathlib import Path
+
+import config
+from modules.db_manager import db
+
+logger = logging.getLogger(__name__)
+
+class ScriptGenerator:
+    """分镜脚本生成器"""
+
+    def __init__(self):
+        """Set up the API key, the Gemini endpoint and the built-in system prompt.
+
+        ``default_system_prompt`` is the fallback used by ``generate_script``
+        when no ``prompt_script_gen`` override is stored in the database.
+        """
+        self.api_key = config.SHUBIAOBIAO_KEY
+        # NOTE: the API URL may need adapting to the exact gemini-3-pro-preview path.
+        # Per the demo: https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent
+        # For now we assume the base_url is v1beta/models/
+        self.endpoint = "https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent"
+        
+        # Default system prompt (runtime text sent to the model; kept verbatim).
+        self.default_system_prompt = """
+你是一个专业的抖音电商短视频导演。请根据提供的商品信息和图片，设计一个高转化率的商品详情页首图视频脚本。
+
+## 目标
+- 提升商品详情页的 GPM 和下单转化率
+- 视频时长 9-12 秒 (由 3-4 个分镜组成)
+- **每个分镜时长固定为 3 秒** (duration: 3)，不要超过 3 秒
+- 必须包含：目标人群分析、卖点提炼、分镜设计
+
+## 分镜设计原则
+1. **单分镜单主体**：每个分镜聚焦一个视觉主体或动作，避免复杂运镜，因为 AI 生视频在长时间（>3秒）容易出现画面异常。
+2. **旁白跨分镜**：一段完整的旁白/卖点可以跨越多个分镜。在 voiceover_timeline 中，通过 start_time 和 duration (秒) 控制旁白的绝对时间位置，无需与分镜一一对应。
+3. **节奏感**：分镜之间保持视觉连贯，通过景别变化（特写 -> 中景 -> 全景）制造节奏。
+4. **语速控制**：旁白语速约 4 字/秒，12字旁白约需 3 秒。
+
+## 输出格式要求 (JSON)
+必须严格遵守以下 JSON 结构：
+{
+  "product_name": "商品名称",
+  "visual_anchor": "商品视觉锚点：材质+颜色+形状+包装特征（用于保持生图一致性）",
+  "selling_points": ["卖点1", "卖点2"],
+  "target_audience": "目标人群描述",
+  "video_style": "视频风格关键词",
+  "bgm_style": "BGM风格关键词",
+  "voiceover_timeline": [
+    {
+      "id": 1,
+      "text": "旁白文案片段1（可横跨多个分镜）",
+      "subtitle": "字幕文案1 (简短有力)",
+      "start_time": 0.0,
+      "duration": 3.0
+    },
+    {
+      "id": 2,
+      "text": "旁白文案片段2",
+      "subtitle": "字幕文案2",
+      "start_time": 3.5,
+      "duration": 2.5
+    }
+  ],
+  "scenes": [
+    {
+      "id": 1,
+      "duration": 3,
+      "visual_prompt": "详细的画面描述，用于AI生图，包含主体、背景、构图、光影。英文描述。",
+      "video_prompt": "详细的动效描述，用于AI图生视频。英文描述。",
+      "fancy_text": {
+        "text": "花字文案 (最多6字)",
+        "style": "highlight",
+        "position": "center",
+        "start_time": 0.5,
+        "duration": 2.0
+      }
+    }
+  ]
+}
+
+## 注意事项
+1. **visual_prompt**: 
+   - 必须是英文。
+   - 描述要具体，例如 "Close-up shot of a hair clip, soft lighting, minimalist background".
+   - **CRITICAL**: 禁止 AI 额外生成装饰性文字、标语、水印。但必须保留商品包装自带的文字和 Logo（这是商品真实外观的一部分）。
+   - 正确写法: "Product front view, keep original packaging design --no added text --no watermarks"
+   - **EMPHASIS**: Strictly follow the appearance of the product in the reference images.
+2. **video_prompt**: 必须是英文，描述动作，例如 "Slow zoom in, the hair clip rotates slightly"。注意保持动作简单，避免复杂运镜和人体动作。
+3. **voiceover_timeline**:
+   - 这是整个视频的旁白和字幕时间轴，独立于分镜。
+   - `start_time` 是旁白开始的绝对时间 (秒)，`duration` 是旁白持续时长 (秒)。
+   - **一段旁白可以横跨多个分镜**，例如：总时长 9 秒 (3 个分镜)，一段旁白从 start_time=0，duration=5，则覆盖前两个分镜。
+   - 两段旁白之间留 0.3-0.5 秒间隙（气口）。
+4. **fancy_text**: 
+   - 花字要精简（最多 6 字），突出卖点。
+   - **Style Selection**:
+     - `highlight`: 默认样式，适合通用卖点 (Yellow/Black)。
+     - `warning`: 强调痛点或食欲 (Red/White)。
+     - `price`: 价格显示 (Big Red)。
+     - `bubble`: 旁白补充或用户评价 (Bubble)。
+     - `minimal`: 高级感，适合时尚类 (Thin/White)。
+     - `tech`: 数码类 (Cyan/Glow)。
+   - `position` 默认 `center`，可选 top/bottom/top-left/bottom-right 等。
+5. **场景连贯性**: 确保分镜之间的逻辑和视觉风格连贯。每个分镜 duration 必须为 3。
+"""
+
+    def _encode_image(self, image_path: str) -> str:
+        """读取图片并转为 Base64"""
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def generate_script(
+        self, 
+        product_name: str, 
+        product_info: Dict[str, Any], 
+        image_paths: List[str] = None,
+        model_provider: str = "shubiaobiao" # "shubiaobiao" or "doubao"
+    ) -> Dict[str, Any]:
+        """
+        生成分镜脚本
+        """
+        logger.info(f"Generating script for: {product_name} (Provider: {model_provider})")
+        
+        # 1. 构造 Prompt (优先从数据库读取配置)
+        system_prompt = db.get_config("prompt_script_gen", self.default_system_prompt)
+        user_prompt = self._build_user_prompt(product_name, product_info)
+        
+        # Branch for Doubao
+        if model_provider == "doubao":
+            return self._generate_script_doubao(system_prompt, user_prompt, image_paths)
+
+        # ... Existing Shubiaobiao Logic ...
+        
+        # 调试: 检查是否使用了自定义 Prompt
+        if system_prompt != self.default_system_prompt:
+            logger.info("Using CUSTOM system prompt from database")
+        else:
+            logger.info("Using DEFAULT system prompt")
+        
+        # 2. 构造请求 Payload (Gemini/Shubiaobiao)
+        contents = []
+        
+        # User message parts
+        user_parts = [{"text": user_prompt}]
+        
+        # 添加图片 (Multimodal input)
+        if image_paths:
+            for path in image_paths[:10]: # 限制10张，Gemini-3-Pro 支持多图
+                if Path(path).exists():
+                    try:
+                        b64_img = self._encode_image(path)
+                        user_parts.append({
+                            "inline_data": {
+                                "mime_type": "image/jpeg", # 假设是 JPG/PNG
+                                "data": b64_img
+                            }
+                        })
+                    except Exception as e:
+                        logger.warning(f"Failed to encode image {path}: {e}")
+
+        contents.append({
+            "role": "user",
+            "parts": user_parts
+        })
+        
+        # System instruction (Gemini 支持 system instruction 或者是放在 user prompt 前)
+        user_parts.insert(0, {"text": system_prompt})
+
+        payload = {
+            "contents": contents,
+            "generationConfig": {
+                "response_mime_type": "application/json",
+                "temperature": 0.7
+            }
+        }
+
+        headers = {
+            "x-goog-api-key": self.api_key,
+            "Content-Type": "application/json"
+        }
+
+        # 3. 调用 API
+        try:
+            response = requests.post(self.endpoint, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+            
+            result = response.json()
+            
+            # 4. 解析结果
+            if "candidates" in result and result["candidates"]:
+                content_text = result["candidates"][0]["content"]["parts"][0]["text"]
+                
+                # 提取 JSON 部分 (处理 Markdown 代码块或纯文本)
+                script_json = self._extract_json_from_response(content_text)
+                
+                if script_json is None:
+                    logger.error(f"Failed to extract JSON from response: {content_text[:500]}...")
+                    return None
+                    
+                final_script = self._validate_and_fix_script(script_json)
+                
+                # Add Debug Info (包含原始输出)
+                final_script["_debug"] = {
+                    "system_prompt": system_prompt,
+                    "user_prompt": user_prompt,
+                    "raw_output": content_text,
+                    "provider": "shubiaobiao"
+                }
+                return final_script
+            else:
+                logger.error(f"No candidates in response: {result}")
+                return None
+                
+        except Exception as e:
+            logger.error(f"Script generation failed: {e}")
+            if 'response' in locals():
+                logger.error(f"Response content: {response.text}")
+            return None
+
+    def _generate_script_doubao(
+        self, 
+        system_prompt: str, 
+        user_prompt: str, 
+        image_paths: List[str]
+    ) -> Dict[str, Any]:
+        """Doubao 脚本生成实现 (Multimodal)"""
+        # User Provided: https://ark.cn-beijing.volces.com/api/v3/responses
+        # But for 'responses' API, structure is specific. Let's try to match user's curl format exactly but adapting content.
+        # User curl uses "input": [{"role": "user", "content": [{"type": "input_image"...}, {"type": "input_text"...}]}]
+        
+        endpoint = "https://ark.cn-beijing.volces.com/api/v3/chat/completions" # Recommend standard Chat API first as 'responses' is usually non-standard or older
+        # However, user explicitly provided /responses curl. Let's try to stick to standard Chat Completions first because Doubao Pro 1.5 is OpenAI compatible.
+        # If that fails or if user insists on the specific structure, we can adapt.
+        # Volcengine 'ep-...' models are usually served via standard /chat/completions.
+        
+        # Let's try standard OpenAI format which Doubao supports perfectly.
+        
+        messages = [
+            {"role": "system", "content": system_prompt}
+        ]
+        
+        user_content = []
+        
+        # Add Images (Doubao Vision supports image_url)
+        if image_paths:
+            for path in image_paths[:5]: # Limit
+                if os.path.exists(path):
+                    # For Volcengine, need to upload or use base64? 
+                    # Standard OpenAI format supports base64 data urls.
+                    # "image_url": {"url": "data:image/jpeg;base64,..."}
+                    try:
+                        b64_img = self._encode_image(path)
+                        user_content.append({
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{b64_img}"
+                            }
+                        })
+                    except Exception as e:
+                        logger.warning(f"Failed to encode image for Doubao: {e}")
+        
+        # Add Text
+        user_content.append({"type": "text", "text": user_prompt})
+        
+        messages.append({
+            "role": "user",
+            "content": user_content
+        })
+        
+        payload = {
+            "model": config.DOUBAO_SCRIPT_MODEL,
+            "messages": messages,
+            "stream": False,
+            # "response_format": {"type": "json_object"} # Try enabling JSON mode if supported
+        }
+        
+        headers = {
+            "Authorization": f"Bearer {config.VOLC_API_KEY}",
+            "Content-Type": "application/json"
+        }
+        
+        try:
+            # Try standard chat/completions first
+            resp = requests.post(endpoint, headers=headers, json=payload, timeout=120)
+            
+            if resp.status_code != 200:
+                # If 404, maybe endpoint is wrong, try the user's 'responses' endpoint? 
+                # But 'responses' usually implies a different payload structure.
+                logger.warning(f"Doubao Chat API failed ({resp.status_code}), trying legacy/custom endpoint...")
+                # Fallback to user provided structure if needed (implement later if this fails)
+                resp.raise_for_status()
+                
+            result = resp.json()
+            content_text = result["choices"][0]["message"]["content"]
+            
+            script_json = self._extract_json_from_response(content_text)
+            
+            if script_json is None:
+                logger.error(f"Failed to extract JSON from Doubao response: {content_text[:500]}...")
+                return None
+                
+            final_script = self._validate_and_fix_script(script_json)
+            final_script["_debug"] = {
+                "system_prompt": system_prompt,
+                "user_prompt": user_prompt,
+                "raw_output": content_text,
+                "provider": "doubao"
+            }
+            return final_script
+            
+        except Exception as e:
+            logger.error(f"Doubao script generation failed: {e}")
+            if 'resp' in locals():
+                logger.error(f"Response: {resp.text}")
+            return None
+
+    def _extract_json_from_response(self, text: str) -> Optional[Dict]:
+        """
+        从 API 响应中提取 JSON 对象
+        支持：
+        1. 纯 JSON 响应
+        2. Markdown 代码块包裹的 JSON (```json ... ```)
+        3. 文本中嵌入的 JSON (找到第一个 { 和最后一个 })
+        """
+        import re
+        
+        # 方法1: 尝试直接解析（纯 JSON 情况）
+        try:
+            return json.loads(text.strip())
+        except json.JSONDecodeError:
+            pass
+        
+        # 方法2: 提取 ```json ... ``` 代码块
+        json_block_match = re.search(r'```json\s*([\s\S]*?)\s*```', text)
+        if json_block_match:
+            try:
+                return json.loads(json_block_match.group(1))
+            except json.JSONDecodeError as e:
+                logger.warning(f"JSON block found but parse failed: {e}")
+        
+        # 方法3: 提取 ``` ... ``` 代码块 (无 json 标记)
+        code_block_match = re.search(r'```\s*([\s\S]*?)\s*```', text)
+        if code_block_match:
+            try:
+                return json.loads(code_block_match.group(1))
+            except json.JSONDecodeError:
+                pass
+        
+        # 方法4: 找到第一个 { 和最后一个 } 之间的内容
+        first_brace = text.find('{')
+        last_brace = text.rfind('}')
+        if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
+            try:
+                return json.loads(text[first_brace:last_brace + 1])
+            except json.JSONDecodeError as e:
+                logger.warning(f"Brace extraction failed: {e}")
+        
+        return None
+
+    def _build_user_prompt(self, product_name: str, product_info: Dict[str, Any]) -> str:
+        # 提取商家偏好提示
+        style_hint = product_info.get("style_hint", "")
+        # 过滤掉不需要展示的字段
+        filtered_info = {k: v for k, v in product_info.items() if k not in ["uploaded_images", "style_hint"]}
+        info_str = "\n".join([f"- {k}: {v}" for k, v in filtered_info.items()])
+        
+        prompt = f"""
+商品名称：{product_name}
+商品信息：
+{info_str}
+"""
+        if style_hint:
+            prompt += f"""
+## 商家特别要求
+{style_hint}
+"""
+        prompt += "\n请根据以上信息设计视频脚本。"
+        return prompt
+
+    def _validate_and_fix_script(self, script: Dict[str, Any]) -> Dict[str, Any]:
+        """校验并修复脚本结构"""
+        # 简单校验，确保必要字段存在
+        if "scenes" not in script:
+            script["scenes"] = []
+        return script