# video-flow/modules/script_gen.py

"""
脚本生成模块 (Gemini-3-Pro)
负责解析商品信息，生成分镜脚本
"""
import base64
import json
import logging
import os
import requests
from typing import Dict, Any, List, Optional
from pathlib import Path

import config
from modules.db_manager import db

logger = logging.getLogger(__name__)

class ScriptGenerator:
    """分镜脚本生成器"""

    def __init__(self) -> None:
        """Set up the API key, the Gemini endpoint URL and the built-in default system prompt.

        The key is read from ``config.SHUBIAOBIAO_KEY``. The default prompt is
        only a fallback: ``generate_script`` asks the database for an override
        first and passes this text as the default value.
        """
        self.api_key = config.SHUBIAOBIAO_KEY
        # NOTE: the API URL may need to be adapted to the exact path used for gemini-3-pro-preview.
        # Per the demo: https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent
        # For now we assume the base_url is v1beta/models/
        self.endpoint = "https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent"

        # Default system prompt (runtime text sent to the model; do not edit casually)
        self.default_system_prompt = """
你是一个专业的抖音电商短视频导演。请根据提供的商品信息和图片，设计一个高转化率的商品详情页首图视频脚本。

## 目标
- 提升商品详情页的 GPM 和下单转化率
- 视频时长 9-12 秒 (由 3-4 个分镜组成)
- **每个分镜时长固定为 3 秒** (duration: 3)，不要超过 3 秒
- 必须包含：目标人群分析、卖点提炼、分镜设计

## 分镜设计原则
1. **单分镜单主体**：每个分镜聚焦一个视觉主体或动作，避免复杂运镜，因为 AI 生视频在长时间（>3秒）容易出现画面异常。
2. **旁白跨分镜**：一段完整的旁白/卖点可以跨越多个分镜。在 voiceover_timeline 中，通过 start_time 和 duration (秒) 控制旁白的绝对时间位置，无需与分镜一一对应。
3. **节奏感**：分镜之间保持视觉连贯，通过景别变化（特写 -> 中景 -> 全景）制造节奏。
4. **语速控制**：旁白语速约 4 字/秒，12字旁白约需 3 秒。

## 输出格式要求 (JSON)
必须严格遵守以下 JSON 结构：
{
  "product_name": "商品名称",
  "visual_anchor": "商品视觉锚点：材质+颜色+形状+包装特征（用于保持生图一致性）",
  "selling_points": ["卖点1", "卖点2"],
  "target_audience": "目标人群描述",
  "video_style": "视频风格关键词",
  "bgm_style": "BGM风格关键词",
  "voiceover_timeline": [
    {
      "id": 1,
      "text": "旁白文案片段1（可横跨多个分镜）",
      "subtitle": "字幕文案1 (简短有力)",
      "start_time": 0.0,
      "duration": 3.0
    },
    {
      "id": 2,
      "text": "旁白文案片段2",
      "subtitle": "字幕文案2",
      "start_time": 3.5,
      "duration": 2.5
    }
  ],
  "scenes": [
    {
      "id": 1,
      "duration": 3,
      "visual_prompt": "详细的画面描述，用于AI生图，包含主体、背景、构图、光影。英文描述。",
      "video_prompt": "详细的动效描述，用于AI图生视频。英文描述。",
      "fancy_text": {
        "text": "花字文案 (最多6字)",
        "style": "highlight",
        "position": "center",
        "start_time": 0.5,
        "duration": 2.0
      }
    }
  ]
}

## 注意事项
1. **visual_prompt**:
   - 必须是英文。
   - 描述要具体，例如 "Close-up shot of a hair clip, soft lighting, minimalist background".
   - **CRITICAL**: 禁止 AI 额外生成装饰性文字、标语、水印。但必须保留商品包装自带的文字和 Logo（这是商品真实外观的一部分）。
   - 正确写法: "Product front view, keep original packaging design --no added text --no watermarks"
   - **EMPHASIS**: Strictly follow the appearance of the product in the reference images.
2. **video_prompt**: 必须是英文，描述动作，例如 "Slow zoom in, the hair clip rotates slightly"。注意保持动作简单，避免复杂运镜和人体动作。
3. **voiceover_timeline**:
   - 这是整个视频的旁白和字幕时间轴，独立于分镜。
   - `start_time` 是旁白开始的绝对时间 (秒)，`duration` 是旁白持续时长 (秒)。
   - **一段旁白可以横跨多个分镜**，例如：总时长 9 秒 (3 个分镜)，一段旁白从 start_time=0，duration=5，则覆盖前两个分镜。
   - 两段旁白之间留 0.3-0.5 秒间隙（气口）。
4. **fancy_text**:
   - 花字要精简（最多 6 字），突出卖点。
   - **Style Selection**:
     - `highlight`: 默认样式，适合通用卖点 (Yellow/Black)。
     - `warning`: 强调痛点或食欲 (Red/White)。
     - `price`: 价格显示 (Big Red)。
     - `bubble`: 旁白补充或用户评价 (Bubble)。
     - `minimal`: 高级感，适合时尚类 (Thin/White)。
     - `tech`: 数码类 (Cyan/Glow)。
   - `position` 默认 `center`，可选 top/bottom/top-left/bottom-right 等。
5. **场景连贯性**: 确保分镜之间的逻辑和视觉风格连贯。每个分镜 duration 必须为 3。
"""

    def _encode_image(self, image_path: str) -> str:
        """读取图片并转为 Base64"""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def generate_script(
        self,
        product_name: str,
        product_info: Dict[str, Any],
        image_paths: Optional[List[str]] = None,
        model_provider: str = "shubiaobiao"  # "shubiaobiao" or "doubao"
    ) -> Optional[Dict[str, Any]]:
        """
        Generate a storyboard script for a product.

        Args:
            product_name: Product name, embedded in the user prompt.
            product_info: Product attributes; rendered into the user prompt by
                ``_build_user_prompt``.
            image_paths: Optional local image paths. Only the first 10 entries
                are considered and paths that do not exist are skipped.
            model_provider: ``"doubao"`` delegates to
                ``_generate_script_doubao``; any other value uses the
                Gemini-style endpoint stored in ``self.endpoint``.

        Returns:
            The parsed script dict with an extra ``"_debug"`` entry (prompts,
            raw model output, provider), or ``None`` when the request fails or
            no JSON can be extracted. Request/parse errors are logged, not raised.
        """
        import mimetypes

        logger.info(f"Generating script for: {product_name} (Provider: {model_provider})")

        # 1. Build the prompts (a system prompt stored in the database takes precedence)
        system_prompt = db.get_config("prompt_script_gen", self.default_system_prompt)
        user_prompt = self._build_user_prompt(product_name, product_info)

        if model_provider == "doubao":
            return self._generate_script_doubao(system_prompt, user_prompt, image_paths)

        # Debug aid: record whether a custom prompt is in use
        if system_prompt != self.default_system_prompt:
            logger.info("Using CUSTOM system prompt from database")
        else:
            logger.info("Using DEFAULT system prompt")

        # 2. Build the request payload. The system prompt is sent as the first
        #    text part of the user message, ahead of the user prompt and images.
        user_parts: List[Dict[str, Any]] = [{"text": system_prompt}, {"text": user_prompt}]

        for path in (image_paths or [])[:10]:  # cap at 10 images
            if not Path(path).exists():
                continue
            try:
                b64_img = self._encode_image(path)
            except Exception as e:
                # Best effort: one unreadable image must not abort the request.
                logger.warning(f"Failed to encode image {path}: {e}")
                continue
            # Declare the real image type instead of always claiming JPEG;
            # fall back to JPEG when the extension is unknown or not an image.
            guessed_type, _ = mimetypes.guess_type(str(path))
            if not guessed_type or not guessed_type.startswith("image/"):
                guessed_type = "image/jpeg"
            user_parts.append({
                "inline_data": {
                    "mime_type": guessed_type,
                    "data": b64_img
                }
            })

        payload = {
            "contents": [{"role": "user", "parts": user_parts}],
            "generationConfig": {
                "response_mime_type": "application/json",
                "temperature": 0.7
            }
        }

        headers = {
            "x-goog-api-key": self.api_key,
            "Content-Type": "application/json"
        }

        # 3. Call the API
        response = None
        try:
            response = requests.post(self.endpoint, headers=headers, json=payload, timeout=60)
            response.raise_for_status()

            result = response.json()

            # 4. Parse the result
            if not ("candidates" in result and result["candidates"]):
                logger.error(f"No candidates in response: {result}")
                return None

            # Join every text part rather than reading only parts[0]: the answer
            # may be split across parts or preceded by a non-text part.
            # NOTE(review): parts flagged "thought" are skipped on the assumption
            # that they carry model reasoning, not the answer - confirm with the API docs.
            parts = result["candidates"][0]["content"]["parts"]
            content_text = "".join(
                part["text"]
                for part in parts
                if "text" in part and not part.get("thought")
            )

            # Pull the JSON out of the reply (handles Markdown fences or plain text)
            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error(f"Failed to extract JSON from response: {content_text[:500]}...")
                return None

            final_script = self._validate_and_fix_script(script_json)

            # Attach debug info, including the raw model output
            final_script["_debug"] = {
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "raw_output": content_text,
                "provider": "shubiaobiao"
            }
            return final_script

        except Exception as e:
            # Top-level boundary: callers expect None on any failure.
            logger.error(f"Script generation failed: {e}")
            if response is not None:
                logger.error(f"Response content: {response.text}")
            return None

    def _generate_script_doubao(
        self,
        system_prompt: str,
        user_prompt: str,
        image_paths: Optional[List[str]]
    ) -> Optional[Dict[str, Any]]:
        """Generate the script through the Doubao (Volcengine Ark) multimodal chat API.

        Args:
            system_prompt: Sent as the ``system`` message.
            user_prompt: Sent as the text item of the ``user`` message, after any images.
            image_paths: Optional local image paths. Only the first 5 entries
                are considered and paths that do not exist are skipped.

        Returns:
            The parsed script dict with an extra ``"_debug"`` entry, or ``None``
            when the request fails or no JSON can be extracted. Errors are
            logged, not raised.
        """
        import mimetypes

        # The sample request originally supplied for this integration targeted
        # /api/v3/responses with an "input" / "input_image" / "input_text" payload.
        # This implementation uses the OpenAI-style /chat/completions endpoint
        # instead; no fallback to /responses is implemented.
        endpoint = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"

        user_content: List[Dict[str, Any]] = []

        # Images are embedded as base64 data URLs in "image_url" items.
        for path in (image_paths or [])[:5]:  # cap at 5 images
            if not os.path.exists(path):
                continue
            try:
                b64_img = self._encode_image(path)
            except Exception as e:
                # Best effort: one unreadable image must not abort the request.
                logger.warning(f"Failed to encode image for Doubao: {e}")
                continue
            # Declare the real image type instead of always claiming JPEG;
            # fall back to JPEG when the extension is unknown or not an image.
            guessed_type, _ = mimetypes.guess_type(str(path))
            if not guessed_type or not guessed_type.startswith("image/"):
                guessed_type = "image/jpeg"
            user_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:{guessed_type};base64,{b64_img}"
                }
            })

        # The text prompt goes after the images.
        user_content.append({"type": "text", "text": user_prompt})

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ]

        payload = {
            "model": config.DOUBAO_SCRIPT_MODEL,
            "messages": messages,
            "stream": False,
            # TODO: consider "response_format": {"type": "json_object"} if the model supports JSON mode
        }

        headers = {
            "Authorization": f"Bearer {config.VOLC_API_KEY}",
            "Content-Type": "application/json"
        }

        resp = None
        try:
            resp = requests.post(endpoint, headers=headers, json=payload, timeout=120)

            if resp.status_code != 200:
                # The previous message promised a retry against another endpoint
                # that was never implemented; say what actually happens.
                logger.warning(
                    f"Doubao Chat API failed ({resp.status_code}); no fallback endpoint is implemented"
                )
                # Raises for 4xx/5xx; other non-200 statuses fall through to parsing.
                resp.raise_for_status()

            result = resp.json()
            content_text = result["choices"][0]["message"]["content"]

            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error(f"Failed to extract JSON from Doubao response: {content_text[:500]}...")
                return None

            final_script = self._validate_and_fix_script(script_json)
            final_script["_debug"] = {
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "raw_output": content_text,
                "provider": "doubao"
            }
            return final_script

        except Exception as e:
            # Top-level boundary: callers expect None on any failure.
            logger.error(f"Doubao script generation failed: {e}")
            if resp is not None:
                logger.error(f"Response: {resp.text}")
            return None

    def _extract_json_from_response(self, text: str) -> Optional[Dict]:
        """
        Extract a JSON object from a raw API response string.

        Strategies are tried in order and the first one yielding a JSON
        *object* (dict) wins:
        1. The whole text is JSON.
        2. A Markdown block fenced with ```json ... ```.
        3. A Markdown block fenced with plain ``` ... ```.
        4. The span from the first "{" to the last "}".
        5. The first decodable object starting at any "{" (copes with
           trailing prose that itself contains a "}").

        Args:
            text: Raw model output. Non-string input (e.g. ``None``) is
                treated as "nothing to extract".

        Returns:
            The decoded dict, or ``None`` when no JSON object can be found.
            Valid JSON that is not an object (list, number, ...) is not returned.
        """
        import re

        if not isinstance(text, str):
            return None

        def _load_object(candidate: str):
            """Return (dict_or_None, decode_error_or_None) for one candidate string."""
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError as exc:
                return None, exc
            return (parsed if isinstance(parsed, dict) else None), None

        # Strategy 1: the whole response is JSON
        found, _ = _load_object(text.strip())
        if found is not None:
            return found

        # Strategy 2: ```json ... ``` fenced block
        json_block_match = re.search(r'```json\s*([\s\S]*?)\s*```', text)
        if json_block_match:
            found, error = _load_object(json_block_match.group(1))
            if found is not None:
                return found
            if error is not None:
                logger.warning(f"JSON block found but parse failed: {error}")

        # Strategy 3: ``` ... ``` fenced block without a language tag
        code_block_match = re.search(r'```\s*([\s\S]*?)\s*```', text)
        if code_block_match:
            found, _ = _load_object(code_block_match.group(1))
            if found is not None:
                return found

        # Strategy 4: everything between the first "{" and the last "}"
        first_brace = text.find('{')
        last_brace = text.rfind('}')
        span_error = None
        if first_brace != -1 and last_brace > first_brace:
            found, span_error = _load_object(text[first_brace:last_brace + 1])
            if found is not None:
                return found

        # Strategy 5: decode the first complete object starting at some "{",
        # ignoring whatever follows it.
        decoder = json.JSONDecoder()
        position = first_brace
        while position != -1:
            try:
                candidate, _ = decoder.raw_decode(text[position:])
            except json.JSONDecodeError:
                position = text.find('{', position + 1)
                continue
            if isinstance(candidate, dict):
                return candidate
            position = text.find('{', position + 1)

        # Only report the brace failure once every strategy has been exhausted.
        if span_error is not None:
            logger.warning(f"Brace extraction failed: {span_error}")
        return None

    def _build_user_prompt(self, product_name: str, product_info: Dict[str, Any]) -> str:
        """Render the product name and attributes into the user prompt text.

        Every ``product_info`` entry becomes a ``- key: value`` bullet, except
        ``uploaded_images`` and ``style_hint``. A non-empty ``style_hint`` is
        appended as its own merchant-requirements section.
        """
        hidden_keys = ("uploaded_images", "style_hint")

        bullet_lines = []
        for key, value in product_info.items():
            if key in hidden_keys:
                continue
            bullet_lines.append(f"- {key}: {value}")
        info_str = "\n".join(bullet_lines)

        sections = [f"\n商品名称：{product_name}\n商品信息：\n{info_str}\n"]

        # Merchant preference hint, only when one was supplied
        style_hint = product_info.get("style_hint", "")
        if style_hint:
            sections.append(f"\n## 商家特别要求\n{style_hint}\n")

        sections.append("\n请根据以上信息设计视频脚本。")
        return "".join(sections)

    def _validate_and_fix_script(self, script: Dict[str, Any]) -> Dict[str, Any]:
        """Make sure the script has a ``"scenes"`` entry and return it.

        This is a minimal check: a missing ``"scenes"`` key is filled with an
        empty list, in place. Nothing else is inspected or changed.
        """
        if "scenes" in script:
            return script
        script["scenes"] = []
        return script