"""
Script generation module (Gemini-3-Pro).

Parses product information (plus optional product images) and asks a
multimodal LLM to produce a storyboard script as a JSON object.
"""
import base64
import json
import logging
import mimetypes
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from openai import OpenAI

import config
from modules.db_manager import db

logger = logging.getLogger(__name__)

# Fenced-code patterns used when digging a JSON object out of an LLM reply.
_JSON_FENCE_RE = re.compile(r'```json\s*([\s\S]*?)\s*```')
_CODE_FENCE_RE = re.compile(r'```\s*([\s\S]*?)\s*```')

# Separators accepted between tags / params (ASCII and full-width variants).
_INFO_SPLIT_RE = re.compile(r"[|;；、,，\n]")

# Words that make a selling point "generic" unless a concrete hint is present.
_GENERIC_WORDS = (
    "好看", "百搭", "高级", "气质", "显白", "好用", "耐看", "时尚", "精致",
    "很棒", "不错", "喜欢", "推荐", "性价比", "划算", "超值", "必入", "绝了",
)
# Substrings that mark a phrase as concrete (material, size, grip, ...).
_CONCRETE_HINTS = (
    "不掉", "牢固", "防滑", "加厚", "大号", "小号", "强力", "耐用", "稳固",
    "不勒", "不伤", "树脂", "金属", "水钻", "夹", "材质", "尺寸", "弹簧",
    "夹力", "发量", "马尾",
)
# Subjective words that disqualify a tag from being used as a candidate.
_SUBJECTIVE_WORDS = ("喜欢", "推荐")

# Default system prompt; can be overridden through the "prompt_script_gen"
# config entry in the database (see ScriptGenerator.generate_script).
_DEFAULT_SYSTEM_PROMPT = """
你是一个专业的抖音电商短视频导演。请根据提供的商品信息和图片,设计一个高转化率的商品详情页首图视频脚本。

## 目标
- 提升商品详情页的 GPM 和下单转化率
- 视频时长 9-12 秒 (由 3-4 个分镜组成)
- **每个分镜时长固定为 3 秒** (duration: 3),不要超过 3 秒
- 必须包含:目标人群分析、卖点提炼、分镜设计

## 分镜设计原则
1. **单分镜单主体**:每个分镜聚焦一个视觉主体或动作,避免复杂运镜。
2. **旁白跨分镜**:一段完整的旁白/卖点可以跨越多个分镜。在 voiceover_timeline 中,通过 start_time 和 duration (秒) 控制旁白的绝对时间位置,无需与分镜一一对应。
3. **节奏感**:分镜之间保持视觉连贯,通过景别变化(特写 -> 中景 -> 全景)制造节奏。
4. **语速控制**:旁白语速约 4 字/秒,12字旁白约需 3 秒。

## 输出格式要求 (JSON)
必须严格遵守以下 JSON 结构:

{
    "product_name": "商品名称",
    "visual_anchor": "商品视觉锚点:材质+颜色+形状+包装特征(用于保持生图一致性)",
    "selling_points": ["卖点1", "卖点2"],
    "target_audience": "目标人群描述",
    "video_style": "视频风格关键词",
    "bgm_style": "BGM风格关键词",
    "voiceover_timeline": [
        {
            "id": 1,
            "text": "旁白文案片段1(可横跨多个分镜)",
            "subtitle": "字幕文案1 (简短有力)",
            "start_time": 0.0,
            "duration": 3.0
        },
        {
            "id": 2,
            "text": "旁白文案片段2",
            "subtitle": "字幕文案2",
            "start_time": 3.5,
            "duration": 2.5
        }
    ],
    "scenes": [
        {
            "id": 1,
            "duration": 3,
            "visual_prompt": "详细的画面描述,用于AI生图,包含主体、背景、构图、光影。英文描述。",
            "video_prompt": "详细的动效描述,用于AI图生视频。英文描述。",
            "fancy_text": {
                "text": "花字文案 (最多6字)",
                "style": "highlight",
                "position": "center",
                "start_time": 0.5,
                "duration": 2.0
            }
        }
    ]
}

## 注意事项
1. **visual_prompt (生图提示)**:
   - 必须是英文。
   - 描述要具体,例如 "Close-up shot of a hair clip on a young woman's hair, soft lighting, minimalist background".
   - **人物出镜规则 (重要)**:
     - 对于穿戴类商品(服装、饰品、发饰、鞋包、眼镜、手表等):**必须包含人物模特**,展示穿戴效果。
       - 例如:发饰 → "A young Asian woman with the hair clip styling her ponytail"
       - 例如:衣服 → "A stylish young woman wearing the dress, full body shot"
       - 例如:包包 → "A fashionable woman carrying the handbag on her shoulder"
     - 对于使用类商品(护肤品、化妆品、电子产品等):**建议包含人物使用场景**。
       - 例如:护肤品 → "Close-up of a woman's hand applying the cream to her face"
     - 对于纯展示类商品(食品、家居摆件等):可以纯产品展示,不强制人物。
   - **禁止 AI 额外生成装饰性文字、标语、水印**。但必须保留商品包装自带的文字和 Logo。
   - **EMPHASIS**: Strictly follow the appearance of the product in the reference images.

2. **video_prompt (视频动效提示)**:
   - 必须是英文。
   - **动作简单化 (重要)**:AI 生视频容易在复杂动作上出现瑕疵,因此:
     - ✅ 推荐动作:slow zoom in/out, subtle camera pan, gentle rotation, soft light flickering, particles floating
     - ✅ 人物推荐:slight head turn, gentle smile, hair flowing softly, holding still with minimal movement
     - ❌ 避免动作:fast motion, walking, running, dancing, hand gestures, complex body movements, drastic camera movements
   - 示例:
     - Good: "Slow zoom in on the hair clip, the woman's hair gently flows in soft breeze"
     - Bad: "The woman shakes her head dramatically to show the clip stays in place"

3. **voiceover_timeline**:
   - 这是整个视频的旁白和字幕时间轴,独立于分镜。
   - `start_time` 是旁白开始的绝对时间 (秒),`duration` 是旁白持续时长 (秒)。
   - **一段旁白可以横跨多个分镜**,例如:总时长 9 秒 (3 个分镜),一段旁白从 start_time=0,duration=5,则覆盖前两个分镜。
   - 两段旁白之间留 0.3-0.5 秒间隙(气口)。

4. **fancy_text**:
   - 花字要精简(最多 6 字),突出卖点。
   - **Style Selection**:
     - `highlight`: 默认样式,适合通用卖点 (Yellow/Black)。
     - `warning`: 强调痛点或食欲 (Red/White)。
     - `price`: 价格显示 (Big Red)。
     - `bubble`: 旁白补充或用户评价 (Bubble)。
     - `minimal`: 高级感,适合时尚类 (Thin/White)。
     - `tech`: 数码类 (Cyan/Glow)。
   - `position` 默认 `center`,可选 top/bottom/top-left/bottom-right 等。

5. **场景连贯性**: 确保分镜之间的逻辑和视觉风格连贯。每个分镜 duration 必须为 3。
"""


class ScriptGenerator:
    """Storyboard script generator backed by several multimodal LLM providers."""

    def __init__(self):
        self.api_key = config.SHUBIAOBIAO_KEY
        # NOTE: the API address may need adapting to the concrete path of
        # gemini-3-pro-preview. Per the demo:
        #   https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent
        # For now the base URL is assumed to be v1beta/models/.
        self.endpoint = "https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent"

        # OpenAI-compatible client for ShuBiaoBiao (supports multiple models incl. GPT)
        self.shubiaobiao_client = OpenAI(
            api_key=config.SHUBIAOBIAO_KEY,
            base_url=config.SHUBIAOBIAO_BASE_URL
        )

        # Default system prompt (used when the database has no override).
        self.default_system_prompt = _DEFAULT_SYSTEM_PROMPT

    def _encode_image(self, image_path: str) -> str:
        """Read the file at *image_path* and return its content as a Base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _guess_image_mime(image_path: Any) -> str:
        """Guess an image MIME type from the file name, defaulting to image/jpeg."""
        mime, _ = mimetypes.guess_type(str(image_path))
        if mime and mime.startswith("image/"):
            return mime
        # Unknown or non-image extension: keep the previous hard-coded value.
        return "image/jpeg"

    def generate_script(
        self,
        product_name: str,
        product_info: Dict[str, Any],
        image_paths: Optional[List[str]] = None,
        model_provider: str = "shubiaobiao"  # "shubiaobiao", "shubiaobiao_gpt" or "doubao"
    ) -> Optional[Dict[str, Any]]:
        """
        Generate a storyboard script for a product.

        Args:
            product_name: Display name of the product.
            product_info: Free-form product attributes; ``tags``, ``params``,
                ``category`` and ``style_hint`` receive special treatment.
            image_paths: Optional local paths of reference images.
            model_provider: ``"doubao"``, ``"shubiaobiao_gpt"`` or
                ``"shubiaobiao"``; any other value falls back to the native
                Gemini ``generateContent`` endpoint.

        Returns:
            The parsed script dict (with ``selling_points`` post-processed and
            a ``_debug`` entry), or ``None`` when generation or parsing failed.
        """
        logger.info("Generating script for: %s (Provider: %s)", product_name, model_provider)

        # 1. Build the prompts (the system prompt is read from the DB config first).
        system_prompt = db.get_config("prompt_script_gen", self.default_system_prompt)
        user_prompt = self._build_user_prompt(product_name, product_info)

        # Debug aid: report whether a custom prompt is in effect. This used to
        # sit below the provider branches and therefore never ran for the
        # regular providers.
        if system_prompt != self.default_system_prompt:
            logger.info("Using CUSTOM system prompt from database")
        else:
            logger.info("Using DEFAULT system prompt")

        # 2. Dispatch to the provider-specific implementation.
        if model_provider == "doubao":
            # Doubao (Volcengine)
            script = self._generate_script_doubao(system_prompt, user_prompt, image_paths)
        elif model_provider == "shubiaobiao_gpt":
            # ShuBiaoBiao GPT (OpenAI-compatible multimodal)
            script = self._generate_script_shubiaobiao_openai(
                system_prompt, user_prompt, image_paths, model_name="gpt-5.2"
            )
        elif model_provider == "shubiaobiao":
            # ShuBiaoBiao Gemini (OpenAI-compatible; image URLs instead of base64)
            script = self._generate_script_shubiaobiao_openai(
                system_prompt, user_prompt, image_paths, model_name=config.SHUBIAOBIAO_MODEL_TEXT
            )
        else:
            # Fallback (should not normally be reached).
            logger.warning(
                "Unknown model_provider %r; falling back to the native Gemini endpoint",
                model_provider,
            )
            script = self._generate_script_gemini_native(system_prompt, user_prompt, image_paths)

        # 3. Without touching the prompt: lightly normalise the selling points.
        if script:
            script["selling_points"] = self._postprocess_selling_points(
                product_info, script.get("selling_points")
            )
        return script

    def _generate_script_gemini_native(
        self,
        system_prompt: str,
        user_prompt: str,
        image_paths: Optional[List[str]],
    ) -> Optional[Dict[str, Any]]:
        """Call ``self.endpoint`` with a Gemini-style ``contents`` payload and parse the reply."""
        # The system prompt is sent as the first text part of the user message,
        # followed by the user prompt and then the inline images.
        user_parts: List[Dict[str, Any]] = [{"text": system_prompt}, {"text": user_prompt}]

        for path in (image_paths or [])[:10]:  # at most 10 images
            if not path or not Path(path).exists():
                continue
            try:
                user_parts.append({
                    "inline_data": {
                        "mime_type": self._guess_image_mime(path),
                        "data": self._encode_image(path)
                    }
                })
            except Exception as e:
                logger.warning(f"Failed to encode image {path}: {e}")

        payload = {
            "contents": [{"role": "user", "parts": user_parts}],
            "generationConfig": {
                "response_mime_type": "application/json",
                "temperature": 0.7
            }
        }
        headers = {
            "x-goog-api-key": self.api_key,
            "Content-Type": "application/json"
        }

        response = None
        try:
            response = requests.post(self.endpoint, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            result = response.json()

            if not ("candidates" in result and result["candidates"]):
                logger.error(f"No candidates in response: {result}")
                return None

            content_text = result["candidates"][0]["content"]["parts"][0]["text"]

            # Extract the JSON part (handles Markdown fences or plain text).
            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error("Failed to extract JSON from response: %s...", str(content_text)[:500])
                return None

            final_script = self._validate_and_fix_script(script_json)
            # Debug info, including the raw model output.
            final_script["_debug"] = {
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "raw_output": content_text,
                "provider": "shubiaobiao"
            }
            return final_script
        except Exception as e:
            # Best-effort boundary: callers expect None on any failure.
            logger.error(f"Script generation failed: {e}")
            if response is not None:
                logger.error(f"Response content: {response.text}")
            return None

    def _upload_images_to_r2(self, image_paths: List[str], limit: int = 10) -> List[str]:
        """
        Upload up to *limit* existing local images via ``modules.storage``.

        Returns the URLs of the uploads that succeeded; an empty list when
        there is nothing to upload or the storage module cannot be imported.
        """
        urls: List[str] = []
        if not image_paths:
            return urls

        # NOTE: avoid hard import dependency at app startup.
        # If boto3 / storage is not installed on the runtime, we should not crash Streamlit.
        try:
            from modules import storage  # lazy import
        except Exception as e:
            logger.warning(f"R2 upload disabled (storage/boto3 unavailable): {e}")
            return urls

        for p in image_paths[:limit]:
            try:
                if p and Path(p).exists():
                    url = storage.upload_file(str(p))
                    if url:
                        urls.append(url)
            except Exception as e:
                logger.warning(f"Failed to upload script image to R2: {p} ({e})")
        return urls

    def _generate_script_shubiaobiao_openai(
        self,
        system_prompt: str,
        user_prompt: str,
        image_paths: List[str],
        model_name: str,
    ) -> Optional[Dict[str, Any]]:
        """
        ShuBiaoBiao OpenAI-compatible multimodal chat.

        IMPORTANT: For ShuBiaoBiao models, we pass image URLs (R2 public URLs), not base64.

        Returns the parsed script with a ``_debug`` entry, or ``None`` on failure.
        """
        # Images first (URL), then text.
        urls = self._upload_images_to_r2(image_paths or [], limit=10)
        user_content: List[Dict[str, Any]] = [
            {"type": "image_url", "image_url": {"url": url}} for url in urls
        ]
        user_content.append({"type": "text", "text": user_prompt})

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ]

        try:
            resp = self.shubiaobiao_client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=0.7,
            )
            content_text = (resp.choices[0].message.content or "").strip()
            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error(
                    f"Failed to extract JSON from shubiaobiao response({model_name}): {content_text[:500]}..."
                )
                return None

            final_script = self._validate_and_fix_script(script_json)
            final_script["_debug"] = {
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "raw_output": content_text,
                "provider": f"shubiaobiao:{model_name}",
                "image_urls": urls,
            }
            return final_script
        except Exception as e:
            logger.error(f"shubiaobiao script generation failed ({model_name}): {e}")
            return None

    @staticmethod
    def _stringify_info_field(value: Any) -> str:
        """Flatten a tags/params/category value into a ``|``-separated string."""
        if not value:
            return ""
        if isinstance(value, dict):
            return "|".join(f"{k}:{v}" for k, v in value.items())
        if isinstance(value, (list, tuple)):
            return "|".join(str(v) for v in value)
        return str(value)

    def _postprocess_selling_points(self, product_info: Dict[str, Any], selling_points: Any) -> List[str]:
        """
        Engineering-only postprocess (NO prompt change):
        - De-duplicate
        - Prefer specific points derived from tags/params when LLM points are too generic

        Args:
            product_info: Product attributes; only ``tags``, ``params`` and
                ``category`` are read. ``None`` is treated as empty.
            selling_points: Whatever the LLM returned (list, string or other).

        Returns:
            At most five selling points, order-preserving and de-duplicated.
        """
        def _norm_list(v: Any) -> List[str]:
            if isinstance(v, list):
                return [str(x).strip() for x in v if str(x).strip()]
            if isinstance(v, str) and v.strip():
                return [v.strip()]
            return []

        info = product_info or {}
        # Join the fields with a real separator. Joining with a space fused
        # the last tag with the first "key:value" param, and the tag was then
        # lost when only the value part was kept.
        raw = "|".join(
            self._stringify_info_field(info.get(key)) for key in ("tags", "params", "category")
        )

        # Candidate phrases from tags/params/category.
        parts: List[str] = []
        for piece in _INFO_SPLIT_RE.split(raw):
            piece = piece.strip()
            if not piece:
                continue
            # Params look like "key:value" (ASCII or full-width colon): keep
            # the value when there is one, otherwise the whole piece.
            _, colon, value = piece.replace("：", ":").partition(":")
            value = value.strip()
            parts.append(value if colon and value else piece)

        # Heuristic: keep the more concrete phrases.
        candidates: List[str] = []
        seen = set()
        for p in parts:
            if p in seen:
                continue
            seen.add(p)
            if any(h in p for h in _CONCRETE_HINTS) and not any(g in p for g in _SUBJECTIVE_WORDS):
                candidates.append(p)
        # Fallback: keep the first distinct non-empty phrases.
        if not candidates:
            candidates = list(dict.fromkeys(parts))[:8]

        # De-duplicate the LLM points, preserving order.
        out: List[str] = list(dict.fromkeys(_norm_list(selling_points)))
        used = set(out)

        def _is_generic(p: str) -> bool:
            # Generic = contains a generic word and lacks any concrete hint.
            if any(h in p for h in _CONCRETE_HINTS):
                return False
            return any(g in p for g in _GENERIC_WORDS)

        # Replace overly generic points with better candidates. The generator
        # re-checks `used` lazily, so a candidate is never emitted twice.
        cand_iter = (c for c in candidates if c not in used)
        refined: List[str] = []
        for p in out:
            if _is_generic(p):
                c = next(cand_iter, None)
                if c:
                    refined.append(c)
                    used.add(c)
                    continue
            refined.append(p)

        # Ensure at least 3 selling points if possible.
        while len(refined) < 3:
            c = next(cand_iter, None)
            if not c:
                break
            refined.append(c)
            used.add(c)

        return refined[:5]

    def _generate_script_doubao(
        self,
        system_prompt: str,
        user_prompt: str,
        image_paths: List[str]
    ) -> Optional[Dict[str, Any]]:
        """
        Doubao script generation (multimodal).

        Sends an OpenAI-style chat-completions request to the Volcengine Ark
        endpoint, with images embedded as base64 data URLs.

        Returns the parsed script with a ``_debug`` entry, or ``None`` on failure.
        """
        # The user-provided sample used /api/v3/responses, whose payload shape
        # is different ("input" with "input_image"/"input_text" items). The
        # OpenAI-style /chat/completions endpoint is used here instead.
        endpoint = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"

        user_content: List[Dict[str, Any]] = []

        # Images are sent as "image_url" items carrying a base64 data URL.
        for path in (image_paths or [])[:5]:  # limit
            if not path or not Path(path).exists():
                continue
            try:
                b64_img = self._encode_image(path)
                user_content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{self._guess_image_mime(path)};base64,{b64_img}"
                    }
                })
            except Exception as e:
                logger.warning(f"Failed to encode image for Doubao: {e}")

        # Text goes after the images.
        user_content.append({"type": "text", "text": user_prompt})

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ]

        payload = {
            "model": config.DOUBAO_SCRIPT_MODEL,
            "messages": messages,
            "stream": False,
            # NOTE: "response_format": {"type": "json_object"} could be
            # enabled here if the model supports JSON mode.
        }

        headers = {
            "Authorization": f"Bearer {config.VOLC_API_KEY}",
            "Content-Type": "application/json"
        }

        resp = None
        try:
            resp = requests.post(endpoint, headers=headers, json=payload, timeout=120)

            if resp.status_code != 200:
                # No alternative endpoint is implemented; just surface the status.
                logger.warning(
                    "Doubao Chat API failed (%s); no fallback endpoint is implemented",
                    resp.status_code,
                )

            resp.raise_for_status()
            result = resp.json()

            content_text = result["choices"][0]["message"]["content"]
            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error("Failed to extract JSON from Doubao response: %s...", str(content_text)[:500])
                return None

            final_script = self._validate_and_fix_script(script_json)
            final_script["_debug"] = {
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "raw_output": content_text,
                "provider": "doubao"
            }
            return final_script

        except Exception as e:
            # Best-effort boundary: callers expect None on any failure.
            logger.error(f"Doubao script generation failed: {e}")
            if resp is not None:
                logger.error(f"Response: {resp.text}")
            return None

    def _extract_json_from_response(self, text: str) -> Optional[Dict]:
        """
        Extract a JSON object from an API response text.

        Tried in order:
        1. The whole text as JSON
        2. A Markdown fenced block tagged ``json``
        3. Any Markdown fenced block
        4. The span between the first ``{`` and the last ``}``

        Only a JSON *object* is accepted; other JSON values (list, number,
        string) are skipped. Returns ``None`` when nothing usable is found or
        when *text* is empty / not a string.
        """
        if not isinstance(text, str) or not text.strip():
            return None

        # (candidate text, warning template to log on parse failure or None)
        attempts = [(text.strip(), None)]

        json_block_match = _JSON_FENCE_RE.search(text)
        if json_block_match:
            attempts.append((json_block_match.group(1), "JSON block found but parse failed: %s"))

        code_block_match = _CODE_FENCE_RE.search(text)
        if code_block_match:
            attempts.append((code_block_match.group(1), None))

        first_brace = text.find('{')
        last_brace = text.rfind('}')
        if first_brace != -1 and last_brace > first_brace:
            attempts.append((text[first_brace:last_brace + 1], "Brace extraction failed: %s"))

        for candidate, warning in attempts:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError as e:
                if warning:
                    logger.warning(warning, e)
                continue
            if isinstance(parsed, dict):
                return parsed

        return None

    def _build_user_prompt(self, product_name: str, product_info: Dict[str, Any]) -> str:
        """
        Build the user prompt from the product name and attributes.

        ``style_hint`` is rendered as a separate section when non-empty;
        ``uploaded_images`` and ``style_hint`` are left out of the attribute list.
        """
        info = product_info or {}

        # Merchant preference hint.
        style_hint = info.get("style_hint", "")

        # Drop the fields that should not be shown in the attribute list.
        hidden_keys = ("uploaded_images", "style_hint")
        info_str = "\n".join(f"- {k}: {v}" for k, v in info.items() if k not in hidden_keys)

        prompt = f"\n商品名称:{product_name}\n\n商品信息:\n{info_str}\n"

        if style_hint:
            prompt += f"\n## 商家特别要求\n{style_hint}\n"

        prompt += "\n请根据以上信息设计视频脚本。"
        return prompt

    def _validate_and_fix_script(self, script: Dict[str, Any]) -> Dict[str, Any]:
        """Validate the script structure, repairing it in place, and return it."""
        # Minimal check: "scenes" must exist and be a list.
        if not isinstance(script.get("scenes"), list):
            script["scenes"] = []
        return script