- app.py: Streamlit UI for video generation workflow - main_flow.py: CLI tool with argparse support - modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.) - config.py: Configuration with API keys and paths - requirements.txt: Python dependencies - docs/: System prompt documentation
391 lines
16 KiB
Python
391 lines
16 KiB
Python
"""
|
||
脚本生成模块 (Gemini-3-Pro)
|
||
负责解析商品信息,生成分镜脚本
|
||
"""
|
||
import base64
|
||
import json
|
||
import logging
|
||
import os
|
||
import requests
|
||
from typing import Dict, Any, List, Optional
|
||
from pathlib import Path
|
||
|
||
import config
|
||
from modules.db_manager import db
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
class ScriptGenerator:
|
||
"""分镜脚本生成器"""
|
||
|
||
def __init__(self):
|
||
self.api_key = config.SHUBIAOBIAO_KEY
|
||
# 注意:API 地址可能需要适配 gemini-3-pro-preview 的具体路径
|
||
# 根据 demo: https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent
|
||
# 这里我们先假设 base_url 是 v1beta/models/
|
||
self.endpoint = "https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent"
|
||
|
||
# Default System Prompt
|
||
self.default_system_prompt = """
|
||
你是一个专业的抖音电商短视频导演。请根据提供的商品信息和图片,设计一个高转化率的商品详情页首图视频脚本。
|
||
|
||
## 目标
|
||
- 提升商品详情页的 GPM 和下单转化率
|
||
- 视频时长 9-12 秒 (由 3-4 个分镜组成)
|
||
- **每个分镜时长固定为 3 秒** (duration: 3),不要超过 3 秒
|
||
- 必须包含:目标人群分析、卖点提炼、分镜设计
|
||
|
||
## 分镜设计原则
|
||
1. **单分镜单主体**:每个分镜聚焦一个视觉主体或动作,避免复杂运镜,因为 AI 生视频在长时间(>3秒)容易出现画面异常。
|
||
2. **旁白跨分镜**:一段完整的旁白/卖点可以跨越多个分镜。在 voiceover_timeline 中,通过 start_time 和 duration (秒) 控制旁白的绝对时间位置,无需与分镜一一对应。
|
||
3. **节奏感**:分镜之间保持视觉连贯,通过景别变化(特写 -> 中景 -> 全景)制造节奏。
|
||
4. **语速控制**:旁白语速约 4 字/秒,12字旁白约需 3 秒。
|
||
|
||
## 输出格式要求 (JSON)
|
||
必须严格遵守以下 JSON 结构:
|
||
{
|
||
"product_name": "商品名称",
|
||
"visual_anchor": "商品视觉锚点:材质+颜色+形状+包装特征(用于保持生图一致性)",
|
||
"selling_points": ["卖点1", "卖点2"],
|
||
"target_audience": "目标人群描述",
|
||
"video_style": "视频风格关键词",
|
||
"bgm_style": "BGM风格关键词",
|
||
"voiceover_timeline": [
|
||
{
|
||
"id": 1,
|
||
"text": "旁白文案片段1(可横跨多个分镜)",
|
||
"subtitle": "字幕文案1 (简短有力)",
|
||
"start_time": 0.0,
|
||
"duration": 3.0
|
||
},
|
||
{
|
||
"id": 2,
|
||
"text": "旁白文案片段2",
|
||
"subtitle": "字幕文案2",
|
||
"start_time": 3.5,
|
||
"duration": 2.5
|
||
}
|
||
],
|
||
"scenes": [
|
||
{
|
||
"id": 1,
|
||
"duration": 3,
|
||
"visual_prompt": "详细的画面描述,用于AI生图,包含主体、背景、构图、光影。英文描述。",
|
||
"video_prompt": "详细的动效描述,用于AI图生视频。英文描述。",
|
||
"fancy_text": {
|
||
"text": "花字文案 (最多6字)",
|
||
"style": "highlight",
|
||
"position": "center",
|
||
"start_time": 0.5,
|
||
"duration": 2.0
|
||
}
|
||
}
|
||
]
|
||
}
|
||
|
||
## 注意事项
|
||
1. **visual_prompt**:
|
||
- 必须是英文。
|
||
- 描述要具体,例如 "Close-up shot of a hair clip, soft lighting, minimalist background".
|
||
- **CRITICAL**: 禁止 AI 额外生成装饰性文字、标语、水印。但必须保留商品包装自带的文字和 Logo(这是商品真实外观的一部分)。
|
||
- 正确写法: "Product front view, keep original packaging design --no added text --no watermarks"
|
||
- **EMPHASIS**: Strictly follow the appearance of the product in the reference images.
|
||
2. **video_prompt**: 必须是英文,描述动作,例如 "Slow zoom in, the hair clip rotates slightly"。注意保持动作简单,避免复杂运镜和人体动作。
|
||
3. **voiceover_timeline**:
|
||
- 这是整个视频的旁白和字幕时间轴,独立于分镜。
|
||
- `start_time` 是旁白开始的绝对时间 (秒),`duration` 是旁白持续时长 (秒)。
|
||
- **一段旁白可以横跨多个分镜**,例如:总时长 9 秒 (3 个分镜),一段旁白从 start_time=0,duration=5,则覆盖前两个分镜。
|
||
- 两段旁白之间留 0.3-0.5 秒间隙(气口)。
|
||
4. **fancy_text**:
|
||
- 花字要精简(最多 6 字),突出卖点。
|
||
- **Style Selection**:
|
||
- `highlight`: 默认样式,适合通用卖点 (Yellow/Black)。
|
||
- `warning`: 强调痛点或食欲 (Red/White)。
|
||
- `price`: 价格显示 (Big Red)。
|
||
- `bubble`: 旁白补充或用户评价 (Bubble)。
|
||
- `minimal`: 高级感,适合时尚类 (Thin/White)。
|
||
- `tech`: 数码类 (Cyan/Glow)。
|
||
- `position` 默认 `center`,可选 top/bottom/top-left/bottom-right 等。
|
||
5. **场景连贯性**: 确保分镜之间的逻辑和视觉风格连贯。每个分镜 duration 必须为 3。
|
||
"""
|
||
|
||
def _encode_image(self, image_path: str) -> str:
|
||
"""读取图片并转为 Base64"""
|
||
with open(image_path, "rb") as image_file:
|
||
return base64.b64encode(image_file.read()).decode('utf-8')
|
||
|
||
def generate_script(
|
||
self,
|
||
product_name: str,
|
||
product_info: Dict[str, Any],
|
||
image_paths: List[str] = None,
|
||
model_provider: str = "shubiaobiao" # "shubiaobiao" or "doubao"
|
||
) -> Dict[str, Any]:
|
||
"""
|
||
生成分镜脚本
|
||
"""
|
||
logger.info(f"Generating script for: {product_name} (Provider: {model_provider})")
|
||
|
||
# 1. 构造 Prompt (优先从数据库读取配置)
|
||
system_prompt = db.get_config("prompt_script_gen", self.default_system_prompt)
|
||
user_prompt = self._build_user_prompt(product_name, product_info)
|
||
|
||
# Branch for Doubao
|
||
if model_provider == "doubao":
|
||
return self._generate_script_doubao(system_prompt, user_prompt, image_paths)
|
||
|
||
# ... Existing Shubiaobiao Logic ...
|
||
|
||
# 调试: 检查是否使用了自定义 Prompt
|
||
if system_prompt != self.default_system_prompt:
|
||
logger.info("Using CUSTOM system prompt from database")
|
||
else:
|
||
logger.info("Using DEFAULT system prompt")
|
||
|
||
# 2. 构造请求 Payload (Gemini/Shubiaobiao)
|
||
contents = []
|
||
|
||
# User message parts
|
||
user_parts = [{"text": user_prompt}]
|
||
|
||
# 添加图片 (Multimodal input)
|
||
if image_paths:
|
||
for path in image_paths[:10]: # 限制10张,Gemini-3-Pro 支持多图
|
||
if Path(path).exists():
|
||
try:
|
||
b64_img = self._encode_image(path)
|
||
user_parts.append({
|
||
"inline_data": {
|
||
"mime_type": "image/jpeg", # 假设是 JPG/PNG
|
||
"data": b64_img
|
||
}
|
||
})
|
||
except Exception as e:
|
||
logger.warning(f"Failed to encode image {path}: {e}")
|
||
|
||
contents.append({
|
||
"role": "user",
|
||
"parts": user_parts
|
||
})
|
||
|
||
# System instruction (Gemini 支持 system instruction 或者是放在 user prompt 前)
|
||
user_parts.insert(0, {"text": system_prompt})
|
||
|
||
payload = {
|
||
"contents": contents,
|
||
"generationConfig": {
|
||
"response_mime_type": "application/json",
|
||
"temperature": 0.7
|
||
}
|
||
}
|
||
|
||
headers = {
|
||
"x-goog-api-key": self.api_key,
|
||
"Content-Type": "application/json"
|
||
}
|
||
|
||
# 3. 调用 API
|
||
try:
|
||
response = requests.post(self.endpoint, headers=headers, json=payload, timeout=60)
|
||
response.raise_for_status()
|
||
|
||
result = response.json()
|
||
|
||
# 4. 解析结果
|
||
if "candidates" in result and result["candidates"]:
|
||
content_text = result["candidates"][0]["content"]["parts"][0]["text"]
|
||
|
||
# 提取 JSON 部分 (处理 Markdown 代码块或纯文本)
|
||
script_json = self._extract_json_from_response(content_text)
|
||
|
||
if script_json is None:
|
||
logger.error(f"Failed to extract JSON from response: {content_text[:500]}...")
|
||
return None
|
||
|
||
final_script = self._validate_and_fix_script(script_json)
|
||
|
||
# Add Debug Info (包含原始输出)
|
||
final_script["_debug"] = {
|
||
"system_prompt": system_prompt,
|
||
"user_prompt": user_prompt,
|
||
"raw_output": content_text,
|
||
"provider": "shubiaobiao"
|
||
}
|
||
return final_script
|
||
else:
|
||
logger.error(f"No candidates in response: {result}")
|
||
return None
|
||
|
||
except Exception as e:
|
||
logger.error(f"Script generation failed: {e}")
|
||
if 'response' in locals():
|
||
logger.error(f"Response content: {response.text}")
|
||
return None
|
||
|
||
def _generate_script_doubao(
|
||
self,
|
||
system_prompt: str,
|
||
user_prompt: str,
|
||
image_paths: List[str]
|
||
) -> Dict[str, Any]:
|
||
"""Doubao 脚本生成实现 (Multimodal)"""
|
||
# User Provided: https://ark.cn-beijing.volces.com/api/v3/responses
|
||
# But for 'responses' API, structure is specific. Let's try to match user's curl format exactly but adapting content.
|
||
# User curl uses "input": [{"role": "user", "content": [{"type": "input_image"...}, {"type": "input_text"...}]}]
|
||
|
||
endpoint = "https://ark.cn-beijing.volces.com/api/v3/chat/completions" # Recommend standard Chat API first as 'responses' is usually non-standard or older
|
||
# However, user explicitly provided /responses curl. Let's try to stick to standard Chat Completions first because Doubao Pro 1.5 is OpenAI compatible.
|
||
# If that fails or if user insists on the specific structure, we can adapt.
|
||
# Volcengine 'ep-...' models are usually served via standard /chat/completions.
|
||
|
||
# Let's try standard OpenAI format which Doubao supports perfectly.
|
||
|
||
messages = [
|
||
{"role": "system", "content": system_prompt}
|
||
]
|
||
|
||
user_content = []
|
||
|
||
# Add Images (Doubao Vision supports image_url)
|
||
if image_paths:
|
||
for path in image_paths[:5]: # Limit
|
||
if os.path.exists(path):
|
||
# For Volcengine, need to upload or use base64?
|
||
# Standard OpenAI format supports base64 data urls.
|
||
# "image_url": {"url": "data:image/jpeg;base64,..."}
|
||
try:
|
||
b64_img = self._encode_image(path)
|
||
user_content.append({
|
||
"type": "image_url",
|
||
"image_url": {
|
||
"url": f"data:image/jpeg;base64,{b64_img}"
|
||
}
|
||
})
|
||
except Exception as e:
|
||
logger.warning(f"Failed to encode image for Doubao: {e}")
|
||
|
||
# Add Text
|
||
user_content.append({"type": "text", "text": user_prompt})
|
||
|
||
messages.append({
|
||
"role": "user",
|
||
"content": user_content
|
||
})
|
||
|
||
payload = {
|
||
"model": config.DOUBAO_SCRIPT_MODEL,
|
||
"messages": messages,
|
||
"stream": False,
|
||
# "response_format": {"type": "json_object"} # Try enabling JSON mode if supported
|
||
}
|
||
|
||
headers = {
|
||
"Authorization": f"Bearer {config.VOLC_API_KEY}",
|
||
"Content-Type": "application/json"
|
||
}
|
||
|
||
try:
|
||
# Try standard chat/completions first
|
||
resp = requests.post(endpoint, headers=headers, json=payload, timeout=120)
|
||
|
||
if resp.status_code != 200:
|
||
# If 404, maybe endpoint is wrong, try the user's 'responses' endpoint?
|
||
# But 'responses' usually implies a different payload structure.
|
||
logger.warning(f"Doubao Chat API failed ({resp.status_code}), trying legacy/custom endpoint...")
|
||
# Fallback to user provided structure if needed (implement later if this fails)
|
||
resp.raise_for_status()
|
||
|
||
result = resp.json()
|
||
content_text = result["choices"][0]["message"]["content"]
|
||
|
||
script_json = self._extract_json_from_response(content_text)
|
||
|
||
if script_json is None:
|
||
logger.error(f"Failed to extract JSON from Doubao response: {content_text[:500]}...")
|
||
return None
|
||
|
||
final_script = self._validate_and_fix_script(script_json)
|
||
final_script["_debug"] = {
|
||
"system_prompt": system_prompt,
|
||
"user_prompt": user_prompt,
|
||
"raw_output": content_text,
|
||
"provider": "doubao"
|
||
}
|
||
return final_script
|
||
|
||
except Exception as e:
|
||
logger.error(f"Doubao script generation failed: {e}")
|
||
if 'resp' in locals():
|
||
logger.error(f"Response: {resp.text}")
|
||
return None
|
||
|
||
def _extract_json_from_response(self, text: str) -> Optional[Dict]:
|
||
"""
|
||
从 API 响应中提取 JSON 对象
|
||
支持:
|
||
1. 纯 JSON 响应
|
||
2. Markdown 代码块包裹的 JSON (```json ... ```)
|
||
3. 文本中嵌入的 JSON (找到第一个 { 和最后一个 })
|
||
"""
|
||
import re
|
||
|
||
# 方法1: 尝试直接解析(纯 JSON 情况)
|
||
try:
|
||
return json.loads(text.strip())
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
# 方法2: 提取 ```json ... ``` 代码块
|
||
json_block_match = re.search(r'```json\s*([\s\S]*?)\s*```', text)
|
||
if json_block_match:
|
||
try:
|
||
return json.loads(json_block_match.group(1))
|
||
except json.JSONDecodeError as e:
|
||
logger.warning(f"JSON block found but parse failed: {e}")
|
||
|
||
# 方法3: 提取 ``` ... ``` 代码块 (无 json 标记)
|
||
code_block_match = re.search(r'```\s*([\s\S]*?)\s*```', text)
|
||
if code_block_match:
|
||
try:
|
||
return json.loads(code_block_match.group(1))
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
# 方法4: 找到第一个 { 和最后一个 } 之间的内容
|
||
first_brace = text.find('{')
|
||
last_brace = text.rfind('}')
|
||
if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
|
||
try:
|
||
return json.loads(text[first_brace:last_brace + 1])
|
||
except json.JSONDecodeError as e:
|
||
logger.warning(f"Brace extraction failed: {e}")
|
||
|
||
return None
|
||
|
||
def _build_user_prompt(self, product_name: str, product_info: Dict[str, Any]) -> str:
|
||
# 提取商家偏好提示
|
||
style_hint = product_info.get("style_hint", "")
|
||
# 过滤掉不需要展示的字段
|
||
filtered_info = {k: v for k, v in product_info.items() if k not in ["uploaded_images", "style_hint"]}
|
||
info_str = "\n".join([f"- {k}: {v}" for k, v in filtered_info.items()])
|
||
|
||
prompt = f"""
|
||
商品名称:{product_name}
|
||
商品信息:
|
||
{info_str}
|
||
"""
|
||
if style_hint:
|
||
prompt += f"""
|
||
## 商家特别要求
|
||
{style_hint}
|
||
"""
|
||
prompt += "\n请根据以上信息设计视频脚本。"
|
||
return prompt
|
||
|
||
def _validate_and_fix_script(self, script: Dict[str, Any]) -> Dict[str, Any]:
|
||
"""校验并修复脚本结构"""
|
||
# 简单校验,确保必要字段存在
|
||
if "scenes" not in script:
|
||
script["scenes"] = []
|
||
return script
|