feat: video-flow initial commit
- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
This commit is contained in:
390
modules/script_gen.py
Normal file
390
modules/script_gen.py
Normal file
@@ -0,0 +1,390 @@
|
||||
"""
|
||||
脚本生成模块 (Gemini-3-Pro)
|
||||
负责解析商品信息,生成分镜脚本
|
||||
"""
|
||||
import base64
import json
import logging
import mimetypes
import os
import re
from pathlib import Path
from typing import Dict, Any, List, Optional

import requests

import config
from modules.db_manager import db
|
||||
|
||||
# Module-level logger, named after this module; used by every method of ScriptGenerator.
logger = logging.getLogger(__name__)
|
||||
|
||||
class ScriptGenerator:
    """Storyboard script generator.

    Builds a prompt from product information and reference images, sends it
    to a multimodal LLM (the Shubiaobiao Gemini endpoint by default, or
    Doubao) and parses the JSON storyboard script out of the model's reply.
    """

    # Markdown code fence explicitly tagged as JSON.
    _JSON_FENCE_RE = re.compile(r'```json\s*([\s\S]*?)\s*```')
    # Any Markdown code fence (no language tag required).
    _ANY_FENCE_RE = re.compile(r'```\s*([\s\S]*?)\s*```')

    # MIME type used when the image type cannot be guessed from the file name.
    _FALLBACK_IMAGE_MIME = "image/jpeg"

    # Built-in system prompt; generate_script() prefers the
    # "prompt_script_gen" value from the database when one is configured.
    _DEFAULT_SYSTEM_PROMPT = """
你是一个专业的抖音电商短视频导演。请根据提供的商品信息和图片,设计一个高转化率的商品详情页首图视频脚本。

## 目标
- 提升商品详情页的 GPM 和下单转化率
- 视频时长 9-12 秒 (由 3-4 个分镜组成)
- **每个分镜时长固定为 3 秒** (duration: 3),不要超过 3 秒
- 必须包含:目标人群分析、卖点提炼、分镜设计

## 分镜设计原则
1. **单分镜单主体**:每个分镜聚焦一个视觉主体或动作,避免复杂运镜,因为 AI 生视频在长时间(>3秒)容易出现画面异常。
2. **旁白跨分镜**:一段完整的旁白/卖点可以跨越多个分镜。在 voiceover_timeline 中,通过 start_time 和 duration (秒) 控制旁白的绝对时间位置,无需与分镜一一对应。
3. **节奏感**:分镜之间保持视觉连贯,通过景别变化(特写 -> 中景 -> 全景)制造节奏。
4. **语速控制**:旁白语速约 4 字/秒,12字旁白约需 3 秒。

## 输出格式要求 (JSON)
必须严格遵守以下 JSON 结构:
{
"product_name": "商品名称",
"visual_anchor": "商品视觉锚点:材质+颜色+形状+包装特征(用于保持生图一致性)",
"selling_points": ["卖点1", "卖点2"],
"target_audience": "目标人群描述",
"video_style": "视频风格关键词",
"bgm_style": "BGM风格关键词",
"voiceover_timeline": [
{
"id": 1,
"text": "旁白文案片段1(可横跨多个分镜)",
"subtitle": "字幕文案1 (简短有力)",
"start_time": 0.0,
"duration": 3.0
},
{
"id": 2,
"text": "旁白文案片段2",
"subtitle": "字幕文案2",
"start_time": 3.5,
"duration": 2.5
}
],
"scenes": [
{
"id": 1,
"duration": 3,
"visual_prompt": "详细的画面描述,用于AI生图,包含主体、背景、构图、光影。英文描述。",
"video_prompt": "详细的动效描述,用于AI图生视频。英文描述。",
"fancy_text": {
"text": "花字文案 (最多6字)",
"style": "highlight",
"position": "center",
"start_time": 0.5,
"duration": 2.0
}
}
]
}

## 注意事项
1. **visual_prompt**:
- 必须是英文。
- 描述要具体,例如 "Close-up shot of a hair clip, soft lighting, minimalist background".
- **CRITICAL**: 禁止 AI 额外生成装饰性文字、标语、水印。但必须保留商品包装自带的文字和 Logo(这是商品真实外观的一部分)。
- 正确写法: "Product front view, keep original packaging design --no added text --no watermarks"
- **EMPHASIS**: Strictly follow the appearance of the product in the reference images.
2. **video_prompt**: 必须是英文,描述动作,例如 "Slow zoom in, the hair clip rotates slightly"。注意保持动作简单,避免复杂运镜和人体动作。
3. **voiceover_timeline**:
- 这是整个视频的旁白和字幕时间轴,独立于分镜。
- `start_time` 是旁白开始的绝对时间 (秒),`duration` 是旁白持续时长 (秒)。
- **一段旁白可以横跨多个分镜**,例如:总时长 9 秒 (3 个分镜),一段旁白从 start_time=0,duration=5,则覆盖前两个分镜。
- 两段旁白之间留 0.3-0.5 秒间隙(气口)。
4. **fancy_text**:
- 花字要精简(最多 6 字),突出卖点。
- **Style Selection**:
- `highlight`: 默认样式,适合通用卖点 (Yellow/Black)。
- `warning`: 强调痛点或食欲 (Red/White)。
- `price`: 价格显示 (Big Red)。
- `bubble`: 旁白补充或用户评价 (Bubble)。
- `minimal`: 高级感,适合时尚类 (Thin/White)。
- `tech`: 数码类 (Cyan/Glow)。
- `position` 默认 `center`,可选 top/bottom/top-left/bottom-right 等。
5. **场景连贯性**: 确保分镜之间的逻辑和视觉风格连贯。每个分镜 duration 必须为 3。
"""

    def __init__(self):
        """Read the API key from ``config`` and set the endpoint and default prompt."""
        self.api_key = config.SHUBIAOBIAO_KEY
        # NOTE: the API address may need adapting to the exact path of
        # gemini-3-pro-preview. Per the demo the full URL is
        # https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent,
        # i.e. the base URL is assumed to be v1beta/models/.
        self.endpoint = "https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent"

        # Default system prompt (kept as an instance attribute for callers).
        self.default_system_prompt = self._DEFAULT_SYSTEM_PROMPT

    @staticmethod
    def _guess_image_mime(image_path: str) -> str:
        """Return the image MIME type guessed from *image_path*'s extension.

        Falls back to ``image/jpeg`` when the extension is unknown or does
        not map to an ``image/*`` type.
        """
        guessed, _ = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        return ScriptGenerator._FALLBACK_IMAGE_MIME

    def _encode_image(self, image_path: str) -> str:
        """Read the file at *image_path* and return its content as a Base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _finalize_script(
        self,
        script_json: Dict[str, Any],
        system_prompt: str,
        user_prompt: str,
        raw_output: str,
        provider: str,
    ) -> Dict[str, Any]:
        """Validate *script_json* and attach the ``_debug`` section (prompts, raw output, provider)."""
        final_script = self._validate_and_fix_script(script_json)
        final_script["_debug"] = {
            "system_prompt": system_prompt,
            "user_prompt": user_prompt,
            "raw_output": raw_output,
            "provider": provider,
        }
        return final_script

    def generate_script(
        self,
        product_name: str,
        product_info: Dict[str, Any],
        image_paths: Optional[List[str]] = None,
        model_provider: str = "shubiaobiao",  # "shubiaobiao" or "doubao"
    ) -> Optional[Dict[str, Any]]:
        """Generate a storyboard script for a product.

        Args:
            product_name: Product name, placed in the user prompt.
            product_info: Product attributes; ``style_hint`` and
                ``uploaded_images`` get special treatment in the user prompt.
            image_paths: Optional reference image paths. Paths that do not
                exist are skipped; at most 10 are sent on the default path.
            model_provider: ``"doubao"`` routes to the Doubao implementation;
                any other value uses the Shubiaobiao Gemini endpoint.

        Returns:
            The parsed script dict with an added ``_debug`` entry, or ``None``
            when the request fails or no JSON object can be extracted.
        """
        logger.info(f"Generating script for: {product_name} (Provider: {model_provider})")

        # 1. Build the prompts (a database-configured system prompt wins).
        system_prompt = db.get_config("prompt_script_gen", self.default_system_prompt)
        user_prompt = self._build_user_prompt(product_name, product_info)

        if model_provider == "doubao":
            return self._generate_script_doubao(system_prompt, user_prompt, image_paths)

        # Debug aid: record whether a custom prompt is in use.
        if system_prompt != self.default_system_prompt:
            logger.info("Using CUSTOM system prompt from database")
        else:
            logger.info("Using DEFAULT system prompt")

        # 2. Build the request payload. The system prompt is sent as the
        #    first text part of the user message, followed by the user
        #    prompt and then the images.
        user_parts = [{"text": system_prompt}, {"text": user_prompt}]
        for path in (image_paths or [])[:10]:  # cap at 10 images
            if not Path(path).exists():
                continue
            try:
                b64_img = self._encode_image(path)
            except OSError as e:
                # Best effort: an unreadable image must not abort the request.
                logger.warning(f"Failed to encode image {path}: {e}")
                continue
            user_parts.append({
                "inline_data": {
                    "mime_type": self._guess_image_mime(path),
                    "data": b64_img,
                }
            })

        payload = {
            "contents": [{"role": "user", "parts": user_parts}],
            "generationConfig": {
                "response_mime_type": "application/json",
                "temperature": 0.7,
            },
        }
        headers = {
            "x-goog-api-key": self.api_key,
            "Content-Type": "application/json",
        }

        # 3. Call the API. `response` is pre-declared so the error handler
        #    can tell whether a response was ever received.
        response = None
        try:
            response = requests.post(self.endpoint, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            result = response.json()

            # 4. Parse the result.
            if "candidates" not in result or not result["candidates"]:
                logger.error(f"No candidates in response: {result}")
                return None

            content_text = result["candidates"][0]["content"]["parts"][0]["text"]

            # The reply may be pure JSON, fenced JSON, or JSON inside prose.
            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error(f"Failed to extract JSON from response: {content_text[:500]}...")
                return None

            return self._finalize_script(
                script_json, system_prompt, user_prompt, content_text, "shubiaobiao"
            )

        except Exception as e:
            # Boundary handler: callers expect None on any failure.
            logger.error(f"Script generation failed: {e}")
            if response is not None:
                logger.error(f"Response content: {response.text}")
            return None

    def _generate_script_doubao(
        self,
        system_prompt: str,
        user_prompt: str,
        image_paths: Optional[List[str]]
    ) -> Optional[Dict[str, Any]]:
        """Generate the script through Doubao (multimodal).

        Uses the OpenAI-style ``/chat/completions`` endpoint rather than the
        ``/api/v3/responses`` endpoint, which expects a different payload
        shape (``input`` with ``input_image`` / ``input_text`` items).

        Args:
            system_prompt: System message content.
            user_prompt: Text appended after the images in the user message.
            image_paths: Optional reference image paths; paths that do not
                exist are skipped and at most 5 are sent.

        Returns:
            The parsed script dict with an added ``_debug`` entry, or ``None``
            on any failure.
        """
        endpoint = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"

        # Images go first as base64 data URLs, followed by the text prompt.
        user_content = []
        for path in (image_paths or [])[:5]:  # cap at 5 images
            if not Path(path).exists():
                continue
            try:
                b64_img = self._encode_image(path)
            except OSError as e:
                logger.warning(f"Failed to encode image for Doubao: {e}")
                continue
            user_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:{self._guess_image_mime(path)};base64,{b64_img}"
                },
            })
        user_content.append({"type": "text", "text": user_prompt})

        payload = {
            "model": config.DOUBAO_SCRIPT_MODEL,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            "stream": False,
            # NOTE: JSON mode ("response_format": {"type": "json_object"}) is
            # deliberately not enabled; support is unconfirmed.
        }
        headers = {
            "Authorization": f"Bearer {config.VOLC_API_KEY}",
            "Content-Type": "application/json",
        }

        resp = None
        try:
            resp = requests.post(endpoint, headers=headers, json=payload, timeout=120)

            if resp.status_code != 200:
                # NOTE(review): the message mentions a fallback endpoint, but
                # none is implemented yet; error statuses raise here.
                logger.warning(f"Doubao Chat API failed ({resp.status_code}), trying legacy/custom endpoint...")
                resp.raise_for_status()

            result = resp.json()
            content_text = result["choices"][0]["message"]["content"]

            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error(f"Failed to extract JSON from Doubao response: {content_text[:500]}...")
                return None

            return self._finalize_script(
                script_json, system_prompt, user_prompt, content_text, "doubao"
            )

        except Exception as e:
            # Boundary handler: callers expect None on any failure.
            logger.error(f"Doubao script generation failed: {e}")
            if resp is not None:
                logger.error(f"Response: {resp.text}")
            return None

    @staticmethod
    def _loads_object(candidate: str) -> Optional[Dict]:
        """Parse *candidate* as JSON and return it only if it is a JSON object.

        Returns ``None`` for valid JSON that is not an object (array, number,
        string, ...). Raises ``json.JSONDecodeError`` for invalid JSON.
        """
        parsed = json.loads(candidate)
        return parsed if isinstance(parsed, dict) else None

    def _extract_json_from_response(self, text: str) -> Optional[Dict]:
        """Extract a JSON object from an API response text.

        Strategies are tried in order and the first one that yields a JSON
        object wins:

        1. the whole text is JSON;
        2. a Markdown block fenced with ```json;
        3. a Markdown block fenced with plain ```;
        4. the span from the first ``{`` to the last ``}``.

        Returns:
            The parsed dict, or ``None`` if no strategy produces a JSON
            object. Non-object JSON (list, number, ...) is never returned.
        """
        if not text:
            return None

        # Strategy 1: the response is pure JSON.
        try:
            script = self._loads_object(text.strip())
            if script is not None:
                return script
        except json.JSONDecodeError:
            pass

        # Strategy 2: ```json ... ``` fenced block.
        json_block_match = self._JSON_FENCE_RE.search(text)
        if json_block_match:
            try:
                script = self._loads_object(json_block_match.group(1))
                if script is not None:
                    return script
            except json.JSONDecodeError as e:
                logger.warning(f"JSON block found but parse failed: {e}")

        # Strategy 3: ``` ... ``` fenced block without a language tag.
        code_block_match = self._ANY_FENCE_RE.search(text)
        if code_block_match:
            try:
                script = self._loads_object(code_block_match.group(1))
                if script is not None:
                    return script
            except json.JSONDecodeError:
                pass

        # Strategy 4: everything between the first '{' and the last '}'.
        first_brace = text.find('{')
        last_brace = text.rfind('}')
        if first_brace != -1 and last_brace > first_brace:
            try:
                return self._loads_object(text[first_brace:last_brace + 1])
            except json.JSONDecodeError as e:
                logger.warning(f"Brace extraction failed: {e}")

        return None

    def _build_user_prompt(self, product_name: str, product_info: Dict[str, Any]) -> str:
        """Build the user prompt from the product name and attributes.

        ``uploaded_images`` and ``style_hint`` are left out of the attribute
        list; a non-empty ``style_hint`` is added as its own section instead.
        A ``None`` *product_info* is treated as empty.
        """
        product_info = product_info or {}

        # Merchant preference hint, rendered as a dedicated section below.
        style_hint = product_info.get("style_hint", "")

        # Fields that must not appear in the attribute list.
        hidden_fields = ("uploaded_images", "style_hint")
        info_str = "\n".join(
            f"- {k}: {v}" for k, v in product_info.items() if k not in hidden_fields
        )

        prompt = f"\n商品名称:{product_name}\n商品信息:\n{info_str}\n"
        if style_hint:
            prompt += f"\n## 商家特别要求\n{style_hint}\n"
        prompt += "\n请根据以上信息设计视频脚本。"
        return prompt

    def _validate_and_fix_script(self, script: Dict[str, Any]) -> Dict[str, Any]:
        """Ensure required fields exist; *script* is fixed in place and returned.

        Currently only guarantees that ``scenes`` is present: a missing or
        ``None`` value is replaced by an empty list.
        """
        if script.get("scenes") is None:
            script["scenes"] = []
        return script
|
||||
Reference in New Issue
Block a user