# Repository layout:
# - app.py: Streamlit UI for video generation workflow
# - main_flow.py: CLI tool with argparse support
# - modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
# - config.py: Configuration with API keys and paths
# - requirements.txt: Python dependencies
# - docs/: System prompt documentation
"""
|
||
MatchMe Studio - Brain Module (Multi-stage Analysis & Script Generation)
|
||
"""
|
||
import json
import logging
from typing import Dict, Any, List, Optional

from openai import OpenAI

import config

logger = logging.getLogger(__name__)

# Use Volcengine (Doubao) via OpenAI Compatible Interface.
# One module-level client is shared by every stage below; the API key and
# endpoint are read from config at import time.
client = OpenAI(
    api_key=config.VOLC_API_KEY,
    base_url=config.VOLC_BASE_URL
)

# ============================================================
# Stage 1: Analyze Materials
# ============================================================

# System prompt for analyze_materials(). It asks the model for a strict-JSON
# reply with the keys "analysis", "detected_info", "missing_info",
# "questions" and "ready". The text is sent verbatim, so it must not contain
# anything except the prompt itself.
ANALYZE_SYSTEM_PROMPT = """你是一位资深短视频创作总监,专精TikTok/抖音爆款内容。

任务:深度分析用户提供的素材和需求,识别产品特性、使用场景、目标人群。

分析维度:
1. 产品/服务核心卖点(从素材中提取视觉特征)
2. 视觉风格特征(颜色、质感、包装)
3. 潜在目标受众
4. 内容调性建议

然后检查是否缺少关键信息,如果缺少,生成2-5个问题帮助完善需求。
每个问题必须与短视频创作直接相关。

输出严格JSON格式:
{
"analysis": "详细分析结果,包括从素材中识别到的视觉元素...",
"detected_info": {
"product": "识别到的产品名称和类型",
"visual_features": ["视觉特征1", "视觉特征2"],
"audience": "推测的目标人群",
"style": "推测的风格"
},
"missing_info": ["缺少的信息1", "缺少的信息2"],
"questions": [
{
"id": "q1",
"text": "问题文字(说明为什么这个问题重要)",
"options": ["选项A", "选项B", "选项C"],
"allow_multiple": true,
"allow_custom": true
}
],
"ready": false
}

如果信息足够,ready=true,questions为空数组。
"""


def analyze_materials(
    prompt: str,
    image_urls: Optional[List[str]] = None,
    asr_text: str = ""
) -> Dict[str, Any]:
    """Analyze the user's request and uploaded materials with the vision model.

    Builds one multimodal user message (requirement text, optional ASR
    transcript, optional reference images), sends it together with
    ANALYZE_SYSTEM_PROMPT to ``config.VISION_MODEL_ID`` and decodes the
    reply as JSON.

    Args:
        prompt: The user's free-text requirement.
        image_urls: URLs of uploaded reference images. ``None`` or an empty
            list omits the image section entirely.
        asr_text: ASR transcript of the source video's audio; omitted from
            the request when empty.

    Returns:
        The JSON object decoded from the model reply. The system prompt asks
        for analysis text plus follow-up questions when information is
        missing, but the reply's shape is not validated here.

    Raises:
        json.JSONDecodeError: If the reply (after stripping a Markdown code
            fence) is not valid JSON, including an empty reply.
        Exception: Any error raised by the API client is logged and re-raised.
    """
    logger.info("Brain: Analyzing materials...")

    # Vision-model input format: the user message content is a list of parts
    # (text + image_url).
    content_parts = [{"type": "text", "text": f"用户需求: {prompt}"}]

    if asr_text:
        content_parts.append({"type": "text", "text": f"\n视频原声(ASR转写): {asr_text}"})

    if image_urls:
        content_parts.append({"type": "text", "text": "\n用户上传的素材图片(请仔细分析这些图片中的产品特征):"})
        content_parts.extend(
            {"type": "image_url", "image_url": {"url": url}}
            for url in image_urls
        )

    messages = [
        # Note: Some vision models might not support 'system' role with images well,
        # but Doubao usually follows standard chat structure.
        # If system prompt fails, prepend it to user content.
        {"role": "system", "content": ANALYZE_SYSTEM_PROMPT},
        {"role": "user", "content": content_parts}
    ]

    try:
        # Use Vision Model for Analysis
        response = client.chat.completions.create(
            model=config.VISION_MODEL_ID,
            messages=messages,
            temperature=0.7,
            max_tokens=4000
        )

        # message.content can be None (e.g. filtered or empty completion);
        # treat it as an empty reply so the failure surfaces as a JSON
        # decode error instead of an AttributeError on None.
        content = (response.choices[0].message.content or "").strip()

        # Models often wrap JSON in a Markdown fence (```json ... ```):
        # keep only the text inside the first fence and drop the language tag.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
                if content.startswith("json"):
                    content = content[4:]

        return json.loads(content)

    except Exception as e:
        # logger.exception keeps the same message and adds the traceback.
        logger.exception("Brain Analyze Error: %s", e)
        raise


# ============================================================
# Stage 2: Refine Brief with Answers
# ============================================================

# System prompt for refine_brief(). It asks the model to merge the original
# requirement, the stage-1 analysis and the user's answers into a JSON object
# with the keys "brief", "creative_summary" and "ready". Note that "style"
# lives inside the nested "brief" object.
REFINE_SYSTEM_PROMPT = """你是短视频创作总监。
根据原始需求、AI分析结果、用户补充回答,整合为完整的创意简报。

注意:用户选择的风格偏好(如ASMR、剧情、视觉流等)必须作为核心创作方向贯穿整个简报。

输出JSON:
{
"brief": {
"product": "产品名称",
"product_visual_description": "产品视觉描述(颜色、形状、包装、质感等,用于后续图片生成)",
"selling_points": ["卖点1", "卖点2"],
"target_audience": "目标人群",
"platform": "投放平台",
"style": "视频风格(必须明确,如ASMR/剧情/视觉流等)",
"style_requirements": "该风格的具体创作要求(如ASMR需要:开盖声、质感特写、无人脸等)",
"creativity_level": "创意程度",
"reference": "对标账号/竞品",
"user_assets_description": "用户上传素材的描述(用于后续继承)"
},
"creative_summary": "整体创意概述(50字以内,描述这个视频的核心创意方向)",
"ready": true
}
"""


def refine_brief(
    original_prompt: str,
    analysis: Dict[str, Any],
    answers: Dict[str, Any],
    image_urls: Optional[List[str]] = None
) -> Dict[str, Any]:
    """Integrate the user's answers into a complete creative brief.

    Sends the original requirement, the stage-1 analysis, the user's answers
    and the uploaded asset URLs (as text, not as images) to
    ``config.BRAIN_MODEL_ID`` with REFINE_SYSTEM_PROMPT and decodes the reply
    as JSON.

    Args:
        original_prompt: The user's original free-text requirement.
        analysis: Analysis result to include; serialized with ``json.dumps``.
        answers: The user's answers to the follow-up questions; serialized
            with ``json.dumps``.
        image_urls: URLs of uploaded assets; ``None`` is sent as an empty list.

    Returns:
        The JSON object decoded from the model reply. The system prompt asks
        for ``brief``, ``creative_summary`` and ``ready`` keys, but the
        reply's shape is not validated here.

    Raises:
        json.JSONDecodeError: If the reply (after stripping a Markdown code
            fence) is not valid JSON, including an empty reply.
        Exception: Any error raised by the API client is logged and re-raised.
    """
    logger.info("Brain: Refining brief with answers...")

    # Sections are separated by blank lines; the message starts and ends with
    # a newline, matching the original triple-quoted template.
    user_content = (
        f"\n原始需求: {original_prompt}\n"
        f"\nAI分析结果: {json.dumps(analysis, ensure_ascii=False)}\n"
        f"\n用户补充回答: {json.dumps(answers, ensure_ascii=False)}\n"
        f"\n用户上传的素材URL: {json.dumps(image_urls or [], ensure_ascii=False)}\n"
    )

    try:
        # Use Text LLM for reasoning/refining if no new images involved
        # But to keep it simple, we can stick to BRAIN_MODEL_ID (Doubao Pro)
        response = client.chat.completions.create(
            model=config.BRAIN_MODEL_ID,
            messages=[
                {"role": "system", "content": REFINE_SYSTEM_PROMPT},
                {"role": "user", "content": user_content}
            ],
            temperature=0.5,
            max_tokens=3000
        )

        # message.content can be None; treat it as an empty reply so the
        # failure surfaces as a JSON decode error, not an AttributeError.
        content = (response.choices[0].message.content or "").strip()

        # Strip a Markdown fence (```json ... ```) if the model added one.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
                if content.startswith("json"):
                    content = content[4:]

        return json.loads(content)

    except Exception as e:
        # logger.exception keeps the same message and adds the traceback.
        logger.exception("Brain Refine Error: %s", e)
        raise


# ============================================================
# Stage 3: Generate Script
# ============================================================

# System prompt template for generate_script(). "{style}" is a placeholder
# substituted with str.replace (NOT str.format): the template also contains
# literal JSON braces that str.format would choke on.
SCRIPT_SYSTEM_PROMPT = """你是顶级短视频编导,专精{style}风格内容创作。

根据创意简报生成爆款脚本。必须严格遵循用户选择的风格要求。

脚本结构要求:
1. creative_summary: 整体创意概述(这条视频的核心创意是什么)
2. hook: 前3秒钩子设计(必须抓眼球,符合{style}风格)
3. scenes: 3-8个分镜
4. cta: 结尾行动号召(纯文本字符串)

每个分镜(scene)必须包含:
- id: 分镜编号
- duration: 时长(5/10/15秒,符合视频模型参数)
- timeline: 时间轴 (如 "0:00-0:05")
- image_prompt: 【关键】用于AI生图的详细英文prompt,必须包含:
* 产品的具体视觉描述(继承自brief中的product_visual_description)
* 8k, hyper-realistic, cinematic lighting
* 色调、环境、构图、焦点
* 风格要求(如ASMR需要:macro shot, satisfying texture, no human face)
- keyframe: {
"color_tone": "色调",
"environment": "环境/背景",
"foreground": "前景元素",
"focus": "视觉焦点",
"subject": "主体描述",
"composition": "构图方式"
}
- camera_movement: 运镜描述(如:slow zoom in, pan left, static)
- story_beat: 这个分镜在整体故事中的作用
- voiceover: 旁白文字({style}风格,如ASMR应简短或无旁白,用音效代替)
- sound_design: 音效设计(如:开盖声、水滴声、环境白噪音)
- rhythm: {"change": "保持/加快/放慢", "multiplier": 1.0}

旁白要求:
- 必须连贯,形成完整的叙事
- 符合{style}风格(ASMR风格应极简或无旁白)
- 每句旁白要能独立成句,但连起来是完整故事

输出严格JSON格式。
"""


def generate_script(
    brief: Dict[str, Any],
    image_urls: Optional[List[str]] = None,
    regenerate_feedback: str = ""
) -> Dict[str, Any]:
    """Generate the complete video script (with scenes) from a creative brief.

    Substitutes the brief's style into SCRIPT_SYSTEM_PROMPT, sends the brief
    (plus optional user feedback and reference images) to
    ``config.VISION_MODEL_ID`` and decodes the reply as JSON.

    Args:
        brief: Creative brief; its top-level ``"style"`` value selects the
            style named in the system prompt.
        image_urls: URLs of reference images; ``None`` or an empty list omits
            the image section.
        regenerate_feedback: User feedback to steer a regeneration; omitted
            from the request when empty.

    Returns:
        The JSON object decoded from the model reply; its shape is not
        validated here.

    Raises:
        json.JSONDecodeError: If the reply (after stripping a Markdown code
            fence) is not valid JSON, including an empty reply.
        Exception: Any error raised by the API client is logged and re-raised.
    """
    logger.info("Brain: Generating script...")

    # Fall back to the default when "style" is missing, None or empty, and
    # coerce to str so str.replace cannot fail on a non-string value.
    # NOTE(review): REFINE_SYSTEM_PROMPT nests "style" under a "brief" key;
    # confirm callers pass that inner object, otherwise the default is used.
    style = str(brief.get("style") or "现代广告")
    system_prompt = SCRIPT_SYSTEM_PROMPT.replace("{style}", style)

    content_parts = [{"type": "text", "text": f"创意简报: {json.dumps(brief, ensure_ascii=False)}"}]

    if regenerate_feedback:
        content_parts.append({"type": "text", "text": f"\n用户反馈(请据此调整): {regenerate_feedback}"})

    if image_urls:
        content_parts.append({"type": "text", "text": "\n用户上传的参考素材(生成的image_prompt必须参考这些素材中的产品外观):"})
        content_parts.extend(
            {"type": "image_url", "image_url": {"url": url}}
            for url in image_urls
        )

    try:
        response = client.chat.completions.create(
            model=config.VISION_MODEL_ID,  # Use Vision model to see reference images if available
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": content_parts}
            ],
            temperature=0.8,
            max_tokens=8000
        )

        # message.content can be None; treat it as an empty reply so the
        # failure surfaces as a JSON decode error, not an AttributeError.
        content = (response.choices[0].message.content or "").strip()

        # Strip a Markdown fence (```json ... ```) if the model added one.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
                if content.startswith("json"):
                    content = content[4:]

        return json.loads(content)

    except Exception as e:
        # logger.exception keeps the same message and adds the traceback.
        logger.exception("Brain Script Error: %s", e)
        raise


# ============================================================
# Stage 4: Regenerate Single Scene
# ============================================================

def regenerate_scene(
    full_script: Dict[str, Any],
    scene_id: int,
    feedback: str,
    brief: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Regenerate a single scene of an existing script based on user feedback.

    Sends the full script, the optional brief, the target scene id and the
    feedback to ``config.BRAIN_MODEL_ID`` and decodes the reply as JSON.

    Args:
        full_script: The complete current script; serialized with
            ``json.dumps`` so the model can keep the new scene consistent.
        scene_id: Identifier of the scene to regenerate.
        feedback: The user's feedback on that scene.
        brief: Optional creative brief; its top-level ``"style"`` selects the
            style named in the system prompt. When falsy, "无" is sent in
            place of the brief and the default style is used.

    Returns:
        The JSON object decoded from the model reply (the prompt asks for the
        new scene object only); its shape is not validated here.

    Raises:
        json.JSONDecodeError: If the reply (after stripping a Markdown code
            fence) is not valid JSON, including an empty reply.
        Exception: Any error raised by the API client is logged and re-raised.
    """
    logger.info(f"Brain: Regenerating scene {scene_id}...")

    # Fall back to the default when there is no brief or its "style" is
    # missing, None or empty, so the prompt never reads "专精None风格".
    style = (brief.get("style") if brief else None) or "现代广告"

    system_prompt = (
        f"你是短视频编导,专精{style}风格。根据用户反馈重新生成指定分镜。\n"
        "保持与其他分镜的风格连贯性。\n"
        "image_prompt必须继承产品的视觉描述。\n"
        "只输出新的scene对象(JSON)。\n"
    )

    brief_text = json.dumps(brief, ensure_ascii=False) if brief else "无"

    # Sections are separated by blank lines; the message starts and ends with
    # a newline, matching the original triple-quoted template.
    user_content = (
        f"\n完整脚本: {json.dumps(full_script, ensure_ascii=False)}\n"
        f"\n创意简报: {brief_text}\n"
        f"\n需要重新生成的分镜ID: {scene_id}\n"
        f"\n用户反馈: {feedback}\n"
    )

    try:
        response = client.chat.completions.create(
            model=config.BRAIN_MODEL_ID,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content}
            ],
            temperature=0.8,
            max_tokens=2000
        )

        # message.content can be None; treat it as an empty reply so the
        # failure surfaces as a JSON decode error, not an AttributeError.
        content = (response.choices[0].message.content or "").strip()

        # Strip a Markdown fence (```json ... ```) if the model added one.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
                if content.startswith("json"):
                    content = content[4:]

        return json.loads(content)

    except Exception as e:
        # logger.exception keeps the same message and adds the traceback.
        logger.exception("Brain Regenerate Scene Error: %s", e)
        raise