feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
This commit is contained in:
Tony Zhang
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions

346
modules/brain.py Normal file
View File

@@ -0,0 +1,346 @@
"""
MatchMe Studio - Brain Module (Multi-stage Analysis & Script Generation)
"""
import json
import logging
from typing import Dict, Any, List, Optional
from openai import OpenAI
import config
logger = logging.getLogger(__name__)

# Shared chat client: Volcengine (Doubao) is reached through the
# OpenAI-compatible SDK, with key and endpoint taken from config.
client = OpenAI(
    api_key=config.VOLC_API_KEY,
    base_url=config.VOLC_BASE_URL,
)
# ============================================================
# Stage 1: Analyze Materials
# ============================================================
# System prompt sent by analyze_materials(). It casts the model as a
# short-video creative director, asks it to analyze the user's materials and,
# when key information is missing, to produce 2-5 follow-up questions.
# The reply is pinned to a JSON object with the keys "analysis",
# "detected_info", "missing_info", "questions" and "ready"; when enough
# information is available the model is told to set ready=true and return an
# empty "questions" array.
# NOTE: this text is sent to the model verbatim, so any edit changes behavior.
ANALYZE_SYSTEM_PROMPT = """你是一位资深短视频创作总监专精TikTok/抖音爆款内容。
任务:深度分析用户提供的素材和需求,识别产品特性、使用场景、目标人群。
分析维度:
1. 产品/服务核心卖点(从素材中提取视觉特征)
2. 视觉风格特征(颜色、质感、包装)
3. 潜在目标受众
4. 内容调性建议
然后检查是否缺少关键信息如果缺少生成2-5个问题帮助完善需求。
每个问题必须与短视频创作直接相关。
输出严格JSON格式
{
"analysis": "详细分析结果,包括从素材中识别到的视觉元素...",
"detected_info": {
"product": "识别到的产品名称和类型",
"visual_features": ["视觉特征1", "视觉特征2"],
"audience": "推测的目标人群",
"style": "推测的风格"
},
"missing_info": ["缺少的信息1", "缺少的信息2"],
"questions": [
{
"id": "q1",
"text": "问题文字(说明为什么这个问题重要)",
"options": ["选项A", "选项B", "选项C"],
"allow_multiple": true,
"allow_custom": true
}
],
"ready": false
}
如果信息足够ready=truequestions为空数组。
"""
def analyze_materials(
    prompt: str,
    image_urls: Optional[List[str]] = None,
    asr_text: str = ""
) -> Dict[str, Any]:
    """
    Analyze the user's request and uploaded materials with the vision model.

    Builds one multi-part user message (request text, optional ASR
    transcript, optional image URLs), sends it together with
    ANALYZE_SYSTEM_PROMPT to ``config.VISION_MODEL_ID`` and decodes the
    reply as JSON.

    Args:
        prompt: The user's free-text requirement.
        image_urls: URLs of uploaded reference images; skipped when None or
            empty.
        asr_text: Transcript of the source video's audio; skipped when empty.

    Returns:
        The decoded JSON reply. ANALYZE_SYSTEM_PROMPT asks the model for the
        keys "analysis", "detected_info", "missing_info", "questions" and
        "ready", but the structure is not validated here.

    Raises:
        json.JSONDecodeError: If no JSON value can be recovered from the reply.
        Exception: Any other failure (e.g. from the API call) is logged and
            re-raised unchanged.
    """
    logger.info("Brain: Analyzing materials...")

    # Vision-style message: the user turn is a list of typed parts
    # (text / image_url) instead of one plain string.
    content_parts = [{"type": "text", "text": f"用户需求: {prompt}"}]
    if asr_text:
        content_parts.append({"type": "text", "text": f"\n视频原声(ASR转写): {asr_text}"})
    if image_urls:
        content_parts.append({"type": "text", "text": "\n用户上传的素材图片(请仔细分析这些图片中的产品特征):"})
        content_parts.extend(
            {"type": "image_url", "image_url": {"url": url}}
            for url in image_urls
        )

    messages = [
        # Some vision models may handle the 'system' role poorly when images
        # are attached; if the system prompt is ignored, prepend it to the
        # user content instead.
        {"role": "system", "content": ANALYZE_SYSTEM_PROMPT},
        {"role": "user", "content": content_parts},
    ]

    try:
        response = client.chat.completions.create(
            model=config.VISION_MODEL_ID,
            messages=messages,
            temperature=0.7,
            max_tokens=4000,
        )
        reply = response.choices[0].message.content.strip()

        # Fast path: reply is a ```json ... ``` fenced block; keep only the
        # text between the first pair of fences and drop the "json" tag.
        candidate = reply
        if candidate.startswith("```"):
            candidate = candidate.split("```")[1]
            if candidate.startswith("json"):
                candidate = candidate[4:]
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Fallback for replies the fence logic cannot handle (prose
            # around the JSON, a fence tag other than lowercase "json",
            # ``` inside a JSON string): parse the span from the first "{"
            # to the last "}" of the raw reply.
            start, end = reply.find("{"), reply.rfind("}")
            if start == -1 or end < start:
                raise
            return json.loads(reply[start:end + 1])
    except Exception as e:
        logger.error("Brain Analyze Error: %s", e)
        raise
# ============================================================
# Stage 2: Refine Brief with Answers
# ============================================================
# System prompt sent by refine_brief(). It asks the model to merge the
# original request, the earlier analysis and the user's answers into one
# creative brief, keeping the user's chosen style as the core direction.
# The reply is pinned to a JSON object with a nested "brief" object (product,
# selling points, audience, platform, style, ...), a "creative_summary" and
# "ready".
# NOTE: this text is sent to the model verbatim, so any edit changes behavior.
REFINE_SYSTEM_PROMPT = """你是短视频创作总监。
根据原始需求、AI分析结果、用户补充回答整合为完整的创意简报。
注意用户选择的风格偏好如ASMR、剧情、视觉流等必须作为核心创作方向贯穿整个简报。
输出JSON:
{
"brief": {
"product": "产品名称",
"product_visual_description": "产品视觉描述(颜色、形状、包装、质感等,用于后续图片生成)",
"selling_points": ["卖点1", "卖点2"],
"target_audience": "目标人群",
"platform": "投放平台",
"style": "视频风格必须明确如ASMR/剧情/视觉流等)",
"style_requirements": "该风格的具体创作要求如ASMR需要开盖声、质感特写、无人脸等",
"creativity_level": "创意程度",
"reference": "对标账号/竞品",
"user_assets_description": "用户上传素材的描述(用于后续继承)"
},
"creative_summary": "整体创意概述50字以内描述这个视频的核心创意方向",
"ready": true
}
"""
def refine_brief(
    original_prompt: str,
    analysis: Dict[str, Any],
    answers: Dict[str, Any],
    image_urls: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Merge the user's answers into a complete creative brief.

    Serializes the original request, the earlier analysis, the user's
    answers and the uploaded image URLs into one text message, sends it with
    REFINE_SYSTEM_PROMPT to ``config.BRAIN_MODEL_ID`` and decodes the reply
    as JSON. Images are passed as URLs in text only, not as image parts.

    Args:
        original_prompt: The user's initial free-text requirement.
        analysis: Result of the analysis stage; JSON-serialized into the
            message as-is.
        answers: The user's answers to the follow-up questions;
            JSON-serialized into the message as-is.
        image_urls: URLs of uploaded materials; None is sent as an empty list.

    Returns:
        The decoded JSON reply. REFINE_SYSTEM_PROMPT asks for the keys
        "brief", "creative_summary" and "ready", but the structure is not
        validated here.

    Raises:
        json.JSONDecodeError: If no JSON value can be recovered from the reply.
        Exception: Any other failure (e.g. from the API call) is logged and
            re-raised unchanged.
    """
    logger.info("Brain: Refining brief with answers...")

    analysis_json = json.dumps(analysis, ensure_ascii=False)
    answers_json = json.dumps(answers, ensure_ascii=False)
    urls_json = json.dumps(image_urls or [], ensure_ascii=False)
    user_content = f"""
原始需求: {original_prompt}
AI分析结果: {analysis_json}
用户补充回答: {answers_json}
用户上传的素材URL: {urls_json}
"""

    try:
        # No image parts are sent at this stage, so the text model
        # (BRAIN_MODEL_ID) is used instead of the vision model.
        response = client.chat.completions.create(
            model=config.BRAIN_MODEL_ID,
            messages=[
                {"role": "system", "content": REFINE_SYSTEM_PROMPT},
                {"role": "user", "content": user_content},
            ],
            temperature=0.5,
            max_tokens=3000,
        )
        reply = response.choices[0].message.content.strip()

        # Fast path: reply is a ```json ... ``` fenced block; keep only the
        # text between the first pair of fences and drop the "json" tag.
        candidate = reply
        if candidate.startswith("```"):
            candidate = candidate.split("```")[1]
            if candidate.startswith("json"):
                candidate = candidate[4:]
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Fallback for replies the fence logic cannot handle (prose
            # around the JSON, a fence tag other than lowercase "json",
            # ``` inside a JSON string): parse the span from the first "{"
            # to the last "}" of the raw reply.
            start, end = reply.find("{"), reply.rfind("}")
            if start == -1 or end < start:
                raise
            return json.loads(reply[start:end + 1])
    except Exception as e:
        logger.error("Brain Refine Error: %s", e)
        raise
# ============================================================
# Stage 3: Generate Script
# ============================================================
# System prompt template used by generate_script(). Every "{style}" marker is
# filled in with str.replace(), not str.format(): the text also contains
# literal JSON braces (the keyframe and rhythm examples) that format() would
# try to interpret as fields.
# It asks the model for a script with "creative_summary", "hook", 3-8
# "scenes" and a "cta", and lists the fields each scene must carry
# (id, duration, timeline, image_prompt, keyframe, camera_movement,
# story_beat, voiceover, sound_design, rhythm).
# NOTE: this text is sent to the model verbatim, so any edit changes behavior.
SCRIPT_SYSTEM_PROMPT = """你是顶级短视频编导,专精{style}风格内容创作。
根据创意简报生成爆款脚本。必须严格遵循用户选择的风格要求。
脚本结构要求:
1. creative_summary: 整体创意概述(这条视频的核心创意是什么)
2. hook: 前3秒钩子设计必须抓眼球符合{style}风格)
3. scenes: 3-8个分镜
4. cta: 结尾行动号召(纯文本字符串)
每个分镜(scene)必须包含:
- id: 分镜编号
- duration: 时长(5/10/15秒符合视频模型参数)
- timeline: 时间轴 (如 "0:00-0:05")
- image_prompt: 【关键】用于AI生图的详细英文prompt必须包含
* 产品的具体视觉描述继承自brief中的product_visual_description
* 8k, hyper-realistic, cinematic lighting
* 色调、环境、构图、焦点
* 风格要求如ASMR需要macro shot, satisfying texture, no human face
- keyframe: {
"color_tone": "色调",
"environment": "环境/背景",
"foreground": "前景元素",
"focus": "视觉焦点",
"subject": "主体描述",
"composition": "构图方式"
}
- camera_movement: 运镜描述slow zoom in, pan left, static
- story_beat: 这个分镜在整体故事中的作用
- voiceover: 旁白文字({style}风格如ASMR应简短或无旁白用音效代替
- sound_design: 音效设计(如:开盖声、水滴声、环境白噪音)
- rhythm: {"change": "保持/加快/放慢", "multiplier": 1.0}
旁白要求:
- 必须连贯,形成完整的叙事
- 符合{style}风格ASMR风格应极简或无旁白
- 每句旁白要能独立成句,但连起来是完整故事
输出严格JSON格式。
"""
def generate_script(
    brief: Dict[str, Any],
    image_urls: Optional[List[str]] = None,
    regenerate_feedback: str = ""
) -> Dict[str, Any]:
    """
    Generate the complete video script with scenes from a creative brief.

    Fills the "{style}" markers of SCRIPT_SYSTEM_PROMPT with the brief's
    style, builds a multi-part user message (brief JSON, optional feedback,
    optional reference images), sends it to ``config.VISION_MODEL_ID`` and
    decodes the reply as JSON.

    Args:
        brief: Creative brief; its top-level "style" value selects the style
            named in the system prompt. A missing, null or empty style falls
            back to "现代广告".
            NOTE(review): REFINE_SYSTEM_PROMPT nests "style" under a "brief"
            key, so callers presumably pass that inner object - confirm.
        image_urls: URLs of reference images; skipped when None or empty.
        regenerate_feedback: User feedback to steer a regeneration; skipped
            when empty.

    Returns:
        The decoded JSON reply; its structure is not validated here.

    Raises:
        json.JSONDecodeError: If no JSON value can be recovered from the reply.
        Exception: Any other failure (e.g. from the API call) is logged and
            re-raised unchanged.
    """
    logger.info("Brain: Generating script...")

    # `or` covers a present-but-null/empty style (dict.get's default only
    # covers a missing key); str() keeps str.replace from raising TypeError
    # on a non-string value.
    style = str(brief.get("style") or "现代广告")
    # Plain replace instead of str.format: the template contains literal
    # JSON braces.
    system_prompt = SCRIPT_SYSTEM_PROMPT.replace("{style}", style)

    content_parts = [{"type": "text", "text": f"创意简报: {json.dumps(brief, ensure_ascii=False)}"}]
    if regenerate_feedback:
        content_parts.append({"type": "text", "text": f"\n用户反馈(请据此调整): {regenerate_feedback}"})
    if image_urls:
        content_parts.append({"type": "text", "text": "\n用户上传的参考素材生成的image_prompt必须参考这些素材中的产品外观:"})
        content_parts.extend(
            {"type": "image_url", "image_url": {"url": url}}
            for url in image_urls
        )

    try:
        response = client.chat.completions.create(
            # Vision model so that attached reference images can be read.
            model=config.VISION_MODEL_ID,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": content_parts},
            ],
            temperature=0.8,
            max_tokens=8000,
        )
        reply = response.choices[0].message.content.strip()

        # Fast path: reply is a ```json ... ``` fenced block; keep only the
        # text between the first pair of fences and drop the "json" tag.
        candidate = reply
        if candidate.startswith("```"):
            candidate = candidate.split("```")[1]
            if candidate.startswith("json"):
                candidate = candidate[4:]
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Fallback for replies the fence logic cannot handle (prose
            # around the JSON, a fence tag other than lowercase "json",
            # ``` inside a JSON string): parse the span from the first "{"
            # to the last "}" of the raw reply.
            start, end = reply.find("{"), reply.rfind("}")
            if start == -1 or end < start:
                raise
            return json.loads(reply[start:end + 1])
    except Exception as e:
        logger.error("Brain Script Error: %s", e)
        raise
# ============================================================
# Stage 4: Regenerate Single Scene
# ============================================================
def regenerate_scene(
    full_script: Dict[str, Any],
    scene_id: int,
    feedback: str,
    brief: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Regenerate a single scene of an existing script based on user feedback.

    Sends the whole script, the optional brief, the target scene id and the
    feedback to ``config.BRAIN_MODEL_ID`` and decodes the reply as JSON. The
    returned scene is not merged into ``full_script`` here.

    Args:
        full_script: The current script; JSON-serialized into the message.
        scene_id: Identifier of the scene to regenerate.
        feedback: The user's feedback on that scene.
        brief: Optional creative brief. Its "style" value is named in the
            system prompt; a missing brief or a missing, null or empty style
            falls back to "现代广告".

    Returns:
        The decoded JSON reply; the system prompt asks the model to output
        only the new scene object. Its structure is not validated here.

    Raises:
        json.JSONDecodeError: If no JSON value can be recovered from the reply.
        Exception: Any other failure (e.g. from the API call) is logged and
            re-raised unchanged.
    """
    logger.info("Brain: Regenerating scene %s...", scene_id)

    # `or` covers both a missing brief and a present-but-null/empty style,
    # so the prompt never reads "None" or an empty style name.
    style = str((brief or {}).get("style") or "现代广告")
    system_prompt = f"""你是短视频编导,专精{style}风格。根据用户反馈重新生成指定分镜。
保持与其他分镜的风格连贯性。
image_prompt必须继承产品的视觉描述。
只输出新的scene对象(JSON)。
"""

    script_json = json.dumps(full_script, ensure_ascii=False)
    brief_json = json.dumps(brief, ensure_ascii=False) if brief else ""
    user_content = f"""
完整脚本: {script_json}
创意简报: {brief_json}
需要重新生成的分镜ID: {scene_id}
用户反馈: {feedback}
"""

    try:
        response = client.chat.completions.create(
            model=config.BRAIN_MODEL_ID,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            temperature=0.8,
            max_tokens=2000,
        )
        reply = response.choices[0].message.content.strip()

        # Fast path: reply is a ```json ... ``` fenced block; keep only the
        # text between the first pair of fences and drop the "json" tag.
        candidate = reply
        if candidate.startswith("```"):
            candidate = candidate.split("```")[1]
            if candidate.startswith("json"):
                candidate = candidate[4:]
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Fallback for replies the fence logic cannot handle (prose
            # around the JSON, a fence tag other than lowercase "json",
            # ``` inside a JSON string): parse the span from the first "{"
            # to the last "}" of the raw reply.
            start, end = reply.find("{"), reply.rfind("}")
            if start == -1 or end < start:
                raise
            return json.loads(reply[start:end + 1])
    except Exception as e:
        logger.error("Brain Regenerate Scene Error: %s", e)
        raise