feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
This commit is contained in:
Tony Zhang
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions

14
modules/__init__.py Normal file
View File

@@ -0,0 +1,14 @@
"""
Gloda Video Factory - Modules Package
"""
__all__ = [
"utils",
"brain",
"factory",
"editor",
"ffmpeg_utils",
"fancy_text",
"composer"
]

81
modules/asr.py Normal file
View File

@@ -0,0 +1,81 @@
"""
MatchMe Studio - ASR Module (Whisper via ShuBiaoBiao)
"""
import logging
import subprocess
from pathlib import Path
from typing import Optional
from openai import OpenAI
import config
logger = logging.getLogger(__name__)
# OpenAI-compatible client pointed at the ShuBiaoBiao gateway, used as a
# Whisper transcription proxy (see transcribe()).
client = OpenAI(
    api_key=config.SHUBIAOBIAO_KEY,
    base_url=config.SHUBIAOBIAO_BASE_URL
)
def extract_audio_from_video(video_path: str) -> str:
    """Extract the audio track of *video_path* as 16 kHz mono MP3 via ffmpeg.

    Args:
        video_path: Path of the source video file.

    Returns:
        Path (str) of the extracted MP3 inside config.TEMP_DIR.

    Raises:
        RuntimeError: If ffmpeg exits non-zero (stderr is logged first).
    """
    video_path = Path(video_path)
    audio_path = config.TEMP_DIR / f"{video_path.stem}_audio.mp3"
    cmd = [
        "ffmpeg", "-y",
        "-i", str(video_path),
        "-vn",  # No video
        "-acodec", "libmp3lame",
        "-ar", "16000",  # 16kHz for Whisper
        "-ac", "1",  # Mono
        str(audio_path)
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
        logger.info(f"Audio extracted to {audio_path}")
        return str(audio_path)
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg error: {e.stderr.decode()}")
        # Chain the original error so the ffmpeg exit status/traceback
        # is preserved for callers instead of being swallowed.
        raise RuntimeError("Failed to extract audio from video") from e
def transcribe(audio_path: str) -> str:
    """Transcribe an audio file to Chinese text using the Whisper API."""
    logger.info(f"Transcribing {audio_path}...")
    try:
        with open(audio_path, "rb") as fh:
            raw = client.audio.transcriptions.create(
                model="whisper-1",
                file=fh,
                language="zh",  # Chinese
                response_format="text"
            )
        # Depending on SDK version the raw result is either a plain string
        # or an object carrying a .text attribute.
        transcript = raw if isinstance(raw, str) else raw.text
        logger.info(f"Transcription complete: {len(transcript)} chars")
        return transcript
    except Exception as e:
        logger.error(f"Whisper API error: {e}")
        raise
def transcribe_video(video_path: str) -> str:
    """Extract the audio track from a video and transcribe it.

    The intermediate MP3 created in config.TEMP_DIR is removed afterwards
    (best effort) so repeated runs do not accumulate temp files.

    Args:
        video_path: Path of the source video.

    Returns:
        The transcribed text.
    """
    audio_path = extract_audio_from_video(video_path)
    try:
        return transcribe(audio_path)
    finally:
        # Clean up the temp audio even when transcription fails.
        Path(audio_path).unlink(missing_ok=True)

346
modules/brain.py Normal file
View File

@@ -0,0 +1,346 @@
"""
MatchMe Studio - Brain Module (Multi-stage Analysis & Script Generation)
"""
import json
import logging
from typing import Dict, Any, List, Optional
from openai import OpenAI
import config
logger = logging.getLogger(__name__)
# Use Volcengine (Doubao) via its OpenAI-compatible interface; model ids are
# selected per call (VISION_MODEL_ID vs BRAIN_MODEL_ID).
client = OpenAI(
    api_key=config.VOLC_API_KEY,
    base_url=config.VOLC_BASE_URL
)
# ============================================================
# Stage 1: Analyze Materials
# ============================================================
# Stage-1 system prompt (Chinese). Instructs the vision model to analyze the
# user's materials and emit strict JSON with analysis / detected_info /
# missing_info / questions / ready fields. analyze_materials() json-parses
# the reply, so the prompt must keep demanding pure JSON output.
ANALYZE_SYSTEM_PROMPT = """你是一位资深短视频创作总监专精TikTok/抖音爆款内容。
任务:深度分析用户提供的素材和需求,识别产品特性、使用场景、目标人群。
分析维度:
1. 产品/服务核心卖点(从素材中提取视觉特征)
2. 视觉风格特征(颜色、质感、包装)
3. 潜在目标受众
4. 内容调性建议
然后检查是否缺少关键信息如果缺少生成2-5个问题帮助完善需求。
每个问题必须与短视频创作直接相关。
输出严格JSON格式
{
"analysis": "详细分析结果,包括从素材中识别到的视觉元素...",
"detected_info": {
"product": "识别到的产品名称和类型",
"visual_features": ["视觉特征1", "视觉特征2"],
"audience": "推测的目标人群",
"style": "推测的风格"
},
"missing_info": ["缺少的信息1", "缺少的信息2"],
"questions": [
{
"id": "q1",
"text": "问题文字(说明为什么这个问题重要)",
"options": ["选项A", "选项B", "选项C"],
"allow_multiple": true,
"allow_custom": true
}
],
"ready": false
}
如果信息足够ready=truequestions为空数组。
"""
def analyze_materials(
    prompt: str,
    image_urls: Optional[List[str]] = None,
    asr_text: str = ""
) -> Dict[str, Any]:
    """
    Deep analysis of user materials via the vision model.

    Args:
        prompt: The user's free-form requirement text.
        image_urls: Optional URLs of uploaded reference images.
        asr_text: Optional ASR transcript of a reference video.

    Returns:
        Parsed JSON dict (analysis, detected_info, missing_info,
        questions, ready) as specified by ANALYZE_SYSTEM_PROMPT.

    Raises:
        Exception: API or JSON-decoding errors, re-raised after logging.
    """
    logger.info("Brain: Analyzing materials...")
    # Vision-model input format: a content list mixing text parts and
    # image_url parts.
    content_parts = [{"type": "text", "text": f"用户需求: {prompt}"}]
    if asr_text:
        content_parts.append({"type": "text", "text": f"\n视频原声(ASR转写): {asr_text}"})
    if image_urls:
        content_parts.append({"type": "text", "text": "\n用户上传的素材图片(请仔细分析这些图片中的产品特征):"})
        for url in image_urls:
            content_parts.append({
                "type": "image_url",
                "image_url": {"url": url}
            })
    messages = [
        # Doubao follows the standard chat structure; if the system role were
        # ever rejected alongside images, prepend the prompt to user content.
        {"role": "system", "content": ANALYZE_SYSTEM_PROMPT},
        {"role": "user", "content": content_parts}
    ]
    try:
        # Use the vision model for analysis.
        response = client.chat.completions.create(
            model=config.VISION_MODEL_ID,
            messages=messages,
            temperature=0.7,
            max_tokens=4000
        )
        content = response.choices[0].message.content.strip()
        # Strip an optional markdown code fence (```json ... ```).
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()  # drop the newline left after the fence tag
        return json.loads(content)
    except Exception as e:
        logger.error(f"Brain Analyze Error: {e}")
        raise
# ============================================================
# Stage 2: Refine Brief with Answers
# ============================================================
# Stage-2 system prompt (Chinese). Merges the original request, the stage-1
# analysis and the user's answers into one creative brief; the reply must be
# strict JSON ({"brief": ..., "creative_summary": ..., "ready": true}),
# which refine_brief() json-parses.
REFINE_SYSTEM_PROMPT = """你是短视频创作总监。
根据原始需求、AI分析结果、用户补充回答整合为完整的创意简报。
注意用户选择的风格偏好如ASMR、剧情、视觉流等必须作为核心创作方向贯穿整个简报。
输出JSON:
{
"brief": {
"product": "产品名称",
"product_visual_description": "产品视觉描述(颜色、形状、包装、质感等,用于后续图片生成)",
"selling_points": ["卖点1", "卖点2"],
"target_audience": "目标人群",
"platform": "投放平台",
"style": "视频风格必须明确如ASMR/剧情/视觉流等)",
"style_requirements": "该风格的具体创作要求如ASMR需要开盖声、质感特写、无人脸等",
"creativity_level": "创意程度",
"reference": "对标账号/竞品",
"user_assets_description": "用户上传素材的描述(用于后续继承)"
},
"creative_summary": "整体创意概述50字以内描述这个视频的核心创意方向",
"ready": true
}
"""
def refine_brief(
    original_prompt: str,
    analysis: Dict[str, Any],
    answers: Dict[str, Any],
    image_urls: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Integrate user answers into a complete creative brief.

    Args:
        original_prompt: The user's original requirement text.
        analysis: Stage-1 result from analyze_materials().
        answers: The user's replies to the clarifying questions.
        image_urls: URLs of uploaded assets, recorded into the brief.

    Returns:
        Parsed JSON dict with "brief", "creative_summary" and "ready".

    Raises:
        Exception: API or JSON-decoding errors, re-raised after logging.
    """
    logger.info("Brain: Refining brief with answers...")
    user_content = f"""
原始需求: {original_prompt}
AI分析结果: {json.dumps(analysis, ensure_ascii=False)}
用户补充回答: {json.dumps(answers, ensure_ascii=False)}
用户上传的素材URL: {json.dumps(image_urls or [], ensure_ascii=False)}
"""
    try:
        # Pure text reasoning: the plain BRAIN model (Doubao Pro) suffices;
        # no new images are involved at this stage.
        response = client.chat.completions.create(
            model=config.BRAIN_MODEL_ID,
            messages=[
                {"role": "system", "content": REFINE_SYSTEM_PROMPT},
                {"role": "user", "content": user_content}
            ],
            temperature=0.5,
            max_tokens=3000
        )
        content = response.choices[0].message.content.strip()
        # Strip an optional ```json fence before parsing.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()
        return json.loads(content)
    except Exception as e:
        logger.error(f"Brain Refine Error: {e}")
        raise
# ============================================================
# Stage 3: Generate Script
# ============================================================
# Stage-3 system prompt (Chinese). Defines the full script schema
# (creative_summary, hook, scenes[], cta) and per-scene fields
# (image_prompt, keyframe, camera_movement, voiceover, sound_design, ...).
# Contains literal "{style}" placeholders that generate_script() fills via
# str.replace — NOT str.format, because the template also holds JSON braces.
SCRIPT_SYSTEM_PROMPT = """你是顶级短视频编导,专精{style}风格内容创作。
根据创意简报生成爆款脚本。必须严格遵循用户选择的风格要求。
脚本结构要求:
1. creative_summary: 整体创意概述(这条视频的核心创意是什么)
2. hook: 前3秒钩子设计必须抓眼球符合{style}风格)
3. scenes: 3-8个分镜
4. cta: 结尾行动号召(纯文本字符串)
每个分镜(scene)必须包含:
- id: 分镜编号
- duration: 时长(5/10/15秒符合视频模型参数)
- timeline: 时间轴 (如 "0:00-0:05")
- image_prompt: 【关键】用于AI生图的详细英文prompt必须包含
* 产品的具体视觉描述继承自brief中的product_visual_description
* 8k, hyper-realistic, cinematic lighting
* 色调、环境、构图、焦点
* 风格要求如ASMR需要macro shot, satisfying texture, no human face
- keyframe: {
"color_tone": "色调",
"environment": "环境/背景",
"foreground": "前景元素",
"focus": "视觉焦点",
"subject": "主体描述",
"composition": "构图方式"
}
- camera_movement: 运镜描述slow zoom in, pan left, static
- story_beat: 这个分镜在整体故事中的作用
- voiceover: 旁白文字({style}风格如ASMR应简短或无旁白用音效代替
- sound_design: 音效设计(如:开盖声、水滴声、环境白噪音)
- rhythm: {"change": "保持/加快/放慢", "multiplier": 1.0}
旁白要求:
- 必须连贯,形成完整的叙事
- 符合{style}风格ASMR风格应极简或无旁白
- 每句旁白要能独立成句,但连起来是完整故事
输出严格JSON格式。
"""
def generate_script(
    brief: Dict[str, Any],
    image_urls: Optional[List[str]] = None,
    regenerate_feedback: str = ""
) -> Dict[str, Any]:
    """
    Generate the complete video script (hook, scenes, cta) from a brief.

    Args:
        brief: Creative brief produced by refine_brief().
        image_urls: Optional reference images the image_prompts must match.
        regenerate_feedback: Optional user feedback for a full regeneration.

    Returns:
        Parsed JSON dict following the SCRIPT_SYSTEM_PROMPT schema.

    Raises:
        Exception: API or JSON-decoding errors, re-raised after logging.
    """
    logger.info("Brain: Generating script...")
    style = brief.get("style", "现代广告")
    # str.replace, not str.format: the template also contains JSON braces.
    system_prompt = SCRIPT_SYSTEM_PROMPT.replace("{style}", style)
    content_parts = [{"type": "text", "text": f"创意简报: {json.dumps(brief, ensure_ascii=False)}"}]
    if regenerate_feedback:
        content_parts.append({"type": "text", "text": f"\n用户反馈(请据此调整): {regenerate_feedback}"})
    if image_urls:
        content_parts.append({"type": "text", "text": "\n用户上传的参考素材生成的image_prompt必须参考这些素材中的产品外观:"})
        for url in image_urls:
            content_parts.append({
                "type": "image_url",
                "image_url": {"url": url}
            })
    try:
        response = client.chat.completions.create(
            model=config.VISION_MODEL_ID,  # Use Vision model to see reference images if available
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": content_parts}
            ],
            temperature=0.8,
            max_tokens=8000
        )
        content = response.choices[0].message.content.strip()
        # Strip an optional ```json fence before parsing.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()
        return json.loads(content)
    except Exception as e:
        logger.error(f"Brain Script Error: {e}")
        raise
# ============================================================
# Stage 4: Regenerate Single Scene
# ============================================================
def regenerate_scene(
    full_script: Dict[str, Any],
    scene_id: int,
    feedback: str,
    brief: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Regenerate a single scene based on user feedback.

    Args:
        full_script: The complete current script (for style continuity).
        scene_id: Id of the scene to regenerate.
        feedback: The user's change request.
        brief: Optional creative brief (provides the style).

    Returns:
        Parsed JSON dict for the new scene object only.

    Raises:
        Exception: API or JSON-decoding errors, re-raised after logging.
    """
    logger.info(f"Brain: Regenerating scene {scene_id}...")
    style = brief.get("style", "现代广告") if brief else "现代广告"
    system_prompt = f"""你是短视频编导,专精{style}风格。根据用户反馈重新生成指定分镜。
保持与其他分镜的风格连贯性。
image_prompt必须继承产品的视觉描述。
只输出新的scene对象(JSON)。
"""
    user_content = f"""
完整脚本: {json.dumps(full_script, ensure_ascii=False)}
创意简报: {json.dumps(brief, ensure_ascii=False) if brief else ""}
需要重新生成的分镜ID: {scene_id}
用户反馈: {feedback}
"""
    try:
        response = client.chat.completions.create(
            model=config.BRAIN_MODEL_ID,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content}
            ],
            temperature=0.8,
            max_tokens=2000
        )
        content = response.choices[0].message.content.strip()
        # Strip an optional ```json fence before parsing.
        if content.startswith("```"):
            parts = content.split("```")
            if len(parts) > 1:
                content = parts[1]
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()
        return json.loads(content)
    except Exception as e:
        logger.error(f"Brain Regenerate Scene Error: {e}")
        raise

717
modules/composer.py Normal file
View File

@@ -0,0 +1,717 @@
"""
视频合成器模块
整合视频拼接、花字叠加、旁白配音的完整流程
"""
import os
import time
import logging
from pathlib import Path
from typing import Dict, Any, List, Optional, Union
import config
from modules import ffmpeg_utils, fancy_text, factory, storage
from modules.text_renderer import renderer
logger = logging.getLogger(__name__)
class VideoComposer:
    """Video composer.

    Orchestrates the full post-production pipeline: concatenating scene
    clips, burning in subtitles, overlaying fancy-text images, generating
    and mixing TTS voiceover, and adding background music.
    """

    def __init__(
        self,
        output_dir: str = None,
        target_size: tuple = (1080, 1920),
        voice_type: str = "sweet_female"
    ):
        """
        Initialize the composer.

        Args:
            output_dir: Output directory (defaults to config.OUTPUT_DIR).
            target_size: Target resolution as (width, height).
            voice_type: Default voiceover voice.
        """
        self.output_dir = Path(output_dir) if output_dir else config.OUTPUT_DIR
        # parents=True: a nested custom output dir would otherwise make
        # mkdir(exist_ok=True) raise FileNotFoundError.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.target_size = target_size
        self.voice_type = voice_type
        # Intermediate files registered here are removed by cleanup().
        self._temp_files = []

    def _add_temp(self, path: str):
        """Register a temporary file for later cleanup."""
        if path:
            self._temp_files.append(path)

    def cleanup(self):
        """Delete all registered temporary files (best effort)."""
        for f in self._temp_files:
            try:
                if os.path.exists(f):
                    os.remove(f)
            except Exception as e:
                logger.warning(f"Failed to cleanup {f}: {e}")
        self._temp_files = []

    def compose(
        self,
        video_paths: List[str],
        subtitles: List[Dict[str, Any]] = None,
        fancy_texts: List[Dict[str, Any]] = None,
        voiceover_text: str = None,
        voiceover_segments: List[Dict[str, Any]] = None,
        bgm_path: str = None,
        bgm_volume: float = 0.15,
        output_name: str = None,
        upload_to_r2: bool = False
    ) -> str:
        """
        Full composition pipeline.

        Args:
            video_paths: Paths of the scene clips, in order.
            subtitles: Subtitle configs [{text, start, duration, style}].
            fancy_texts: Fancy-text configs [{text, style, x, y, start, duration}].
            voiceover_text: Full voiceover text (TTS generated and mixed in).
            voiceover_segments: Segmented voiceover [{text, start}];
                mutually exclusive with voiceover_text.
            bgm_path: Background-music file path.
            bgm_volume: BGM volume.
            output_name: Output file name (without extension).
            upload_to_r2: Whether to upload the result to R2 storage.

        Returns:
            Final video path, or the R2 URL when upload_to_r2 is True.

        Raises:
            ValueError: If video_paths is empty.
        """
        if not video_paths:
            raise ValueError("No video paths provided")
        timestamp = int(time.time())
        output_name = output_name or f"composed_{timestamp}"
        logger.info(f"Starting composition: {len(video_paths)} videos")
        try:
            # Step 1: concatenate the clips.
            merged_path = str(config.TEMP_DIR / f"{output_name}_merged.mp4")
            ffmpeg_utils.concat_videos(video_paths, merged_path, self.target_size)
            self._add_temp(merged_path)
            current_video = merged_path
            # Step 1.1: add a silent audio bed if there is no audio track,
            # so later filters referencing 0:a do not fail.
            silent_path = str(config.TEMP_DIR / f"{output_name}_silent.mp4")
            ffmpeg_utils.add_silence_audio(current_video, silent_path)
            self._add_temp(silent_path)
            current_video = silent_path
            # Step 2: burn in subtitles (white text, black outline, no box,
            # centered in the lower area).
            if subtitles:
                subtitled_path = str(config.TEMP_DIR / f"{output_name}_subtitled.mp4")
                subtitle_style = {
                    "font": ffmpeg_utils._get_font_path(),
                    "fontsize": 60,
                    "fontcolor": "white",
                    "borderw": 5,
                    "bordercolor": "black",
                    "box": 0,  # no background box
                    "y": "h-200",  # lower area, centered
                }
                ffmpeg_utils.add_multiple_subtitles(
                    current_video, subtitles, subtitled_path, default_style=subtitle_style
                )
                self._add_temp(subtitled_path)
                current_video = subtitled_path
            # Step 3: overlay fancy text (supports atomic style params).
            if fancy_texts:
                overlay_configs = []
                for ft in fancy_texts:
                    text = ft.get("text", "")
                    style = ft.get("style")
                    custom_style = ft.get("custom_style")
                    # A dict style means atomic params: render directly.
                    if isinstance(style, dict):
                        img_path = renderer.render(text, style, cache=False)
                    elif custom_style and isinstance(custom_style, dict):
                        # Legacy compatibility: try the atomic renderer when
                        # the custom style carries atomic keys.
                        if "font_size" in custom_style:
                            img_path = renderer.render(text, custom_style, cache=False)
                        else:
                            # Fall back to the old fancy_text module.
                            img_path = fancy_text.create_fancy_text(
                                text=text,
                                style=style if isinstance(style, str) else "subtitle",
                                custom_style={
                                    **(custom_style or {}),
                                    "font_name": "/System/Library/Fonts/PingFang.ttc",
                                },
                                cache=False
                            )
                    else:
                        # Legacy path.
                        img_path = fancy_text.create_fancy_text(
                            text=text,
                            style=style if isinstance(style, str) else "subtitle",
                            custom_style={
                                "font_name": "/System/Library/Fonts/PingFang.ttc",
                            },
                            cache=False
                        )
                    overlay_configs.append({
                        "path": img_path,
                        "x": ft.get("x", "(W-w)/2"),
                        "y": ft.get("y", "(H-h)/2"),
                        "start": ft.get("start", 0),
                        "duration": ft.get("duration", 999)
                    })
                fancy_path = str(config.TEMP_DIR / f"{output_name}_fancy.mp4")
                ffmpeg_utils.overlay_multiple_images(
                    current_video, overlay_configs, fancy_path
                )
                self._add_temp(fancy_path)
                current_video = fancy_path
            # Step 4: generate and mix the voiceover (Volcengine WS first;
            # factory falls back to Edge TTS on failure).
            if voiceover_text:
                vo_path = factory.generate_voiceover_volcengine(
                    text=voiceover_text,
                    voice_type=self.voice_type
                )
                self._add_temp(vo_path)
                voiced_path = str(config.TEMP_DIR / f"{output_name}_voiced.mp4")
                ffmpeg_utils.mix_audio(
                    current_video, vo_path, voiced_path,
                    audio_volume=1.5,
                    video_volume=0.2
                )
                self._add_temp(voiced_path)
                current_video = voiced_path
            elif voiceover_segments:
                current_video = self._add_segmented_voiceover(
                    current_video, voiceover_segments, output_name
                )
            # Step 5: add BGM with fades (if ducking fails ffmpeg_utils
            # falls back to plain low-volume mixing).
            if bgm_path:
                bgm_output = str(config.TEMP_DIR / f"{output_name}_bgm.mp4")
                ffmpeg_utils.add_bgm(
                    current_video, bgm_path, bgm_output,
                    bgm_volume=bgm_volume,
                    ducking=False,  # disabled for compatibility; keep BGM quiet instead
                    duck_gain_db=-6.0,
                    fade_in=1.0,
                    fade_out=1.0
                )
                self._add_temp(bgm_output)
                current_video = bgm_output
            # Step 6: copy the final file into the output directory.
            final_path = str(self.output_dir / f"{output_name}.mp4")
            import shutil
            shutil.copy(current_video, final_path)
            logger.info(f"Composition complete: {final_path}")
            # Optional upload to R2 object storage.
            if upload_to_r2:
                r2_url = storage.upload_file(final_path)
                logger.info(f"Uploaded to R2: {r2_url}")
                return r2_url
            return final_path
        finally:
            # Remove intermediates (the copied final output is kept).
            self.cleanup()

    def _add_segmented_voiceover(
        self,
        video_path: str,
        segments: List[Dict[str, Any]],
        output_name: str
    ) -> str:
        """Mix per-segment TTS voiceovers into the video at their offsets."""
        if not segments:
            return video_path
        # Generate one TTS clip per segment.
        audio_files = []
        for i, seg in enumerate(segments):
            text = seg.get("text", "")
            if not text:
                continue
            voice = seg.get("voice_type", self.voice_type)
            audio_path = factory.generate_voiceover_volcengine(
                text=text,
                voice_type=voice,
                output_path=str(config.TEMP_DIR / f"{output_name}_seg_{i}.mp3")
            )
            if audio_path:
                audio_files.append({
                    "path": audio_path,
                    "start": seg.get("start", 0)
                })
                self._add_temp(audio_path)
        if not audio_files:
            return video_path
        # Mix the clips in one at a time.
        current = video_path
        for i, af in enumerate(audio_files):
            output = str(config.TEMP_DIR / f"{output_name}_seg_mixed_{i}.mp4")
            ffmpeg_utils.mix_audio(
                current, af["path"], output,
                audio_volume=1.0,
                video_volume=0.2 if i == 0 else 1.0,  # duck the original audio only once
                audio_start=af["start"]
            )
            self._add_temp(output)
            current = output
        return current

    def compose_from_script(
        self,
        script: Dict[str, Any],
        video_map: Dict[int, str],
        bgm_path: str = None,
        output_name: str = None
    ) -> str:
        """
        Compose from a generated script plus a scene-id -> video-path map.

        Args:
            script: Normalized scene script (scenes, voiceover_timeline).
            video_map: Mapping of scene id to rendered clip path.
            bgm_path: Optional BGM path.
            output_name: Output file name (without extension).

        Returns:
            Path of the final composed video.

        Raises:
            ValueError: If the script has no scenes.
        """
        scenes = script.get("scenes", [])
        if not scenes:
            raise ValueError("Empty script")
        video_paths = []
        fancy_texts = []
        # 1. Collect clip paths and fancy-text overlays in scene order.
        total_duration = 0.0
        for scene in scenes:
            scene_id = scene["id"]
            video_path = video_map.get(scene_id)
            if not video_path or not os.path.exists(video_path):
                logger.warning(f"Missing video for scene {scene_id}, skipping")
                continue
            # Probe the real clip duration; fall back to 5s on probe failure.
            try:
                info = ffmpeg_utils.get_video_info(video_path)
                duration = float(info.get("duration", 5.0))
            except Exception:  # was a bare except; keep SystemExit/KeyboardInterrupt raisable
                duration = 5.0
            video_paths.append(video_path)
            # Fancy text: white, black outline, no box, fixed upper-center.
            if "fancy_text" in scene:
                ft = scene["fancy_text"]
                if isinstance(ft, dict):
                    text = ft.get("text", "")
                    if text:
                        # Fixed style: white text with black outline, no box.
                        fixed_style = {
                            "font_size": 72,
                            "font_color": "#FFFFFF",
                            "stroke": {"color": "#000000", "width": 5}
                            # no "background" key -> no box
                        }
                        fancy_texts.append({
                            "text": text,
                            "style": fixed_style,
                            "x": "(W-w)/2",  # centered
                            "y": "180",  # upper area
                            "start": total_duration + float(ft.get("start_time", 0)),
                            "duration": float(ft.get("duration", duration))
                        })
            total_duration += duration
        # 2. Concatenate the clips.
        timestamp = int(time.time())
        output_name = output_name or f"composed_{timestamp}"
        merged_path = str(config.TEMP_DIR / f"{output_name}_merged.mp4")
        ffmpeg_utils.concat_videos(video_paths, merged_path, self.target_size)
        self._add_temp(merged_path)
        current_video = merged_path
        # 3. Build the global voiceover timeline.
        voiceover_timeline = script.get("voiceover_timeline", [])
        mixed_audio_path = str(config.TEMP_DIR / f"{output_name}_mixed_vo.mp3")
        # Start from a silent bed track of length total_duration.
        ffmpeg_utils._run_ffmpeg([
            ffmpeg_utils.FFMPEG_PATH, "-y",
            "-f", "lavfi", "-i", "anullsrc=r=44100:cl=stereo",
            "-t", str(total_duration),
            "-c:a", "mp3",
            mixed_audio_path
        ])
        self._add_temp(mixed_audio_path)
        subtitles = []
        if voiceover_timeline:
            for i, item in enumerate(voiceover_timeline):
                text = item.get("text", "")
                sub_text = item.get("subtitle", text)
                # Two supported item formats:
                #   new: start_time / duration in absolute seconds
                #   old: start_ratio / duration_ratio in [0, 1] of total length
                if "start_time" in item:
                    target_start = float(item.get("start_time", 0))
                    target_duration = float(item.get("duration", 3))
                else:
                    start_ratio = float(item.get("start_ratio", 0))
                    duration_ratio = float(item.get("duration_ratio", 0))
                    target_start = start_ratio * total_duration
                    target_duration = duration_ratio * total_duration
                if not text:
                    continue
                # Generate TTS for this line.
                tts_path = factory.generate_voiceover_volcengine(
                    text=text,
                    voice_type=self.voice_type,
                    output_path=str(config.TEMP_DIR / f"{output_name}_vo_{i}.mp3")
                )
                self._add_temp(tts_path)
                # Stretch/compress to the target duration.
                adjusted_path = str(config.TEMP_DIR / f"{output_name}_vo_adj_{i}.mp3")
                ffmpeg_utils.adjust_audio_duration(tts_path, target_duration, adjusted_path)
                self._add_temp(adjusted_path)
                # Mix into the running master track at the target offset.
                new_mixed = str(config.TEMP_DIR / f"{output_name}_mixed_{i}.mp3")
                ffmpeg_utils.mix_audio_at_offset(mixed_audio_path, adjusted_path, target_start, new_mixed)
                mixed_audio_path = new_mixed  # update current mixed path
                self._add_temp(new_mixed)
                # Subtitle entry, fully synced with the voiceover.
                subtitles.append({
                    "text": ffmpeg_utils.wrap_text_smart(sub_text),
                    "start": target_start,
                    "duration": target_duration,
                    "style": {}  # default
                })
        # 4. Mix the assembled voiceover into the video.
        voiced_path = str(config.TEMP_DIR / f"{output_name}_voiced.mp4")
        ffmpeg_utils.mix_audio(
            current_video, mixed_audio_path, voiced_path,
            audio_volume=1.5,
            video_volume=0.2  # duck the original audio
        )
        self._add_temp(voiced_path)
        current_video = voiced_path
        # 5. Burn in the subtitles.
        if subtitles:
            subtitled_path = str(config.TEMP_DIR / f"{output_name}_subtitled.mp4")
            subtitle_style = {
                "font": ffmpeg_utils._get_font_path(),
                "fontsize": 60,
                "fontcolor": "white",
                "borderw": 5,
                "bordercolor": "black",
                "box": 0,  # no background box
                "y": "h-200",  # lower area, centered
            }
            ffmpeg_utils.add_multiple_subtitles(
                current_video, subtitles, subtitled_path, default_style=subtitle_style
            )
            self._add_temp(subtitled_path)
            current_video = subtitled_path
        # 6. Overlay fancy text.
        if fancy_texts:
            fancy_path = str(config.TEMP_DIR / f"{output_name}_fancy.mp4")
            overlay_configs = []
            for ft in fancy_texts:
                # Render the fancy-text image.
                img_path = renderer.render(ft["text"], ft["style"], cache=False)
                overlay_configs.append({
                    "path": img_path,
                    "x": ft["x"],
                    "y": ft["y"],
                    "start": ft["start"],
                    "duration": ft["duration"]
                })
            ffmpeg_utils.overlay_multiple_images(
                current_video, overlay_configs, fancy_path
            )
            self._add_temp(fancy_path)
            current_video = fancy_path
        # 7. Add BGM.
        if bgm_path:
            bgm_output = str(config.TEMP_DIR / f"{output_name}_bgm.mp4")
            ffmpeg_utils.add_bgm(
                current_video, bgm_path, bgm_output,
                bgm_volume=0.15
            )
            self._add_temp(bgm_output)
            current_video = bgm_output
        # 8. Copy the final file to the output directory.
        final_path = str(self.output_dir / f"{output_name}.mp4")
        import shutil
        shutil.copy(current_video, final_path)
        logger.info(f"Composition complete: {final_path}")
        self.cleanup()
        return final_path

    def compose_standard_task(self, task_config: Dict[str, Any]) -> str:
        """
        Execute a standard composition task config (legacy format).

        The timeline items may omit "type"; it is inferred from the keys
        present on each item.
        """
        settings = task_config.get("settings", {})
        self.voice_type = settings.get("voice_type", self.voice_type)
        # 1. Collect clip paths.
        video_paths = []
        for seg in task_config.get("segments", []):
            path = seg.get("path") or seg.get("video_path")
            if not path:
                continue
            video_paths.append(path)
        # 2. Parse the timeline, inferring the item type when missing.
        subtitles = []
        fancy_texts = []
        voiceover_segments = []
        for item in task_config.get("timeline", []):
            itype = item.get("type")
            if not itype:
                if "text" in item and ("style" in item or "x" in item or "y" in item):
                    itype = "fancy_text"
                elif "text" in item and "duration" in item and "start" in item:
                    itype = "subtitle"
                elif "text" in item and "start" in item:
                    itype = "voiceover"
                else:
                    continue
            if itype == "subtitle":
                subtitles.append(item)
            elif itype == "fancy_text":
                if "x" not in item and "position" in item:
                    item["x"] = item["position"].get("x")
                    item["y"] = item["position"].get("y")
                fancy_texts.append(item)
            elif itype == "voiceover":
                voiceover_segments.append(item)
        return self.compose(
            video_paths=video_paths,
            subtitles=subtitles,
            fancy_texts=fancy_texts,
            voiceover_segments=voiceover_segments,
            bgm_path=settings.get("bgm_path"),
            bgm_volume=settings.get("bgm_volume", 0.06),
            output_name=settings.get("output_name"),
            upload_to_r2=settings.get("upload_to_r2", False)
        )
def compose_product_video(
    video_paths: List[str],
    subtitle_configs: List[Dict[str, Any]] = None,
    fancy_text_configs: List[Dict[str, Any]] = None,
    voiceover_text: str = None,
    bgm_path: str = None,
    output_path: str = None,
    voice_type: str = "sweet_female"
) -> str:
    """Convenience wrapper: compose a product short video.

    Args mirror VideoComposer.compose(); when output_path is given, its
    directory and stem override the composer's defaults.

    Returns:
        Path of the composed video.
    """
    composer = VideoComposer(voice_type=voice_type)
    output_name = None
    if output_path:
        output_name = Path(output_path).stem
        composer.output_dir = Path(output_path).parent
        # Assigning output_dir after __init__ bypasses its mkdir; create the
        # directory here so the final copy does not fail on a missing folder.
        composer.output_dir.mkdir(parents=True, exist_ok=True)
    return composer.compose(
        video_paths=video_paths,
        subtitles=subtitle_configs,
        fancy_texts=fancy_text_configs,
        voiceover_text=voiceover_text,
        bgm_path=bgm_path,
        output_name=output_name
    )
def quick_compose(
    video_folder: str,
    script: List[Dict[str, Any]],
    output_path: str = None,
    voice_type: str = "sweet_female",
    bgm_path: str = None
) -> str:
    """Quick composition: read clips from a folder, pair them with a script.

    Each script item may name its clip via "video"; otherwise clips are
    taken from the folder in sorted order. Items contribute optional
    "subtitle", "fancy_text" and "voiceover" entries on a running timeline.

    Returns:
        Path of the composed video.
    """
    folder = Path(video_folder)
    video_files = sorted([
        f for f in folder.iterdir()
        if f.suffix.lower() in ['.mp4', '.mov', '.avi', '.mkv']
    ])
    video_paths = []
    subtitles = []
    fancy_texts = []
    voiceovers = []
    current_time = 0
    for i, item in enumerate(script):
        if "video" in item:
            vp = folder / item["video"]
        elif i < len(video_files):
            vp = video_files[i]
        else:
            logger.warning(f"No video for script item {i}")
            continue
        video_paths.append(str(vp))
        # Probe the real clip duration; ffprobe may report it as a string,
        # so coerce to float for the timeline arithmetic below. Fall back
        # to the scripted duration on any probe/parse failure.
        try:
            info = ffmpeg_utils.get_video_info(str(vp))
            duration = float(info.get("duration", 5))
        except Exception:  # was a bare except
            duration = float(item.get("duration", 5))
        if "subtitle" in item:
            subtitles.append({
                "text": item["subtitle"],
                "start": current_time,
                "duration": duration,
                "style": item.get("subtitle_style", {})
            })
        if "fancy_text" in item:
            ft = item["fancy_text"]
            if isinstance(ft, str):
                ft = {"text": ft}
            fancy_texts.append({
                "text": ft.get("text", ""),
                "style": ft.get("style", "highlight"),
                "custom_style": ft.get("custom_style"),
                "x": ft.get("x", "(W-w)/2"),
                "y": ft.get("y", 200),
                "start": current_time,
                "duration": duration
            })
        if "voiceover" in item:
            voiceovers.append(item["voiceover"])
        current_time += duration
    voiceover_text = "".join(voiceovers) if voiceovers else None
    return compose_product_video(
        video_paths=video_paths,
        subtitle_configs=subtitles if subtitles else None,
        fancy_text_configs=fancy_texts if fancy_texts else None,
        voiceover_text=voiceover_text,
        bgm_path=bgm_path,
        output_path=output_path,
        voice_type=voice_type
    )
# ============================================================
# Example usage
# ============================================================
def example_hairclip_video():
    """Example: compose the hair-clip product video end to end.

    quick_compose() picks up the five scene clips from the asset folder in
    sorted order, so no explicit clip list is needed here (the original
    built one but never used it).

    Returns:
        Path of the composed video.
    """
    assets_dir = Path("/Volumes/Tony/video-flow/素材/发夹/合成图拆分镜")
    script = [
        {
            "subtitle": "塌马尾 vs 高颅顶",
            "fancy_text": {
                "text": "塌马尾 vs 高颅顶",
                "style": "comparison",
                "y": 150
            },
            "voiceover": "普通马尾和高颅顶马尾的区别,你看出来了吗",
        },
        {
            "subtitle": "3秒出门无需皮筋",
            "fancy_text": {"text": "发量+50%", "style": "bubble", "y": 300},
            "voiceover": "只需要三秒钟,不需要皮筋,发量瞬间增加百分之五十",
        },
        {
            "subtitle": "发量+50%",
            "voiceover": "蓬松的高颅顶效果,让你瞬间变美",
        },
        {
            "subtitle": "狂甩不掉!",
            "fancy_text": {"text": "狂甩不掉!", "style": "warning", "y": 400},
            "voiceover": "而且超级牢固,怎么甩都不会掉",
        },
        {
            "subtitle": "¥3.99 立即抢购",
            "fancy_text": {"text": "3.99", "style": "price", "y": 500},
            "voiceover": "只要三块九毛九,点击下方链接立即购买",
        },
    ]
    output = quick_compose(
        video_folder=str(assets_dir),
        script=script,
        output_path="/Volumes/Tony/video-flow/output/发夹_合成视频.mp4",
        voice_type="sweet_female"
    )
    print(f"视频合成完成: {output}")
    return output
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
example_hairclip_video()

305
modules/db_manager.py Normal file
View File

@@ -0,0 +1,305 @@
"""
数据库管理模块 (SQLAlchemy)
负责项目数据、任务状态、素材路径的持久化存储
支持 SQLite 和 PostgreSQL
"""
import json
import logging
import time
from typing import Dict, List, Any, Optional
from sqlalchemy import create_engine, Column, String, Integer, Text, Float, UniqueConstraint, func
from sqlalchemy.orm import sessionmaker, scoped_session, declarative_base
from sqlalchemy.dialects.postgresql import JSONB
import config
logger = logging.getLogger(__name__)
# Declarative base shared by all ORM models in this module.
Base = declarative_base()
class Project(Base):
    """ORM model: one video-generation project and its lifecycle state."""
    __tablename__ = 'projects'
    id = Column(String, primary_key=True)
    name = Column(String)
    status = Column(String)  # created, script_generated, images_generated, videos_generated, completed
    product_info = Column(Text)  # JSON string (SQLite) or JSONB (PG - using Text for compat)
    script_data = Column(Text)  # JSON string
    created_at = Column(Float, default=time.time)
    updated_at = Column(Float, default=time.time, onupdate=time.time)
class SceneAsset(Base):
    """ORM model: a generated asset (image/video) for one scene of a project.

    Uniqueness per (project_id, scene_id, asset_type) enables UPSERT updates.
    """
    __tablename__ = 'scene_assets'
    id = Column(Integer, primary_key=True, autoincrement=True)
    project_id = Column(String, index=True)
    scene_id = Column(Integer)
    asset_type = Column(String)  # image, video
    status = Column(String)  # pending, processing, completed, failed
    local_path = Column(Text, nullable=True)
    remote_url = Column(Text, nullable=True)
    task_id = Column(String, nullable=True)  # task id issued by the external API
    metadata_json = Column("metadata", Text, nullable=True)  # JSON string (renamed to avoid conflict with metadata attr)
    created_at = Column(Float, default=time.time)
    updated_at = Column(Float, default=time.time, onupdate=time.time)
    __table_args__ = (UniqueConstraint('project_id', 'scene_id', 'asset_type', name='uix_project_scene_asset'),)
class AppConfig(Base):
    """ORM model: one key/value application setting (value stored as JSON)."""
    __tablename__ = 'app_config'
    key = Column(String, primary_key=True)
    value = Column(Text)  # JSON string
    description = Column(Text, nullable=True)
    updated_at = Column(Float, default=time.time, onupdate=time.time)
class DBManager:
    def __init__(self, connection_string: str = None):
        """Create the engine/session factory and ensure tables exist.

        Args:
            connection_string: SQLAlchemy URL; defaults to
                config.DB_CONNECTION_STRING (SQLite or PostgreSQL).
        """
        if not connection_string:
            connection_string = config.DB_CONNECTION_STRING
        # pool_recycle guards against stale pooled connections.
        self.engine = create_engine(connection_string, pool_recycle=3600)
        self.Session = scoped_session(sessionmaker(bind=self.engine))
        self._init_db()
    def _init_db(self):
        """Create all tables declared on Base (no-op if they already exist)."""
        Base.metadata.create_all(self.engine)
    def _get_session(self):
        # scoped_session: returns the thread-local session instance.
        return self.Session()
    # --- Project Operations ---
    def create_project(self, project_id: str, name: str, product_info: Dict[str, Any]):
        """Insert a new project row; no-op (with a warning) if the id exists.

        Args:
            project_id: Caller-supplied unique id.
            name: Display name.
            product_info: Arbitrary product dict, stored as JSON text.

        Raises:
            Exception: Re-raised after rollback on any DB error.
        """
        session = self._get_session()
        try:
            # Check if exists
            existing = session.query(Project).filter_by(id=project_id).first()
            if existing:
                logger.warning(f"Project {project_id} already exists.")
                return
            new_project = Project(
                id=project_id,
                name=name,
                status="created",
                product_info=json.dumps(product_info, ensure_ascii=False),
                created_at=time.time(),
                updated_at=time.time()
            )
            session.add(new_project)
            session.commit()
        except Exception as e:
            session.rollback()
            logger.error(f"Error creating project: {e}")
            raise
        finally:
            session.close()
    def update_project_script(self, project_id: str, script: Dict[str, Any]):
        """Store the generated script JSON and advance status to script_generated.

        Best-effort: DB errors are rolled back and logged, not raised.
        Silently does nothing when the project id is unknown.
        """
        session = self._get_session()
        try:
            project = session.query(Project).filter_by(id=project_id).first()
            if project:
                project.script_data = json.dumps(script, ensure_ascii=False)
                project.status = "script_generated"
                project.updated_at = time.time()
                session.commit()
        except Exception as e:
            session.rollback()
            logger.error(f"Error updating script: {e}")
        finally:
            session.close()
    def update_project_status(self, project_id: str, status: str):
        """Set a project's lifecycle status (see Project.status values).

        Best-effort: DB errors are rolled back and logged, not raised.
        """
        session = self._get_session()
        try:
            project = session.query(Project).filter_by(id=project_id).first()
            if project:
                project.status = status
                project.updated_at = time.time()
                session.commit()
        except Exception as e:
            session.rollback()
            logger.error(f"Error updating status: {e}")
        finally:
            session.close()
def get_project(self, project_id: str) -> Optional[Dict[str, Any]]:
session = self._get_session()
try:
project = session.query(Project).filter_by(id=project_id).first()
if project:
data = {
"id": project.id,
"name": project.name,
"status": project.status,
"product_info": json.loads(project.product_info) if project.product_info else {},
"script_data": json.loads(project.script_data) if project.script_data else None,
"created_at": project.created_at,
"updated_at": project.updated_at
}
return data
return None
finally:
session.close()
def list_projects(self) -> List[Dict[str, Any]]:
session = self._get_session()
try:
projects = session.query(Project).order_by(Project.updated_at.desc()).all()
results = []
for p in projects:
results.append({
"id": p.id,
"name": p.name,
"status": p.status,
"updated_at": p.updated_at
})
return results
finally:
session.close()
# --- Asset/Task Operations ---
def save_asset(self, project_id: str, scene_id: int, asset_type: str,
               status: str, local_path: str = None, remote_url: str = None,
               task_id: str = None, metadata: Dict = None):
    """Insert or update (UPSERT) the asset record for one scene/type pair.

    The record is keyed by (project_id, scene_id, asset_type). All mutable
    fields are overwritten on update; ``metadata`` is JSON-encoded and
    falls back to "{}" when empty. DB errors are rolled back and logged.
    """
    session = self._get_session()
    try:
        meta_json = json.dumps(metadata, ensure_ascii=False) if metadata else "{}"
        now = time.time()
        row = session.query(SceneAsset).filter_by(
            project_id=project_id,
            scene_id=scene_id,
            asset_type=asset_type
        ).first()
        if row is None:
            session.add(SceneAsset(
                project_id=project_id,
                scene_id=scene_id,
                asset_type=asset_type,
                status=status,
                local_path=local_path,
                remote_url=remote_url,
                task_id=task_id,
                metadata_json=meta_json,
                created_at=now,
                updated_at=now,
            ))
        else:
            row.status = status
            row.local_path = local_path
            row.remote_url = remote_url
            row.task_id = task_id
            row.metadata_json = meta_json
            row.updated_at = now
        session.commit()
    except Exception as e:
        session.rollback()
        logger.error(f"Error saving asset: {e}")
    finally:
        session.close()
def get_assets(self, project_id: str, asset_type: str = None) -> List[Dict[str, Any]]:
    """List asset records for a project, optionally filtered by type.

    ``metadata_json`` is decoded into a dict (``{}`` when empty).
    """
    session = self._get_session()
    try:
        query = session.query(SceneAsset).filter_by(project_id=project_id)
        if asset_type:
            query = query.filter_by(asset_type=asset_type)
        return [
            {
                "id": a.id,
                "project_id": a.project_id,
                "scene_id": a.scene_id,
                "asset_type": a.asset_type,
                "status": a.status,
                "local_path": a.local_path,
                "remote_url": a.remote_url,
                "task_id": a.task_id,
                "metadata": json.loads(a.metadata_json) if a.metadata_json else {},
                "updated_at": a.updated_at,
            }
            for a in query.all()
        ]
    finally:
        session.close()
def get_asset(self, project_id: str, scene_id: int, asset_type: str) -> Optional[Dict[str, Any]]:
    """Fetch one asset record by (project, scene, type), or None."""
    session = self._get_session()
    try:
        row = session.query(SceneAsset).filter_by(
            project_id=project_id,
            scene_id=scene_id,
            asset_type=asset_type
        ).first()
        if row is None:
            return None
        return {
            "id": row.id,
            "project_id": row.project_id,
            "scene_id": row.scene_id,
            "asset_type": row.asset_type,
            "status": row.status,
            "local_path": row.local_path,
            "remote_url": row.remote_url,
            "task_id": row.task_id,
            "metadata": json.loads(row.metadata_json) if row.metadata_json else {},
            "updated_at": row.updated_at,
        }
    finally:
        session.close()
# --- Config/Prompt Operations ---
def get_config(self, key: str, default: Any = None) -> Any:
    """Fetch a config value by key.

    Values are stored JSON-encoded; a stored string that is not valid JSON
    (legacy plain-text values) is returned verbatim. Returns ``default``
    when the key does not exist.
    """
    session = self._get_session()
    try:
        cfg = session.query(AppConfig).filter_by(key=key).first()
        if cfg is None:
            return default
        try:
            return json.loads(cfg.value)
        # json.loads raises JSONDecodeError (a ValueError) on bad JSON and
        # TypeError on non-string input; the old bare `except:` also caught
        # KeyboardInterrupt/SystemExit, which must propagate.
        except (ValueError, TypeError):
            return cfg.value
    finally:
        session.close()
def set_config(self, key: str, value: Any, description: str = None):
    """Create or update a config entry; the value is stored JSON-encoded.

    On update the description is only overwritten when a new one is
    provided. DB errors are rolled back and logged rather than raised.
    """
    session = self._get_session()
    try:
        encoded = json.dumps(value, ensure_ascii=False)
        row = session.query(AppConfig).filter_by(key=key).first()
        if row is None:
            session.add(AppConfig(
                key=key,
                value=encoded,
                description=description,
                updated_at=time.time()
            ))
        else:
            row.value = encoded
            if description:
                row.description = description
            row.updated_at = time.time()
        session.commit()
    except Exception as e:
        session.rollback()
        logger.error(f"Error setting config: {e}")
    finally:
        session.close()
# Module-level singleton instance: import `db` from this module instead of
# constructing additional DBManager objects.
db = DBManager()

269
modules/editor.py Normal file
View File

@@ -0,0 +1,269 @@
"""
MatchMe Studio - Editor Module (Assembly + BGM)
"""
import logging
import requests
from pathlib import Path
from typing import Dict, Any, List, Optional
from moviepy.editor import (
VideoFileClip, AudioFileClip, TextClip,
CompositeVideoClip, CompositeAudioClip,
concatenate_videoclips
)
import config
from modules import storage
logger = logging.getLogger(__name__)
# ============================================================
# Video Assembly
# ============================================================
def download_video(url: str) -> str:
    """Download a remote media file into TEMP_DIR and return its local path.

    Despite the name this works for any file type (callers also use it for
    audio). The filename is derived from the URL's last path segment.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    filename = f"dl_{Path(url).name}"
    local_path = config.TEMP_DIR / filename
    # Stream to disk instead of buffering the whole body in memory, and fail
    # loudly on HTTP errors instead of silently writing an error page to disk.
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
    return str(local_path)
def concatenate_scenes(video_urls: List[str]) -> str:
    """Download scene clips and concatenate them into one 1080x1920 video.

    Args:
        video_urls: Ordered list of remote clip URLs.

    Returns:
        Local path of the merged mp4 in TEMP_DIR.
    """
    import time  # local import: this module does not import time at top level

    logger.info(f"Concatenating {len(video_urls)} clips...")
    clips = []
    for url in video_urls:
        local_path = download_video(url)
        clip = VideoFileClip(local_path)
        # Normalize everything to portrait 9:16 so concatenation is seamless.
        if clip.w != 1080 or clip.h != 1920:
            clip = clip.resize(newsize=(1080, 1920))
        clips.append(clip)
    final = concatenate_videoclips(clips, method="compose")
    # Was `__import__('time').time()` — a workaround for the missing import.
    output_path = config.TEMP_DIR / f"merged_{int(time.time())}.mp4"
    final.write_videofile(
        str(output_path),
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4,
        logger=None
    )
    # Release the file handles held by moviepy readers.
    for clip in clips:
        clip.close()
    final.close()
    return str(output_path)
# ============================================================
# Subtitle Burning
# ============================================================
def burn_subtitles(
    video_path: str,
    scenes: List[Dict[str, Any]]
) -> str:
    """Burn per-scene subtitle text onto a video.

    Each scene's "voiceover" text is rendered as a bottom-positioned caption
    for that scene's duration; scenes are assumed to play back-to-back in
    order. A failed TextClip render (e.g. missing ImageMagick/font) skips
    that caption with a warning instead of aborting.

    Args:
        video_path: Local path of the assembled video.
        scenes: Scene dicts carrying "voiceover" and "duration".

    Returns:
        Local path of the subtitled mp4.
    """
    import time  # local import: this module does not import time at top level

    logger.info("Burning subtitles...")
    clip = VideoFileClip(video_path)
    subtitle_clips = []
    current_time = 0
    for scene in scenes:
        voiceover = scene.get("voiceover", "")
        duration = scene.get("duration", 5)
        if voiceover:
            try:
                txt = TextClip(
                    voiceover,
                    fontsize=48,
                    color='white',
                    stroke_color='black',
                    stroke_width=2,
                    font='DejaVu-Sans',
                    method='caption',
                    size=(900, None)
                ).set_position(('center', 1600)).set_start(current_time).set_duration(duration)
                subtitle_clips.append(txt)
            except Exception as e:
                logger.warning(f"Subtitle error: {e}")
        # Advance the timeline even when the caption was empty or failed.
        current_time += duration
    if subtitle_clips:
        final = CompositeVideoClip([clip] + subtitle_clips)
    else:
        final = clip
    # Was `__import__('time').time()` — a workaround for the missing import.
    output_path = config.TEMP_DIR / f"subtitled_{int(time.time())}.mp4"
    final.write_videofile(
        str(output_path),
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4,
        logger=None
    )
    clip.close()
    # NOTE: when no subtitles were added, `final` is `clip`, so this is a
    # second close on the same reader.
    final.close()
    return str(output_path)
# ============================================================
# Voiceover Mixing
# ============================================================
def mix_voiceover(video_path: str, voiceover_url: str) -> str:
    """Mix a voiceover track into a video.

    Any original audio is kept at 30% volume under the narration, and a
    voiceover longer than the video is trimmed. With no ``voiceover_url``
    the input path is returned unchanged.

    Args:
        video_path: Local path of the video to dub.
        voiceover_url: Remote URL of the voiceover audio.

    Returns:
        Local path of the mixed mp4 (or ``video_path`` when skipped).
    """
    import time  # local import: this module does not import time at top level

    if not voiceover_url:
        return video_path
    logger.info("Mixing voiceover...")
    # Fetch the narration track (download_video handles any file type).
    vo_local = download_video(voiceover_url)
    video = VideoFileClip(video_path)
    voiceover = AudioFileClip(vo_local)
    # Trim voiceover if longer than video
    if voiceover.duration > video.duration:
        voiceover = voiceover.subclip(0, video.duration)
    # Duck the original audio under the narration.
    if video.audio:
        mixed = CompositeAudioClip([
            video.audio.volumex(0.3),  # Lower original
            voiceover.volumex(1.0)
        ])
    else:
        mixed = voiceover
    final = video.set_audio(mixed)
    # Was `__import__('time').time()` — a workaround for the missing import.
    output_path = config.TEMP_DIR / f"voiced_{int(time.time())}.mp4"
    final.write_videofile(
        str(output_path),
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4,
        logger=None
    )
    video.close()
    voiceover.close()
    final.close()
    return str(output_path)
# ============================================================
# BGM Mixing
# ============================================================
def mix_bgm(
    video_path: str,
    bgm_path: str,
    bgm_volume: float = 0.2
) -> str:
    """Mix a background-music track under a video's existing audio.

    The BGM is looped to cover the full video, trimmed to its length and
    attenuated to ``bgm_volume`` before mixing.

    Args:
        video_path: Local path of the video.
        bgm_path: Local path of the music file.
        bgm_volume: Relative BGM gain, default 0.2.

    Returns:
        Local path of the mixed mp4.
    """
    import time  # local import: this module does not import time at top level

    logger.info("Mixing BGM...")
    video = VideoFileClip(video_path)
    bgm = AudioFileClip(bgm_path)
    # Loop BGM if shorter than the video.
    # NOTE(review): AudioFileClip.loop availability differs across moviepy
    # versions (some need the audio_loop fx) — confirm against the pinned
    # moviepy release.
    if bgm.duration < video.duration:
        loops_needed = int(video.duration / bgm.duration) + 1
        bgm = bgm.loop(loops_needed)
    # Trim to video length and attenuate.
    bgm = bgm.subclip(0, video.duration).volumex(bgm_volume)
    # Layer the BGM under whatever audio the video already carries.
    if video.audio:
        mixed = CompositeAudioClip([video.audio, bgm])
    else:
        mixed = bgm
    final = video.set_audio(mixed)
    # Was `__import__('time').time()` — a workaround for the missing import.
    output_path = config.TEMP_DIR / f"bgm_{int(time.time())}.mp4"
    final.write_videofile(
        str(output_path),
        fps=30,
        codec="libx264",
        audio_codec="aac",
        threads=4,
        logger=None
    )
    video.close()
    bgm.close()
    final.close()
    return str(output_path)
# ============================================================
# Full Pipeline
# ============================================================
def assemble_final_video(
    video_urls: List[str],
    scenes: List[Dict[str, Any]],
    voiceover_url: str = "",
    bgm_url: str = ""
) -> str:
    """Run the full post-production pipeline and upload the result.

    Steps: concatenate scene clips -> burn subtitles -> mix voiceover
    (optional) -> mix BGM (optional) -> upload to R2.

    Args:
        video_urls: Ordered remote URLs of the scene clips.
        scenes: Scene dicts used for subtitles.
        voiceover_url: Optional narration audio URL.
        bgm_url: Optional background-music URL.

    Returns:
        Public URL of the uploaded final video.
    """
    logger.info("Starting full assembly...")
    # Each stage consumes the previous stage's local file.
    current = concatenate_scenes(video_urls)
    current = burn_subtitles(current, scenes)
    if voiceover_url:
        current = mix_voiceover(current, voiceover_url)
    if bgm_url:
        current = mix_bgm(current, download_video(bgm_url))
    final_url = storage.upload_file(current)
    logger.info(f"Final video uploaded: {final_url}")
    return final_url

157
modules/export_utils.py Normal file
View File

@@ -0,0 +1,157 @@
import os
import zipfile
import logging
import shutil
import math
from pathlib import Path
from typing import List, Dict, Any
import config
logger = logging.getLogger(__name__)
def format_timestamp(seconds: float) -> str:
    """Format a non-negative duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    whole = int(seconds)
    millis = int((seconds - whole) * 1000)
    minutes, secs = divmod(whole % 3600, 60)
    hours = int(seconds // 3600)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def generate_srt(script_data: Dict[str, Any], video_map: Dict[int, str]) -> str:
    """Build SRT subtitle content from the script's scene list.

    Scene timing comes from real clip durations in ``video_map``
    (scene_id -> local video path); scenes without a probeable clip are
    assumed to last 5 seconds. Scenes with an empty "subtitle" still
    advance the clock but emit no cue.

    Args:
        script_data: Script dict containing a "scenes" list.
        video_map: Mapping of scene id to local video path.

    Returns:
        The SRT document as a string (possibly empty).
    """
    scenes = script_data.get("scenes", [])
    srt_content = ""
    current_time = 0.0
    cue_index = 0  # SRT cues must be numbered sequentially from 1
    for scene in scenes:
        scene_id = scene["id"]
        duration = 5.0
        if scene_id in video_map and os.path.exists(video_map[scene_id]):
            # Deferred project import: only needed when a real clip exists.
            from modules import ffmpeg_utils
            try:
                info = ffmpeg_utils.get_video_info(video_map[scene_id])
                duration = info.get("duration", 5.0)
            except Exception as e:
                # Previously a silent bare `except: pass` — keep the 5s
                # fallback but record why the probe failed.
                logger.warning(f"Duration probe failed for scene {scene_id}: {e}")
        start_time = current_time
        end_time = current_time + duration
        current_time = end_time
        text = scene.get("subtitle", "")
        if text:
            # Counter instead of enumerate index: the old code skipped
            # numbers for subtitle-less scenes, producing gaps in the cue
            # sequence, which the SRT format does not allow.
            cue_index += 1
            srt_content += f"{cue_index}\n"
            srt_content += f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            srt_content += f"{text}\n\n"
    return srt_content
def create_capcut_package(project_id: str, script_data: Dict[str, Any], assets: Dict[str, str]) -> str:
    """
    Create a ZIP package for CapCut (JianYing) import.

    Package layout:
    - videos/        scene clips, prefixed with a sequence number
    - audios/        regenerated full voiceover track (best effort)
    - images/        fancy-text transparent PNGs (re-rendered)
    - subtitles.srt  subtitle track

    Args:
        project_id: Project identifier, used in temp/zip file names.
        script_data: Script dict with a "scenes" list.
        assets: Asset map; only "scene_videos" (scene_id -> local path)
            is consumed here.

    Returns:
        Path of the created zip file.
    """
    package_dir = config.TEMP_DIR / f"capcut_pkg_{project_id}_{int(os.getpid())}"
    if package_dir.exists():
        shutil.rmtree(package_dir)
    package_dir.mkdir()
    (package_dir / "videos").mkdir()
    (package_dir / "audios").mkdir()
    (package_dir / "images").mkdir()
    # 1. Generate the SRT from the script plus real clip durations.
    scene_videos = assets.get("scene_videos", {})
    srt_content = generate_srt(script_data, scene_videos)
    with open(package_dir / "subtitles.srt", "w", encoding="utf-8") as f:
        f.write(srt_content)
    # 2. Copy scene videos, prefixed with a sequence number for easy sorting
    #    inside CapCut (e.g. 01_scene_3.mp4).
    scenes = script_data.get("scenes", [])
    for i, scene in enumerate(scenes):
        sid = scene["id"]
        if sid in scene_videos and os.path.exists(scene_videos[sid]):
            ext = Path(scene_videos[sid]).suffix
            dest_name = f"{i+1:02d}_scene_{sid}{ext}"
            shutil.copy(scene_videos[sid], package_dir / "videos" / dest_name)
    # 3. Voiceover: the mixed narration is not kept as a standalone asset,
    #    so re-synthesize the full track here (best effort, default voice).
    from modules import factory
    full_vo_text = " ".join([s.get("voiceover", "") for s in scenes if s.get("voiceover")])
    if full_vo_text:
        try:
            voice_type = config.VOLC_TTS_DEFAULT_VOICE
            vo_path = factory.generate_voiceover_volcengine(full_vo_text, voice_type)
            shutil.copy(vo_path, package_dir / "audios" / "full_voiceover.mp3")
        except Exception as e:
            logger.warning(f"Failed to generate export voiceover: {e}")
    # BGM is a composer-level setting not tracked per project, so it is not
    # bundled; users drop in their own track inside CapCut.
    # 4. Fancy-text overlays: the composer renders them to temp, so they may
    #    be gone by export time — re-render with a plain default style.
    from modules.text_renderer import renderer
    for i, scene in enumerate(scenes):
        ft = scene.get("fancy_text")
        if ft:
            text = ft.get("text", "") if isinstance(ft, dict) else ""
            style = ft.get("style", "highlight") if isinstance(ft, dict) else "highlight"
            if text:
                try:
                    # Full style resolution lives in the composer; export
                    # intentionally uses a simple default render instead.
                    if isinstance(style, str):
                        pass
                    img_path = renderer.render(text, {"font_size": 60, "font_color": "#FFFFFF"}, cache=False)
                    shutil.copy(img_path, package_dir / "images" / f"{i+1:02d}_text_{scene['id']}.png")
                except:
                    pass
    # 5. Zip the staging directory, preserving relative paths.
    zip_path = config.TEMP_DIR / f"capcut_export_{project_id}.zip"
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(package_dir):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, package_dir)
                zipf.write(file_path, arcname)
    # Cleanup: only the zip is kept.
    shutil.rmtree(package_dir)
    return str(zip_path)

801
modules/factory.py Normal file
View File

@@ -0,0 +1,801 @@
"""
MatchMe Studio - Factory Module (Concurrent Scene Generation)
Using Volcengine (Doubao) API for Image and Video
"""
import os
import time
import logging
import requests
import json
import re
import base64
import subprocess
from pathlib import Path
from typing import Dict, Any, List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from elevenlabs import ElevenLabs, VoiceSettings
from openai import OpenAI
import config
from modules import storage
logger = logging.getLogger(__name__)
# Initialize OpenAI Client for Volcengine Image Generation
client = OpenAI(
api_key=config.VOLC_API_KEY,
base_url=config.VOLC_BASE_URL
)
# ============================================================
# Helper Functions
# ============================================================
def _download_as_base64(url: str) -> str:
    """Download an image and return it Base64-encoded.

    Returns an empty string on any failure (network error or bad HTTP
    status); callers treat "" as "no image data".
    """
    try:
        # Explicit timeout so a stalled CDN cannot hang the whole pipeline.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return base64.b64encode(response.content).decode('utf-8')
    except Exception as e:
        logger.error(f"Failed to download/encode image: {e}")
        return ""
# ============================================================
# Image Generation (Doubao / Volcengine)
# ============================================================
def generate_scene_image(
    scene: Dict[str, Any],
    brief: Dict[str, Any] = None,
    reference_images: List[str] = None
) -> str:
    """
    Generate a keyframe image for one scene via the Volcengine (Doubao) API.

    Uses raw `requests` (not the OpenAI client) to mirror the vendor's curl
    example exactly.

    Args:
        scene: Scene dict; `image_prompt` is used when present, otherwise a
            prompt is assembled from the `keyframe` fields.
        brief: Optional creative brief; `product_visual_description` is
            prepended to the prompt for cross-scene product consistency.
        reference_images: Accepted for interface symmetry but not used by
            this implementation.

    Returns:
        Public R2 URL of the uploaded image.

    Raises:
        ValueError: on a non-200 API response or missing image data.
    """
    # Build the generation prompt.
    image_prompt = scene.get("image_prompt", "")
    if not image_prompt:
        # Fallback prompt construction from structured keyframe fields.
        keyframe = scene.get("keyframe", {})
        # Stronger style-consistency intro.
        parts = ["Cinematic shot, 8k, photorealistic"]
        if brief:
            if brief.get("product_visual_description"):
                parts.append(f"Product: {brief['product_visual_description']}")
        parts.extend([
            f"Subject: {keyframe.get('subject', 'product')}",
            f"Environment: {keyframe.get('environment', 'studio')}",
            f"Action: {keyframe.get('focus', '')}"
        ])
        image_prompt = ", ".join(parts)
    # Prepend the product description when the prompt lacks it, to keep the
    # product's look consistent across scenes.
    if brief and brief.get("product_visual_description"):
        if brief['product_visual_description'] not in image_prompt:
            image_prompt = f"{brief['product_visual_description']}, {image_prompt}"
    logger.info(f"Generating image (Volcengine): {image_prompt[:50]}...")
    url = f"{config.VOLC_BASE_URL}/images/generations"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }
    # Payload mirrors the vendor's curl example.
    payload = {
        "model": config.IMAGE_MODEL_ID,
        "prompt": image_prompt,
        "sequential_image_generation": "disabled",
        "response_format": "b64_json",  # base64 avoids temp-URL expiration issues
        "size": "2K",
        "stream": False,
        "watermark": True
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code != 200:
            logger.error(f"Image API Error: {response.text}")
            raise ValueError(f"Image API failed: {response.status_code} - {response.text}")
        data = response.json()
        # Extract the image payload: prefer inline base64, fall back to URL.
        image_data = None
        if "data" in data and len(data["data"]) > 0:
            image_data = data["data"][0].get("b64_json")
            if not image_data:
                # Fallback: download the returned URL so we hold the bytes
                # locally before the temp URL expires.
                img_url = data["data"][0].get("url")
                if img_url:
                    image_data = _download_as_base64(img_url)
        if not image_data:
            raise ValueError("No image data returned")
        # Decode and save locally before uploading.
        filename = f"scene_{scene.get('id', 0)}_{int(time.time())}.jpg"
        local_path = config.TEMP_DIR / filename
        with open(local_path, "wb") as f:
            f.write(base64.b64decode(image_data))
        # Upload to R2 for stable hosting.
        r2_url = storage.upload_file(str(local_path))
        logger.info(f"Scene {scene.get('id', '?')} image uploaded: {r2_url}")
        return r2_url
    except Exception as e:
        logger.error(f"Image Generation Failed: {e}")
        raise
def generate_all_scene_images_concurrent(
    scenes: List[Dict[str, Any]],
    brief: Dict[str, Any] = None,
    reference_images: List[str] = None,
    max_workers: int = 3
) -> List[str]:
    """Generate one image per scene in parallel.

    Failures are logged per scene rather than aborting the batch.

    Returns:
        A list aligned with ``scenes``; entries for failed scenes stay None.
    """
    logger.info(f"Generating {len(scenes)} images concurrently...")
    results = [None] * len(scenes)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {
            pool.submit(generate_scene_image, scene, brief, reference_images): idx
            for idx, scene in enumerate(scenes)
        }
        for fut in as_completed(pending):
            idx = pending[fut]
            try:
                results[idx] = fut.result()
            except Exception as e:
                logger.error(f"Scene {idx+1} failed: {e}")
    return results
# ============================================================
# Video Generation (Doubao Video / PixelDance)
# ============================================================
def generate_scene_video(
    start_frame_url: str,
    motion_prompt: str,
    duration: int = 5
) -> str:
    """
    Generate a video clip via the Volcengine async task API.

    Creates a generation task, polls its status every 5 seconds (up to
    ~5 minutes), then downloads the finished clip and re-uploads it to R2.

    Args:
        start_frame_url: Optional image URL used as the clip's first frame.
        motion_prompt: Text prompt describing motion/content.
        duration: Clip length in seconds (passed as an inline prompt flag).

    Returns:
        Public R2 URL of the generated clip.

    Raises:
        ValueError: on task-creation failure, generation failure, or a
            failed download of the result.
        TimeoutError: when polling exhausts its retries without a URL.
    """
    logger.info(f"Generating video (Volcengine): {motion_prompt[:50]}...")
    # 1. Create the async generation task.
    create_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }
    # Content list: text prompt with inline generation flags + optional image.
    content_list = [
        {
            "type": "text",
            "text": f"{motion_prompt} --resolution 1080p --duration {duration} --camerafixed false --watermark true"
        }
    ]
    if start_frame_url:
        content_list.append({
            "type": "image_url",
            "image_url": {"url": start_frame_url}
        })
    payload = {
        "model": config.VIDEO_MODEL_ID,
        "content": content_list
    }
    try:
        response = requests.post(create_url, headers=headers, json=payload, timeout=30)
        if response.status_code != 200:
            # 202 Accepted is also a valid response for async task creation.
            if response.status_code != 202:
                logger.error(f"Video Task Creation Error: {response.text}")
                raise ValueError(f"Video Task failed: {response.status_code} - {response.text}")
        data = response.json()
        task_id = data.get("id")
        if not task_id:
            # Some response variants nest the id under "data".
            task_id = data.get("data", {}).get("id")
        if not task_id:
            raise ValueError(f"No Task ID returned: {data}")
        logger.info(f"Video Task Created: {task_id}. Polling for result...")
        # 2. Poll GET /contents/generations/tasks/{id} until a terminal state.
        max_retries = 60  # 5 mins max (5s interval)
        video_url = None
        for _ in range(max_retries):
            time.sleep(5)
            status_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks/{task_id}"
            resp = requests.get(status_url, headers=headers, timeout=30)
            if resp.status_code == 200:
                res_data = resp.json()
                # Status may live at the top level or under "data".
                status = res_data.get("status")
                if not status and "data" in res_data:
                    status = res_data["data"].get("status")
                if status == "succeeded" or status == "SUCCEEDED":
                    # Extract the clip URL from the content list, which may
                    # also be at the top level or under "data".
                    content = res_data.get("data", {}).get("content", [])
                    if not content and "content" in res_data:
                        content = res_data["content"]
                    # Items carry either "video_url" or a plain "url".
                    for item in content:
                        if item.get("video_url"):
                            video_url = item["video_url"]
                            break
                        if item.get("url"):  # sometimes just url
                            video_url = item["url"]
                            break
                    if video_url:
                        break
                elif status == "failed" or status == "FAILED":
                    reason = res_data.get("data", {}).get("error", "Unknown error")
                    raise ValueError(f"Video Generation Failed: {reason}")
                # Running/queued: keep waiting.
        if not video_url:
            raise TimeoutError("Video generation timed out or failed to return URL.")
        # 3. Download the result and re-upload to R2 for stable hosting.
        logger.info(f"Video Generated. Downloading: {video_url}")
        filename = f"vid_doubao_{int(time.time())}.mp4"
        local_path = config.TEMP_DIR / filename
        resp = requests.get(video_url, stream=True)
        if resp.status_code != 200:
            raise ValueError(f"Failed to download generated video: {resp.status_code}")
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
        r2_url = storage.upload_file(str(local_path))
        return r2_url
    except Exception as e:
        logger.error(f"Video Generation Error: {e}")
        raise
def generate_all_scene_videos_concurrent(
    scenes: List[Dict[str, Any]],
    image_urls: List[str],
    max_workers: int = 2
) -> List[str]:
    """Animate each scene's keyframe image into a video clip, in parallel.

    The motion prompt is the scene's "camera_movement" (default
    "slow zoom"), prefixed with the scene's "image_prompt" when present.
    Failures are logged per scene rather than aborting the batch.

    Returns:
        A list aligned with ``scenes``; entries for failed scenes stay None.
    """
    logger.info(f"Generating {len(scenes)} videos concurrently...")
    results = [None] * len(scenes)

    def _run(idx: int) -> str:
        # One worker unit: build the motion prompt and generate the clip.
        scene = scenes[idx]
        motion = scene.get("camera_movement", "slow zoom")
        if scene.get("image_prompt"):
            motion = f"{scene['image_prompt']}. {motion}"
        return generate_scene_video(image_urls[idx], motion, scene.get("duration", 5))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(_run, i): i for i in range(len(scenes))}
        for fut in as_completed(pending):
            idx = pending[fut]
            try:
                results[idx] = fut.result()
            except Exception as e:
                logger.error(f"Scene {idx+1} video failed: {e}")
    return results
# ============================================================
# Audio Generation (ElevenLabs)
# ============================================================
def generate_voiceover(text: str, style: str = "") -> str:
    """Generate voiceover audio via ElevenLabs and upload it to R2.

    Args:
        text: Narration text; empty/whitespace input is a no-op.
        style: Free-form style hint; an "ASMR" substring selects softer
            voice settings (lower stability, higher similarity boost).

    Returns:
        Public R2 URL of the mp3, or "" on failure (errors are logged,
        never raised).
    """
    if not text or not text.strip():
        return ""
    # ASMR gets a breathier, more consistent delivery.
    stability = 0.3 if "ASMR" in style else 0.5
    similarity = 0.9 if "ASMR" in style else 0.8
    logger.info(f"Generating voiceover ({len(text)} chars, style={style})...")
    try:
        el_client = ElevenLabs(api_key=config.XI_KEY)
        audio_stream = el_client.text_to_speech.convert(
            voice_id=config.ELEVENLABS_VOICE_ID,
            text=text,
            model_id=config.ELEVENLABS_MODEL,
            voice_settings=VoiceSettings(stability=stability, similarity_boost=similarity)
        )
        filename = f"vo_{int(time.time())}.mp3"
        local_path = config.TEMP_DIR / filename
        # The SDK returns a byte-chunk iterator; stream it straight to disk.
        with open(local_path, "wb") as f:
            for chunk in audio_stream:
                f.write(chunk)
        r2_url = storage.upload_file(str(local_path))
        return r2_url
    except Exception as e:
        logger.error(f"Voiceover failed: {e}")
        return ""
def generate_full_voiceover(scenes: List[Dict[str, Any]], style: str = "") -> str:
    """Join all scene voiceover lines and synthesize them as one track.

    Lines that are empty, whitespace-only, or start with "(" (stage
    directions) are skipped. Returns "" when nothing remains to speak.
    """
    spoken = [
        line.strip()
        for line in (s.get("voiceover", "") for s in scenes)
        if line and line.strip() and not line.startswith("(")
    ]
    if not spoken:
        return ""
    return generate_voiceover(" ".join(spoken), style)
# ============================================================
# Audio Generation (Edge TTS - free Chinese speech synthesis)
# ============================================================
# Edge TTS Chinese voice presets (free, good quality).
EDGE_TTS_VOICES = {
    # Female voices
    "sweet_female": "zh-CN-XiaoxiaoNeural",      # Xiaoxiao - sweet & lively (recommended)
    "gentle_female": "zh-CN-XiaoyiNeural",       # Xiaoyi - gentle & refined
    "lively_female": "zh-CN-XiaochenNeural",     # Xiaochen - lively & cute
    "broadcast_female": "zh-CN-XiaoqiuNeural",   # Xiaoqiu - news broadcast
    # Male voices
    "general_male": "zh-CN-YunxiNeural",         # Yunxi - warm male voice
    "broadcast_male": "zh-CN-YunjianNeural",     # Yunjian - professional broadcast
}
# Volcengine TTS voice presets (service must be enabled) - chosen for a
# Douyin live-commerce-friendly tone.
VOLC_TTS_VOICES = {
    # Douyin-friendly female voices
    "sweet_female": "zh_female_vv_uranus_bigtts",            # viv 2.0 general female (sweet)
    "lively_female": "zh_female_jitangnv_saturn_bigtts",     # "Jitangnv" (energetic)
    "broadcast_female": "zh_male_ruyaichen_saturn_bigtts",   # "Ruyaichen" (news broadcast) - swap to zh_female_meilinyou_saturn_bigtts if a female broadcaster is needed
    "meilinvyou": "zh_female_meilinvyou_saturn_bigtts",
    # Male voices
    "general_male": "zh_male_dayi_saturn_bigtts",            # "Dayi" (steady male voice)
}
def generate_voiceover_edge(
    text: str,
    voice_type: str = "sweet_female",
    rate: str = "+0%",
    volume: str = "+0%",
    output_path: str = None
) -> str:
    """
    Generate a Chinese voiceover with Edge TTS (free, good quality).

    Args:
        text: Narration text.
        voice_type: Preset key (see EDGE_TTS_VOICES) or a raw voice name.
        rate: Speech-rate adjustment, e.g. "+10%", "-20%".
        volume: Volume adjustment, e.g. "+10%", "-20%".
        output_path: Optional output file path (auto-generated when omitted).

    Returns:
        Path of the generated mp3, or "" after all retries fail.
    """
    import asyncio
    import edge_tts
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""
    # Resolve the preset key to a concrete voice name; unknown keys are
    # passed through as literal voice names.
    voice = EDGE_TTS_VOICES.get(voice_type, voice_type)
    logger.info(f"Generating voiceover (Edge TTS): {len(text)} chars, voice={voice}")
    if not output_path:
        filename = f"vo_edge_{int(time.time())}.mp3"
        output_path = str(config.TEMP_DIR / filename)
    async def _generate():
        communicate = edge_tts.Communicate(text, voice, rate=rate, volume=volume)
        await communicate.save(output_path)
    # Edge TTS goes over the network and can be flaky; retry a few times.
    max_retries = 3
    for i in range(max_retries):
        try:
            asyncio.run(_generate())
            # Guard against zero-byte files from truncated downloads.
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                logger.info(f"Edge TTS voiceover generated: {output_path}")
                return output_path
        except Exception as e:
            logger.warning(f"Edge TTS attempt {i+1} failed: {e}")
            time.sleep(1.0)  # wait before retry
    logger.error("Edge TTS failed after retries.")
    return ""
def generate_voiceover_volcengine_ws(
    text: str,
    voice_type: str = "sweet_female",
    output_path: str = None,
    timeout: int = 120
) -> str:
    """
    Generate TTS audio via the Volcengine WebSocket binary demo.

    Shells out to the official demo script. The demo's location was
    hard-coded to one developer's machine; it is now overridable through
    ``config.VOLC_WS_DEMO_DIR`` (the original path remains the default, so
    existing setups keep working). Returns "" on any failure so callers can
    fall back to the HTTP / Edge TTS paths.

    Args:
        text: Narration text.
        voice_type: Preset key (see VOLC_TTS_VOICES) or a raw voice ID.
        output_path: Optional destination path for the mp3.
        timeout: Subprocess timeout in seconds.

    Returns:
        Path of the generated mp3, or "" on failure.
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS (ws)")
        return ""
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)
    # Demo location is machine-specific; allow overriding via config instead
    # of baking in a single developer's volume path.
    demo_dir = Path(getattr(config, "VOLC_WS_DEMO_DIR",
                            "/Volumes/Tony/video-flow/volcengine_binary_demo"))
    venv_python = demo_dir / ".venv" / "bin" / "python"
    demo_script = demo_dir / "examples" / "volcengine" / "binary.py"
    if not venv_python.exists() or not demo_script.exists():
        logger.error("Volcengine WS demo or venv not found. Please install under volcengine_binary_demo/.venv")
        return ""
    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_ws_{int(time.time())}.mp3")
    cmd = [
        str(venv_python),
        str(demo_script),
        "--appid", config.VOLC_TTS_APPID,
        "--access_token", config.VOLC_TTS_ACCESS_TOKEN,
        "--voice_type", voice_id,
        "--text", text,
        "--encoding", "mp3",
    ]
    logger.info(f"Calling Volcengine WS TTS: voice={voice_id}, len={len(text)}")
    try:
        result = subprocess.run(
            cmd,
            cwd=str(demo_dir),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if result.returncode != 0:
            logger.error(f"Volc WS TTS failed: {result.stderr}")
            return ""
        # The demo writes its output next to itself as <voice_type>.mp3.
        demo_out = demo_dir / f"{voice_id}.mp3"
        if not demo_out.exists():
            logger.error("Volc WS TTS output not found")
            return ""
        Path(output_path).write_bytes(demo_out.read_bytes())
        logger.info(f"Volc WS TTS saved to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volc WS TTS error: {e}")
        return ""
def generate_voiceover_volcengine(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    volume_ratio: float = 1.0,
    pitch_ratio: float = 1.0,
    output_path: str = None
) -> str:
    """
    Generate a Chinese voiceover with Volcengine TTS.

    Tries the WebSocket binary demo first; on failure falls back to the
    Volcengine HTTP API, and finally to Edge TTS.

    Args:
        text: Narration text.
        voice_type: Preset key (see VOLC_TTS_VOICES) or a raw voice ID.
        speed_ratio: Speech rate, 0.5-2.0, default 1.0.
        volume_ratio: Volume, 0.5-2.0, default 1.0.
        pitch_ratio: Pitch, 0.5-2.0, default 1.0.
        output_path: Optional output path (auto-generated when omitted).

    Returns:
        Path of the generated audio file, or "" when every backend fails.
    """
    import uuid
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""
    # Resolve the preset key to a concrete voice ID (Volcengine table, with
    # unknown keys passed through as custom IDs).
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)
    logger.info(f"Generating voiceover (Volcengine TTS): {len(text)} chars, voice={voice_id}")
    # First try the WebSocket binary path (verified working via the demo).
    ws_path = generate_voiceover_volcengine_ws(text, voice_type, output_path)
    if ws_path:
        return ws_path
    # WS failed - fall back to the HTTP endpoint.
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        # Volcengine expects the literal "Bearer;<token>" format here.
        "Authorization": f"Bearer;{config.VOLC_TTS_ACCESS_TOKEN}"
    }
    payload = {
        "app": {
            "appid": config.VOLC_TTS_APPID,
            "token": config.VOLC_TTS_ACCESS_TOKEN,
            "cluster": "volcano_tts"
        },
        "user": {
            "uid": "video_flow_user"
        },
        "audio": {
            "voice_type": voice_id,
            "encoding": "mp3",
            "speed_ratio": speed_ratio,
            "volume_ratio": volume_ratio,
            "pitch_ratio": pitch_ratio
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "text_type": "plain",
            "operation": "query",
            "with_timestamp": "1",
            "extra_param": json.dumps({
                "disable_markdown_filter": False
            })
        }
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code != 200:
            logger.error(f"Volcengine TTS Error: {response.status_code} - {response.text}")
            # Fall back to Edge TTS with a safe default voice.
            fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
            return generate_voiceover_edge(text, fallback_voice, output_path=output_path)
        data = response.json()
        ret_code = data.get("code")
        # 0 / 3000 / 20000000 are all treated as success codes here.
        if ret_code not in (0, 3000, 20000000):
            error_msg = data.get("message", "Unknown error")
            logger.error(f"Volcengine TTS Error: {error_msg}")
            # Fall back to Edge TTS with a safe default voice.
            fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
            return generate_voiceover_edge(text, fallback_voice, output_path=output_path)
        # Audio is returned base64-encoded in the "data" field.
        audio_data = data.get("data", "")
        if not audio_data:
            raise ValueError("No audio data returned")
        if not output_path:
            filename = f"vo_volc_{int(time.time())}.mp3"
            output_path = str(config.TEMP_DIR / filename)
        with open(output_path, "wb") as f:
            f.write(base64.b64decode(audio_data))
        logger.info(f"Voiceover generated (HTTP): {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volcengine TTS HTTP error: {e}")
        # Fall back to Edge TTS with a safe default voice.
        fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
        return generate_voiceover_edge(text, fallback_voice, output_path=output_path)
def generate_voiceover_volcengine_long(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    output_path: str = None,
    max_chunk_length: int = 300
) -> str:
    """
    Volcengine TTS for long text (automatic chunked synthesis).

    Text longer than ``max_chunk_length`` is split at sentence boundaries,
    synthesized chunk by chunk, then losslessly concatenated with ffmpeg.

    Args:
        text: Narration text.
        voice_type: Preset key (see VOLC_TTS_VOICES) or a raw voice ID.
        speed_ratio: Speech rate, 0.5-2.0.
        output_path: Optional destination path.
        max_chunk_length: Maximum characters per single TTS request.

    Returns:
        Path of the final (possibly merged) audio file.

    Raises:
        ValueError: when every chunk fails to synthesize.
    """
    # Short text: a single request suffices.
    if len(text) <= max_chunk_length:
        return generate_voiceover_volcengine(
            text=text,
            voice_type=voice_type,
            speed_ratio=speed_ratio,
            output_path=output_path
        )
    logger.info(f"Long text ({len(text)} chars), splitting into chunks...")
    # Split on sentence-ending punctuation (CJK and ASCII); the capture
    # group keeps each delimiter so it can be re-attached to its sentence.
    import re
    sentences = re.split(r'([。!?;.!?;])', text)
    chunks = []
    current_chunk = ""
    for i in range(0, len(sentences) - 1, 2):
        # Re-join each sentence with its trailing delimiter.
        sentence = sentences[i] + (sentences[i + 1] if i + 1 < len(sentences) else "")
        if len(current_chunk) + len(sentence) <= max_chunk_length:
            current_chunk += sentence
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    # Trailing text after the last delimiter (odd-length split result).
    if len(sentences) % 2 == 1 and sentences[-1]:
        if chunks:
            chunks[-1] += sentences[-1]
        else:
            chunks.append(sentences[-1])
    logger.info(f"Split into {len(chunks)} chunks")
    # Synthesize each chunk; failures are skipped so one bad chunk does not
    # sink the whole narration.
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_path = str(config.TEMP_DIR / f"vo_chunk_{i}_{int(time.time())}.mp3")
        try:
            path = generate_voiceover_volcengine(
                text=chunk,
                voice_type=voice_type,
                speed_ratio=speed_ratio,
                output_path=chunk_path
            )
            chunk_files.append(path)
        except Exception as e:
            logger.error(f"Chunk {i} failed: {e}")
            # Keep processing the remaining chunks.
    if not chunk_files:
        raise ValueError("All TTS chunks failed")
    # Single surviving chunk: no merge needed, just move/return it.
    if len(chunk_files) == 1:
        if output_path:
            import shutil
            shutil.move(chunk_files[0], output_path)
            return output_path
        return chunk_files[0]
    # Build the ffmpeg concat-demuxer list file.
    concat_list = config.TEMP_DIR / f"concat_audio_{os.getpid()}.txt"
    with open(concat_list, "w") as f:
        for cf in chunk_files:
            f.write(f"file '{cf}'\n")
    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_merged_{int(time.time())}.mp3")
    # Lossless concat via the ffmpeg concat demuxer (-c copy).
    import subprocess
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat",
        "-safe", "0",
        "-i", str(concat_list),
        "-c", "copy",
        output_path
    ]
    subprocess.run(cmd, capture_output=True, check=True)
    # Best-effort cleanup of per-chunk temp files.
    for cf in chunk_files:
        try:
            os.remove(cf)
        except:
            pass
    concat_list.unlink(missing_ok=True)
    logger.info(f"Merged voiceover: {output_path}")
    return output_path
def generate_scene_voiceovers_volcengine(
    scenes: List[Dict[str, Any]],
    voice_type: str = "sweet_female",
    output_dir: str = None
) -> List[str]:
    """
    Generate a separate narration audio file for each scene.

    Args:
        scenes: Scene list; each scene may carry a ``voiceover`` text field.
        voice_type: Voice preset for the TTS backend.
        output_dir: Output directory (defaults to ``config.TEMP_DIR``).

    Returns:
        One audio path per scene; empty string for scenes without narration
        or whose synthesis failed.
    """
    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True)
    else:
        output_dir = config.TEMP_DIR
    audio_paths = []
    for i, scene in enumerate(scenes):
        vo_text = scene.get("voiceover", "")
        # Skip empty narration and stage directions; annotations may open
        # with either an ASCII "(" or a full-width "（" (the old check only
        # caught the ASCII form, so Chinese annotations were synthesized).
        if not vo_text or not vo_text.strip() or vo_text.startswith(("(", "（")):
            audio_paths.append("")
            continue
        try:
            output_path = str(output_dir / f"scene_{i+1}_vo.mp3")
            path = generate_voiceover_volcengine(
                text=vo_text.strip(),
                voice_type=voice_type,
                output_path=output_path
            )
            audio_paths.append(path)
        except Exception as e:
            logger.error(f"Scene {i+1} voiceover failed: {e}")
            audio_paths.append("")
    return audio_paths

708
modules/fancy_text.py Normal file
View File

@@ -0,0 +1,708 @@
"""
抖音风格花字生成模块
使用 Pillow 生成透明 PNG 图片,支持描边、渐变、气泡框等效果
"""
import os
import hashlib
import logging
from pathlib import Path
from typing import Dict, Any, Tuple, List, Optional
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import config
logger = logging.getLogger(__name__)
# Cache directory for rendered fancy-text PNGs (created eagerly at import).
FANCY_TEXT_CACHE_DIR = config.TEMP_DIR / "fancy_text_cache"
FANCY_TEXT_CACHE_DIR.mkdir(exist_ok=True)
def _get_font(font_name: str = None, size: int = 48) -> ImageFont.FreeTypeFont:
    """Resolve a usable font, trying each candidate path in turn.

    An existing, explicitly supplied ``font_name`` wins; otherwise the
    project-bundled fonts are tried first, then common system fonts.
    Missing or suspiciously small project font files (likely un-pulled LFS
    pointers) are skipped; the PIL default font is the last resort.
    """
    candidates = []
    if font_name and os.path.exists(font_name):
        candidates.append(font_name)
    else:
        candidates.extend([
            config.FONTS_DIR / "AlibabaPuHuiTi-Bold.ttf",
            config.FONTS_DIR / "AlibabaPuHuiTi-Regular.ttf",
            config.FONTS_DIR / "NotoSansSC-Bold.otf",
            config.FONTS_DIR / "NotoSansSC-Regular.otf",
        ])
    candidates.extend([
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/STHeiti Medium.ttc",
        "/Library/Fonts/Arial Unicode.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
        "C:/Windows/Fonts/msyh.ttc",
        "C:/Windows/Fonts/simhei.ttf",
    ])
    for candidate in candidates:
        if not candidate:
            continue
        candidate_str = str(candidate)
        if not os.path.exists(candidate_str):
            continue
        # Project fonts (Path objects) under 10 KB are treated as invalid.
        if isinstance(candidate, Path) and candidate.stat().st_size < 10000:
            continue
        try:
            return ImageFont.truetype(candidate_str, size)
        except Exception as exc:
            logger.warning(f"Failed to load font {candidate_str}: {exc}")
            continue
    logger.warning("No suitable font found, using default")
    return ImageFont.load_default()
def _hex_to_rgb(hex_color: str) -> Tuple[int, int, int]:
"""十六进制颜色转 RGB"""
hex_color = hex_color.lstrip("#")
return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
def _get_text_size(text: str, font: ImageFont.FreeTypeFont) -> Tuple[int, int]:
    """Measure rendered text, returning (width, height) in pixels."""
    # A 1x1 scratch image suffices: textbbox only needs a Draw context.
    scratch = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
    left, top, right, bottom = scratch.textbbox((0, 0), text, font=font)
    return right - left, bottom - top
def _cache_key(text: str, style: Dict) -> str:
"""生成缓存键"""
content = f"{text}_{str(sorted(style.items()))}"
return hashlib.md5(content.encode()).hexdigest()
def create_text_with_stroke(
    text: str,
    font_size: int = 60,
    font_color: str = "#FFFFFF",
    stroke_color: str = "#000000",
    stroke_width: int = 4,
    font_name: str = None,
    padding: int = 20
) -> Image.Image:
    """
    Render text with an outline onto a transparent image.

    Args:
        text: Text content.
        font_size: Font size in points.
        font_color: Fill color (hex).
        stroke_color: Outline color (hex).
        stroke_width: Outline thickness in pixels.
        font_name: Optional font file path.
        padding: Inner padding in pixels.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    text_w, text_h = _get_text_size(text, font)
    # Canvas leaves room for the outline plus padding on every side.
    canvas_w = text_w + stroke_width * 2 + padding * 2
    canvas_h = text_h + stroke_width * 2 + padding * 2
    canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(canvas)
    origin_x = padding + stroke_width
    origin_y = padding + stroke_width
    # Outline: stamp the text at every offset inside a disc of radius
    # stroke_width around the origin.
    outline_fill = _hex_to_rgb(stroke_color) + (255,)
    for off_x in range(-stroke_width, stroke_width + 1):
        for off_y in range(-stroke_width, stroke_width + 1):
            if off_x * off_x + off_y * off_y <= stroke_width * stroke_width:
                draw.text((origin_x + off_x, origin_y + off_y), text, font=font, fill=outline_fill)
    # Main fill drawn on top of the outline.
    draw.text((origin_x, origin_y), text, font=font, fill=_hex_to_rgb(font_color) + (255,))
    return canvas
def create_text_with_shadow(
    text: str,
    font_size: int = 60,
    font_color: str = "#FFFFFF",
    shadow_color: str = "#000000",
    shadow_offset: Tuple[int, int] = (4, 4),
    shadow_blur: int = 5,
    font_name: str = None,
    padding: int = 30,
    stroke_color: str = None,
    stroke_width: int = 0
) -> Image.Image:
    """
    Render text with a blurred drop shadow and an optional outline
    (the outline enables a two-layer "safe stroke" look).

    Args:
        text: Text content.
        font_size: Font size in points.
        font_color: Fill color (hex).
        shadow_color: Shadow color (hex), drawn at alpha 180.
        shadow_offset: (dx, dy) displacement of the shadow in pixels.
        shadow_blur: Gaussian blur radius applied to the shadow layer.
        font_name: Optional font file path.
        padding: Inner padding in pixels.
        stroke_color: Optional outline color (hex); the outline is drawn
            only when both stroke_color and stroke_width are set.
        stroke_width: Outline thickness in pixels.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    text_w, text_h = _get_text_size(text, font)
    # Reserve room for whichever is larger: the blur halo or the outline.
    extra = max(shadow_blur, stroke_width * 2)
    img_w = text_w + abs(shadow_offset[0]) + extra * 2 + padding * 2
    img_h = text_h + abs(shadow_offset[1]) + extra * 2 + padding * 2
    shadow_img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    shadow_draw = ImageDraw.Draw(shadow_img)
    x = padding + extra
    y = padding + extra
    # Shadow pass: offset text at partial alpha, then blur the whole layer.
    shadow_rgb = _hex_to_rgb(shadow_color) + (180,)
    shadow_draw.text((x + shadow_offset[0], y + shadow_offset[1]), text, font=font, fill=shadow_rgb)
    shadow_img = shadow_img.filter(ImageFilter.GaussianBlur(shadow_blur))
    draw = ImageDraw.Draw(shadow_img)
    # Optional outline pass, drawn over the blurred shadow.
    if stroke_color and stroke_width > 0:
        stroke_rgb = _hex_to_rgb(stroke_color) + (255,)
        for dx in range(-stroke_width, stroke_width + 1):
            for dy in range(-stroke_width, stroke_width + 1):
                if dx * dx + dy * dy <= stroke_width * stroke_width:
                    draw.text((x + dx, y + dy), text, font=font, fill=stroke_rgb)
    # Main text pass on top.
    font_rgb = _hex_to_rgb(font_color) + (255,)
    draw.text((x, y), text, font=font, fill=font_rgb)
    return shadow_img
def create_text_with_gradient(
    text: str,
    font_size: int = 60,
    gradient_colors: List[str] = None,
    gradient_direction: str = "vertical",  # vertical, horizontal
    stroke_color: str = "#000000",
    stroke_width: int = 3,
    font_name: str = None,
    padding: int = 20
) -> Image.Image:
    """
    Render gradient-filled text with an outline.

    Args:
        text: Text content.
        font_size: Font size in points.
        gradient_colors: Gradient stops as hex colors, e.g.
            ["#FF6B6B", "#FFE66D"]; two stops interpolate linearly, three
            stops split at the midpoint.
        gradient_direction: "vertical" or "horizontal".
        stroke_color: Outline color (hex).
        stroke_width: Outline thickness in pixels.
        font_name: Optional font file path.
        padding: Inner padding in pixels.

    Returns:
        RGBA image with a transparent background.
    """
    if not gradient_colors:
        gradient_colors = ["#FF6B6B", "#FFE66D"]  # default red-to-yellow
    font = _get_font(font_name, font_size)
    text_w, text_h = _get_text_size(text, font)
    img_w = text_w + stroke_width * 2 + padding * 2
    img_h = text_h + stroke_width * 2 + padding * 2
    # Build the full-canvas gradient layer one scanline at a time.
    gradient = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    gradient_draw = ImageDraw.Draw(gradient)
    colors = [_hex_to_rgb(c) for c in gradient_colors]
    for i in range(img_h if gradient_direction == "vertical" else img_w):
        ratio = i / (img_h if gradient_direction == "vertical" else img_w)
        # Piecewise-linear interpolation: first half between stop 0 and 1,
        # second half between stop 1 and 2 (held at stop 1 when absent).
        if ratio < 0.5:
            r = ratio * 2
            c1, c2 = colors[0], colors[min(1, len(colors) - 1)]
        else:
            r = (ratio - 0.5) * 2
            c1 = colors[min(1, len(colors) - 1)]
            c2 = colors[min(2, len(colors) - 1)] if len(colors) > 2 else c1
        color = tuple(int(c1[j] + (c2[j] - c1[j]) * r) for j in range(3)) + (255,)
        if gradient_direction == "vertical":
            gradient_draw.line([(0, i), (img_w, i)], fill=color)
        else:
            gradient_draw.line([(i, 0), (i, img_h)], fill=color)
    # NOTE: the old version also built an unused grayscale "mask" layer
    # here; that dead code has been removed (the output is unchanged).
    x = padding + stroke_width
    y = padding + stroke_width
    result = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    # Outline layer: stamp the text at every offset within the stroke disc.
    stroke_img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    stroke_draw = ImageDraw.Draw(stroke_img)
    stroke_rgb = _hex_to_rgb(stroke_color) + (255,)
    for dx in range(-stroke_width, stroke_width + 1):
        for dy in range(-stroke_width, stroke_width + 1):
            if dx * dx + dy * dy <= stroke_width * stroke_width:
                stroke_draw.text((x + dx, y + dy), text, font=font, fill=stroke_rgb)
    result = Image.alpha_composite(result, stroke_img)
    # Clip the gradient layer to the glyph shapes and composite on top.
    text_mask = Image.new("L", (img_w, img_h), 0)
    ImageDraw.Draw(text_mask).text((x, y), text, font=font, fill=255)
    gradient_text = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    gradient_text.paste(gradient, mask=text_mask)
    result = Image.alpha_composite(result, gradient_text)
    return result
def create_bubble_text(
    text: str,
    font_size: int = 48,
    font_color: str = "#333333",
    bg_color: str = "#FFFFFF",
    border_color: str = "#CCCCCC",
    border_width: int = 2,
    corner_radius: int = 20,
    padding: Tuple[int, int] = (30, 15),
    font_name: str = None,
    tail_direction: str = None  # "left", "right", "bottom", None
) -> Image.Image:
    """
    Render a speech-bubble (dialog box) around the text.

    Args:
        text: Text content.
        font_size: Font size in points.
        font_color: Text color (hex).
        bg_color: Bubble fill color (hex).
        border_color: Bubble border color (hex).
        border_width: Border thickness in pixels.
        corner_radius: Corner rounding radius.
        padding: (horizontal, vertical) inner padding.
        font_name: Optional font file path.
        tail_direction: Bubble tail direction — "left", "right", "bottom",
            or None for no tail.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    text_w, text_h = _get_text_size(text, font)
    # Bubble body size.
    bubble_w = text_w + padding[0] * 2
    bubble_h = text_h + padding[1] * 2
    # Extra canvas space on the side the tail points to.
    tail_size = 20 if tail_direction else 0
    if tail_direction in ["left", "right"]:
        img_w = bubble_w + tail_size
        img_h = bubble_h
    elif tail_direction == "bottom":
        img_w = bubble_w
        img_h = bubble_h + tail_size
    else:
        img_w = bubble_w
        img_h = bubble_h
    img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    # Body position: shifted right only when the tail is on the left.
    if tail_direction == "left":
        bx = tail_size
    else:
        bx = 0
    by = 0
    # Rounded-rectangle body with border.
    bg_rgb = _hex_to_rgb(bg_color) + (255,)
    border_rgb = _hex_to_rgb(border_color) + (255,)
    draw.rounded_rectangle(
        [bx, by, bx + bubble_w, by + bubble_h],
        radius=corner_radius,
        fill=bg_rgb,
        outline=border_rgb,
        width=border_width
    )
    # Tail triangle. The second fill-only polygon repaints the interior so
    # the border does not show where the tail joins the body.
    if tail_direction == "left":
        points = [
            (bx, bubble_h // 2 - 10),
            (0, bubble_h // 2),
            (bx, bubble_h // 2 + 10)
        ]
        draw.polygon(points, fill=bg_rgb, outline=border_rgb)
        draw.polygon(points, fill=bg_rgb)
    elif tail_direction == "right":
        points = [
            (bx + bubble_w, bubble_h // 2 - 10),
            (img_w, bubble_h // 2),
            (bx + bubble_w, bubble_h // 2 + 10)
        ]
        draw.polygon(points, fill=bg_rgb, outline=border_rgb)
        draw.polygon(points, fill=bg_rgb)
    elif tail_direction == "bottom":
        points = [
            (bubble_w // 2 - 10, bubble_h),
            (bubble_w // 2, img_h),
            (bubble_w // 2 + 10, bubble_h)
        ]
        draw.polygon(points, fill=bg_rgb, outline=border_rgb)
        draw.polygon(points, fill=bg_rgb)
    # Draw the label inside the bubble body.
    font_rgb = _hex_to_rgb(font_color) + (255,)
    text_x = bx + padding[0]
    text_y = by + padding[1]
    draw.text((text_x, text_y), text, font=font, fill=font_rgb)
    return img
def create_price_tag(
    price: str,
    currency: str = "¥",
    font_size: int = 72,
    price_color: str = "#FF4444",
    currency_color: str = "#FF4444",
    stroke_color: str = "#FFFFFF",
    stroke_width: int = 4,
    font_name: str = None
) -> Image.Image:
    """
    Render an e-commerce style price tag: a small currency symbol next to
    a large outlined price, both vertically centered.

    Args:
        price: Price text (formatting is the caller's responsibility).
        currency: Currency symbol, drawn at half the price font size.
        font_size: Price font size in points.
        price_color: Price color (hex).
        currency_color: Currency symbol color (hex).
        stroke_color: Outline color (hex).
        stroke_width: Outline thickness in pixels.
        font_name: Optional font file path.

    Returns:
        RGBA image with a transparent background.
    """
    font_large = _get_font(font_name, font_size)
    font_small = _get_font(font_name, int(font_size * 0.5))
    # Measure both parts; a fixed 5 px gap separates symbol and price.
    currency_w, currency_h = _get_text_size(currency, font_small)
    price_w, price_h = _get_text_size(price, font_large)
    total_w = currency_w + price_w + 5
    total_h = max(currency_h, price_h)
    padding = stroke_width + 10
    img_w = total_w + padding * 2
    img_h = total_h + padding * 2
    img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    # Outline pass: stamp both parts at every offset within the stroke disc.
    stroke_rgb = _hex_to_rgb(stroke_color) + (255,)
    for dx in range(-stroke_width, stroke_width + 1):
        for dy in range(-stroke_width, stroke_width + 1):
            if dx * dx + dy * dy <= stroke_width * stroke_width:
                # Currency symbol
                draw.text(
                    (padding + dx, padding + (total_h - currency_h) // 2 + dy),
                    currency, font=font_small, fill=stroke_rgb
                )
                # Price
                draw.text(
                    (padding + currency_w + 5 + dx, padding + (total_h - price_h) // 2 + dy),
                    price, font=font_large, fill=stroke_rgb
                )
    # Fill pass on top of the outline.
    currency_rgb = _hex_to_rgb(currency_color) + (255,)
    price_rgb = _hex_to_rgb(price_color) + (255,)
    draw.text(
        (padding, padding + (total_h - currency_h) // 2),
        currency, font=font_small, fill=currency_rgb
    )
    draw.text(
        (padding + currency_w + 5, padding + (total_h - price_h) // 2),
        price, font=font_large, fill=price_rgb
    )
    return img
def create_button(
    text: str,
    font_size: int = 36,
    font_color: str = "#FFFFFF",
    bg_color: str = "#FF6B35",
    corner_radius: int = 25,
    padding: Tuple[int, int] = (40, 15),
    font_name: str = None,
    shadow: bool = True
) -> Image.Image:
    """
    Render a call-to-action button (rounded pill with a centered label),
    e.g. a "buy now" style badge.

    Args:
        text: Button label.
        font_size: Label font size in points.
        font_color: Label color (hex).
        bg_color: Button fill color (hex).
        corner_radius: Corner rounding radius in pixels.
        padding: (horizontal, vertical) inner padding.
        font_name: Optional font file path.
        shadow: Whether to draw a soft drop shadow under the button.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    label_w, label_h = _get_text_size(text, font)
    body_w = label_w + padding[0] * 2
    body_h = label_h + padding[1] * 2
    drop = 4 if shadow else 0
    canvas = Image.new("RGBA", (body_w + drop, body_h + drop), (0, 0, 0, 0))
    draw = ImageDraw.Draw(canvas)
    if shadow:
        # Translucent black rectangle offset down-right forms the shadow.
        draw.rounded_rectangle(
            [drop, drop, body_w + drop, body_h + drop],
            radius=corner_radius,
            fill=(0, 0, 0, 80)
        )
    # Button body over the shadow.
    draw.rounded_rectangle(
        [0, 0, body_w, body_h],
        radius=corner_radius,
        fill=_hex_to_rgb(bg_color) + (255,)
    )
    # Centered label (padding already accounts for the centering).
    draw.text((padding[0], padding[1]), text, font=font, fill=_hex_to_rgb(font_color) + (255,))
    return canvas
def create_comparison_text(
    left_text: str,
    right_text: str,
    vs_text: str = "vs",
    font_size: int = 48,
    left_color: str = "#666666",
    right_color: str = "#FF6B35",
    vs_color: str = "#FF0000",
    font_name: str = None
) -> Image.Image:
    """
    Render a comparison line such as "A vs B": each segment in its own
    color, all outlined in black and vertically centered on one line.

    Args:
        left_text: Left-hand phrase.
        right_text: Right-hand phrase.
        vs_text: Separator word, drawn at 80% of font_size.
        font_size: Font size for the two phrases.
        left_color: Left phrase color (hex).
        right_color: Right phrase color (hex).
        vs_color: Separator color (hex).
        font_name: Optional font file path.

    Returns:
        RGBA image with a transparent background.
    """
    font = _get_font(font_name, font_size)
    font_vs = _get_font(font_name, int(font_size * 0.8))
    left_w, left_h = _get_text_size(left_text, font)
    vs_w, vs_h = _get_text_size(vs_text, font_vs)
    right_w, right_h = _get_text_size(right_text, font)
    spacing = 15
    total_w = left_w + vs_w + right_w + spacing * 2
    total_h = max(left_h, vs_h, right_h)
    padding = 20
    stroke_width = 3
    img_w = total_w + padding * 2 + stroke_width * 2
    img_h = total_h + padding * 2 + stroke_width * 2
    img = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    x = padding + stroke_width
    y = padding + stroke_width
    # Outline pass: all three segments stamped within the stroke disc.
    stroke_color = (0, 0, 0, 255)
    for dx in range(-stroke_width, stroke_width + 1):
        for dy in range(-stroke_width, stroke_width + 1):
            if dx * dx + dy * dy <= stroke_width * stroke_width:
                draw.text((x + dx, y + (total_h - left_h) // 2 + dy), left_text, font=font, fill=stroke_color)
                draw.text((x + left_w + spacing + dx, y + (total_h - vs_h) // 2 + dy), vs_text, font=font_vs, fill=stroke_color)
                draw.text((x + left_w + spacing + vs_w + spacing + dx, y + (total_h - right_h) // 2 + dy), right_text, font=font, fill=stroke_color)
    # Fill pass, one color per segment.
    left_rgb = _hex_to_rgb(left_color) + (255,)
    vs_rgb = _hex_to_rgb(vs_color) + (255,)
    right_rgb = _hex_to_rgb(right_color) + (255,)
    draw.text((x, y + (total_h - left_h) // 2), left_text, font=font, fill=left_rgb)
    draw.text((x + left_w + spacing, y + (total_h - vs_h) // 2), vs_text, font=font_vs, fill=vs_rgb)
    draw.text((x + left_w + spacing + vs_w + spacing, y + (total_h - right_h) // 2), right_text, font=font, fill=right_rgb)
    return img
# ============================================================
# Preset styles
# ============================================================
PRESET_STYLES = {
    "subtitle": {
        "font_size": 48,
        "font_color": "#FFFFFF",
        "stroke_color": "#000000",
        "stroke_width": 3,
        "version": "v2"
    },
    "highlight": {
        # Warm off-white fill + light stroke + dark shadow, tuned for
        # light-brown backgrounds
        "font_size": 90,
        "font_color": "#F7E7D3",
        "stroke_color": "#C9B59A",  # light stroke
        "stroke_width": 4,
        "type": "shadow",
        "shadow_color": "#3A2C1F",  # dark brown shadow
        "shadow_offset": (3, 3),
        "shadow_blur": 10,
        "padding": 32,
        "version": "gloda"
    },
    "warning": {
        # Low-saturation terracotta red + cream stroke + dark brown shadow
        "font_size": 80,
        "font_color": "#D96B4F",
        "stroke_color": "#F6E5D6",
        "stroke_width": 4,
        "type": "shadow",
        "shadow_color": "#3A2C1F",
        "shadow_offset": (3, 3),
        "shadow_blur": 10,
        "padding": 30,
        "version": "gloda"
    },
    "success": {
        "font_size": 52,
        "font_color": "#4CAF50",
        "stroke_color": "#FFFFFF",
        "stroke_width": 4,
        "version": "v2"
    },
    "price": {
        # Price tag: warm red price + cream currency symbol + dark stroke
        "font_size": 110,
        "price_color": "#E25B4F",
        "currency_color": "#F6E5D6",
        "stroke_color": "#3A2C1F",
        "stroke_width": 8,
        "type": "price",
        "version": "gloda"
    },
    "cta_button": {
        # Warm orange button with a light shadow
        "font_size": 46,
        "font_color": "#FFFFFF",
        "bg_color": "#E6763A",
        "corner_radius": 32,
        "type": "button",
        "shadow": True,
        "version": "gloda"
    }
}
def create_fancy_text(
    text: str,
    style: str = "subtitle",
    custom_style: Dict[str, Any] = None,
    cache: bool = True
) -> str:
    """
    Unified entry point for rendering a fancy-text PNG.

    Args:
        text: Text content.
        style: Preset style name (falls back to "subtitle" when unknown).
        custom_style: Overrides merged on top of the preset.
        cache: Whether to reuse/store the result in the cache directory.

    Returns:
        Path to the rendered PNG file.
    """
    # Merge preset + overrides.
    base_style = PRESET_STYLES.get(style, PRESET_STYLES["subtitle"]).copy()
    if custom_style:
        base_style.update(custom_style)
    # Cache lookup keyed on text + effective style.
    if cache:
        cache_name = _cache_key(text, base_style)
        cache_path = FANCY_TEXT_CACHE_DIR / f"{cache_name}.png"
        if cache_path.exists():
            return str(cache_path)
    # Dispatch on the style's renderer type; each branch forwards only the
    # kwargs that renderer accepts.
    style_type = base_style.pop("type", None)
    if style == "price" or style_type == "price":
        img = create_price_tag(text, **{k: v for k, v in base_style.items() if k in [
            "currency", "font_size", "price_color", "currency_color", "stroke_color", "stroke_width", "font_name"
        ]})
    elif style == "cta_button" or style_type == "button":
        img = create_button(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "font_color", "bg_color", "corner_radius", "padding", "font_name", "shadow"
        ]})
    elif style_type == "bubble":
        img = create_bubble_text(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "font_color", "bg_color", "border_color", "border_width",
            "corner_radius", "padding", "font_name", "tail_direction"
        ]})
    elif style_type == "gradient":
        img = create_text_with_gradient(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "gradient_colors", "gradient_direction", "stroke_color", "stroke_width", "font_name", "padding"
        ]})
    elif style_type == "shadow":
        # Forward stroke_color/stroke_width as well: the "highlight" and
        # "warning" presets define them for the two-layer safe stroke, but
        # they were previously filtered out and silently ignored.
        img = create_text_with_shadow(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "font_color", "shadow_color", "shadow_offset", "shadow_blur",
            "font_name", "padding", "stroke_color", "stroke_width"
        ]})
    else:
        # Default: plain outlined text.
        img = create_text_with_stroke(text, **{k: v for k, v in base_style.items() if k in [
            "font_size", "font_color", "stroke_color", "stroke_width", "font_name", "padding"
        ]})
    # Persist: into the cache, or a per-process temp file when uncached.
    if cache:
        output_path = str(cache_path)
    else:
        output_path = str(config.TEMP_DIR / f"fancy_{hash(text)}_{os.getpid()}.png")
    img.save(output_path, "PNG")
    logger.info(f"Created fancy text: '{text[:20]}...' -> {output_path}")
    return output_path
def batch_create_fancy_texts(
    configs: List[Dict[str, Any]]
) -> List[str]:
    """
    Render a batch of fancy-text images.

    Args:
        configs: One dict per image: {text, style, custom_style}.

    Returns:
        PNG file paths, in input order.
    """
    return [
        create_fancy_text(
            text=item.get("text", ""),
            style=item.get("style", "subtitle"),
            custom_style=item.get("custom_style")
        )
        for item in configs
    ]

960
modules/ffmpeg_utils.py Normal file
View File

@@ -0,0 +1,960 @@
"""
FFmpeg 视频处理工具模块
支持规模化批量视频处理:拼接、字幕、叠加、混音
"""
import os
import re
import subprocess
import tempfile
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import config
logger = logging.getLogger(__name__)
# FFmpeg/FFprobe binaries: prefer project-bundled copies, fall back to PATH.
FFMPEG_PATH = str(config.BASE_DIR / "bin" / "ffmpeg") if (config.BASE_DIR / "bin" / "ffmpeg").exists() else "ffmpeg"
FFPROBE_PATH = str(config.BASE_DIR / "bin" / "ffprobe") if (config.BASE_DIR / "bin" / "ffprobe").exists() else "ffprobe"
# Font search order: Linux system CJK fonts first, then project-bundled
# fonts, then macOS fonts as a local-debugging fallback.
DEFAULT_FONT_PATHS = [
    # Linux system-wide CJK fonts (most reliable in server environments)
    "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf",
    "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",
    # Project-bundled fonts (NOTE: must not be un-pulled LFS pointer files)
    str(config.FONTS_DIR / "HarmonyOS-Sans-SC-Regular.ttf"),
    str(config.FONTS_DIR / "AlibabaPuHuiTi-Regular.ttf"),
    # macOS fonts (local debugging only)
    "/System/Library/Fonts/PingFang.ttc",
    "/System/Library/Fonts/STHeiti Medium.ttc",
    "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
]
def _get_font_path() -> str:
    """Return the first usable font from DEFAULT_FONT_PATHS.

    Files of 1000 bytes or less are treated as invalid (e.g. LFS pointer
    stubs). Falls back to the plain name "Arial" so ffmpeg never receives
    an empty fontfile and crashes.
    """
    for candidate in DEFAULT_FONT_PATHS:
        if os.path.exists(candidate) and os.path.getsize(candidate) > 1000:
            return candidate
    return "Arial"
def _sanitize_text(text: str) -> str:
"""
去除可能导致 ffmpeg 命令行错误的特殊控制字符,但保留 Emoji、数字、标点和各国语言。
"""
if not text:
return ""
# 不再过滤任何字符,只确保不是 None
return text
def add_silence_audio(video_path: str, output_path: str) -> str:
    """
    Add a silent stereo 44.1 kHz AAC track to a video that has no audio,
    so downstream filter graphs can always map stream 0:a.

    Args:
        video_path: Input video file.
        output_path: Destination file (the video stream is copied as-is).

    Returns:
        The output path.
    """
    command = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-f", "lavfi",
        "-i", "anullsrc=channel_layout=stereo:sample_rate=44100",
        "-shortest",
        "-c:v", "copy",
        "-c:a", "aac",
        output_path,
    ]
    _run_ffmpeg(command)
    return output_path
def _run_ffmpeg(cmd: List[str], check: bool = True) -> subprocess.CompletedProcess:
    """Execute an FFmpeg command line and return the completed process.

    stderr is always echoed (fonts and filters emit warnings there). With
    check=True a non-zero exit raises CalledProcessError after logging.
    """
    logger.debug(f"FFmpeg command: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=check)
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg failed: {e.stderr}")
        raise
    # Surface stderr on success and failure alike, to diagnose font issues.
    if result.stderr:
        print(f"[FFmpeg stderr] {result.stderr}", flush=True)
    if result.returncode != 0:
        logger.error(f"FFmpeg stderr: {result.stderr}")
    return result
def get_video_info(video_path: str) -> Dict[str, Any]:
    """Probe a media file with ffprobe.

    Returns:
        Dict with "duration" (seconds, float), "width"/"height" (pixels,
        0 when there is no video stream) and "fps" (defaults to 30).

    Raises:
        ValueError: If ffprobe exits non-zero.
    """
    cmd = [
        FFPROBE_PATH,
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        "-show_streams",
        video_path
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise ValueError(f"Failed to probe video: {video_path}")
    import json
    data = json.loads(result.stdout)
    # Extract only the fields downstream code needs.
    info = {
        "duration": float(data.get("format", {}).get("duration", 0)),
        "width": 0,
        "height": 0,
        "fps": 30
    }
    for stream in data.get("streams", []):
        if stream.get("codec_type") == "video":
            info["width"] = stream.get("width", 0)
            info["height"] = stream.get("height", 0)
            # Frame rate arrives as a ratio ("30/1") or a number ("29.97").
            fps_str = stream.get("r_frame_rate", "30/1")
            if "/" in fps_str:
                num, den = fps_str.split("/")
                info["fps"] = float(num) / float(den) if float(den) != 0 else 30
            else:
                info["fps"] = float(fps_str)
            break
    return info
def concat_videos(
    video_paths: List[str],
    output_path: str,
    target_size: Tuple[int, int] = (1080, 1920)
) -> str:
    """
    Concatenate multiple video clips into one (video streams only).

    Each input is scaled to fit ``target_size`` while preserving aspect
    ratio, padded with centered black bars, then joined with the concat
    filter. Audio is dropped (concat a=0).

    Args:
        video_paths: Input video paths.
        output_path: Output file path.
        target_size: Target (width, height); defaults to portrait 1080x1920.

    Returns:
        The output file path.

    Raises:
        ValueError: If video_paths is empty.
    """
    if not video_paths:
        raise ValueError("No video paths provided")
    logger.info(f"Concatenating {len(video_paths)} videos...")
    # NOTE(review): this concat demuxer list file is written and deleted
    # but never passed to the ffmpeg command below (which uses
    # filter_complex inputs instead) — confirm whether it is still needed.
    concat_file = config.TEMP_DIR / f"concat_{os.getpid()}.txt"
    with open(concat_file, "w", encoding="utf-8") as f:
        for vp in video_paths:
            # Use absolute paths inside the list file.
            abs_path = os.path.abspath(vp)
            f.write(f"file '{abs_path}'\n")
    width, height = target_size
    # Normalize every input to the target resolution before concatenation:
    # scale preserves aspect ratio, pad centers the frame on black bars.
    filter_parts = []
    for i in range(len(video_paths)):
        filter_parts.append(
            f"[{i}:v]scale={width}:{height}:force_original_aspect_ratio=decrease,"
            f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:black,setsar=1[v{i}]"
        )
    # Join all normalized video streams.
    concat_inputs = "".join([f"[v{i}]" for i in range(len(video_paths))])
    filter_parts.append(f"{concat_inputs}concat=n={len(video_paths)}:v=1:a=0[outv]")
    filter_complex = ";".join(filter_parts)
    # Build the ffmpeg command with one -i per input.
    cmd = [FFMPEG_PATH, "-y"]
    for vp in video_paths:
        cmd.extend(["-i", vp])
    cmd.extend([
        "-filter_complex", filter_complex,
        "-map", "[outv]",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-pix_fmt", "yuv420p",
        output_path
    ])
    _run_ffmpeg(cmd)
    # Remove the temporary list file.
    if concat_file.exists():
        concat_file.unlink()
    logger.info(f"Concatenated video saved: {output_path}")
    return output_path
def concat_videos_with_audio(
    video_paths: List[str],
    output_path: str,
    target_size: Tuple[int, int] = (1080, 1920)
) -> str:
    """
    Concatenate video clips while keeping their audio tracks.

    Video streams are normalized like concat_videos; audio streams are
    forced to 44.1 kHz stereo and concatenated in parallel. If the audio
    graph fails (e.g. an input lacks an audio stream), falls back to the
    video-only concat_videos.

    Args:
        video_paths: Input video paths.
        output_path: Output file path.
        target_size: Target (width, height); defaults to portrait 1080x1920.

    Returns:
        The output file path.

    Raises:
        ValueError: If video_paths is empty.
    """
    if not video_paths:
        raise ValueError("No video paths provided")
    logger.info(f"Concatenating {len(video_paths)} videos with audio...")
    width, height = target_size
    n = len(video_paths)
    # Build the filter graph.
    filter_parts = []
    # Video: scale + pad every input to the target size.
    for i in range(n):
        filter_parts.append(
            f"[{i}:v]scale={width}:{height}:force_original_aspect_ratio=decrease,"
            f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:black,setsar=1[v{i}]"
        )
    # Audio: force a uniform sample rate / channel layout so concat works.
    for i in range(n):
        filter_parts.append(f"[{i}:a]aformat=sample_rates=44100:channel_layouts=stereo[a{i}]")
    # Join video and audio streams separately.
    v_concat = "".join([f"[v{i}]" for i in range(n)])
    a_concat = "".join([f"[a{i}]" for i in range(n)])
    filter_parts.append(f"{v_concat}concat=n={n}:v=1:a=0[outv]")
    filter_parts.append(f"{a_concat}concat=n={n}:v=0:a=1[outa]")
    filter_complex = ";".join(filter_parts)
    cmd = [FFMPEG_PATH, "-y"]
    for vp in video_paths:
        cmd.extend(["-i", vp])
    cmd.extend([
        "-filter_complex", filter_complex,
        "-map", "[outv]",
        "-map", "[outa]",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "aac",
        "-b:a", "128k",
        "-pix_fmt", "yuv420p",
        output_path
    ])
    try:
        _run_ffmpeg(cmd)
    except subprocess.CalledProcessError:
        # Audio concat failed: degrade to video-only output instead of
        # aborting the whole pipeline.
        logger.warning("Audio concat failed, falling back to video only")
        return concat_videos(video_paths, output_path, target_size)
    logger.info(f"Concatenated video with audio saved: {output_path}")
    return output_path
def add_subtitle(
    video_path: str,
    text: str,
    start: float,
    duration: float,
    output_path: str,
    style: Dict[str, Any] = None
) -> str:
    """
    Burn a single timed subtitle into a video with the drawtext filter.

    Args:
        video_path: Input video path.
        text: Subtitle text.
        start: Start time in seconds.
        duration: Display duration in seconds.
        output_path: Output file path.
        style: Optional style dict {
            fontsize: font size,
            fontcolor: font color,
            borderw: border width,
            bordercolor: border color,
            x: x position (drawtext expression, e.g. "(w-text_w)/2"),
            y: y position,
            font: font file path or name
        }

    Returns:
        The output file path.
    """
    style = style or {}
    # Style defaults: white text, black border, bottom-center placement.
    fontsize = style.get("fontsize", 48)
    fontcolor = style.get("fontcolor", "white")
    borderw = style.get("borderw", 3)
    bordercolor = style.get("bordercolor", "black")
    x = style.get("x", "(w-text_w)/2")  # horizontally centered
    y = style.get("y", "h-200")  # near the bottom
    # Prefer the dynamically detected working font over a possibly broken
    # hard-coded path.
    default_font_path = _get_font_path()
    font = style.get("font", default_font_path)
    # Escape drawtext-special characters, backslash first so the later
    # escapes are not double-escaped. This matches add_multiple_subtitles;
    # the previous version missed backslash and percent, which broke the
    # filter for texts containing them.
    escaped_text = text.replace("\\", "\\\\").replace("'", "\\'").replace(":", "\\:").replace("%", "\\%")
    # drawtext filter with a time-window enable expression.
    drawtext = (
        f"drawtext=text='{escaped_text}':"
        f"fontfile='{font}':"
        f"fontsize={fontsize}:"
        f"fontcolor={fontcolor}:"
        f"borderw={borderw}:"
        f"bordercolor={bordercolor}:"
        f"x={x}:y={y}:"
        f"enable='between(t,{start},{start + duration})'"
    )
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-vf", drawtext,
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        "-pix_fmt", "yuv420p",
        output_path
    ]
    _run_ffmpeg(cmd)
    logger.info(f"Added subtitle: '{text[:20]}...' at {start}s")
    return output_path
def wrap_text(text: str, max_chars: int = 18) -> str:
    """
    Naive hard wrap: insert a newline every ``max_chars`` characters.

    Text that already contains a newline is assumed to be manually wrapped
    and is returned unchanged. Every character (CJK or Latin) counts as one
    column — a deliberate simplification for mixed-width subtitle text.
    """
    if not text:
        return ""
    if "\n" in text:
        return text
    pieces = []
    column = 0
    for ch in text:
        if column >= max_chars:
            pieces.append("\n")
            column = 0
        pieces.append(ch)
        column += 1
    return "".join(pieces)
def mix_audio_at_offset(
    base_audio: str,
    overlay_audio: str,
    offset: float,
    output_path: str,
    base_volume: float = 1.0,
    overlay_volume: float = 1.0
) -> str:
    """
    Mix an overlay audio file into a base track at a given offset.

    The overlay is delayed by ``offset`` seconds (adelay on both channels),
    each input gets independent volume scaling, and the result keeps the
    base's duration (amix duration=first). If the base file is missing, the
    overlay path is returned unchanged as a best-effort fallback.

    Args:
        base_audio: Path of the base track.
        overlay_audio: Path of the audio to mix in.
        offset: Start position of the overlay, in seconds.
        output_path: Destination file.
        base_volume: Gain applied to the base track.
        overlay_volume: Gain applied to the overlay track.

    Returns:
        output_path, or overlay_audio when the base does not exist.
    """
    # Missing base: fall back to the overlay alone instead of failing.
    if not os.path.exists(base_audio):
        logger.warning(f"Base audio not found: {base_audio}")
        return overlay_audio
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", base_audio,
        "-i", overlay_audio,
        "-filter_complex",
        f"[0:a]volume={base_volume}[a0];[1:a]volume={overlay_volume},adelay={int(offset*1000)}|{int(offset*1000)}[a1];[a0][a1]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[out]",
        "-map", "[out]",
        "-c:a", "mp3",  # Use MP3 for audio only mixing
        output_path
    ]
    _run_ffmpeg(cmd)
    return output_path
def adjust_audio_duration(
    input_path: str,
    target_duration: float,
    output_path: str
) -> str:
    """
    Fit an audio file into a target duration by speeding it up only.

    Policy (per product requirement):
      - audio longer than target  -> speed up with atempo (capped at 2.0x)
      - audio shorter or equal    -> keep original speed (never slow down)

    Args:
        input_path: Source audio path.
        target_duration: Desired duration in seconds.
        output_path: Destination path.

    Returns:
        output_path on success; input_path when the duration cannot be
        probed; None when the input file does not exist.
    """
    if not os.path.exists(input_path):
        return None
    current_duration = float(get_audio_info(input_path).get("duration", 0))
    if current_duration <= 0:
        return input_path
    # Shorter-or-equal audio is copied through untouched (no slow-down).
    if current_duration <= target_duration:
        import shutil
        shutil.copy(input_path, output_path)
        logger.info(f"Audio ({current_duration:.2f}s) <= target ({target_duration:.2f}s), keeping original speed")
        return output_path
    # Too long: speed up, capped at 2x to limit pitch artifacts.
    speed_ratio = current_duration / target_duration
    speed_ratio = min(speed_ratio, 2.0)
    logger.info(f"Audio ({current_duration:.2f}s) > target ({target_duration:.2f}s), speeding up {speed_ratio:.2f}x")
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", input_path,
        "-filter:a", f"atempo={speed_ratio}",
        output_path
    ]
    _run_ffmpeg(cmd)
    return output_path
def get_audio_info(file_path: str) -> Dict[str, Any]:
    """Probe an audio file; delegates to get_video_info (ffprobe handles both)."""
    return get_video_info(file_path)
def wrap_text_smart(text: str, max_chars: int = 15) -> str:
    """
    Two-line subtitle wrap with a "short top, long bottom" preference.

    Tries to break at punctuation or a space near the middle of the text;
    when no break point exists, force-splits at 40% of the length so the
    first line is the shorter one.

    Args:
        text: Subtitle text (single logical line).
        max_chars: Maximum length that may stay unwrapped.

    Returns:
        The text with at most one newline inserted.
    """
    if not text or len(text) <= max_chars:
        return text
    # Candidate break characters: CJK punctuation, space, ASCII punctuation.
    # (The CJK entries were previously corrupted into empty strings, which
    # can never equal a single character — so CJK text never split at
    # punctuation; restored here.)
    split_chars = ["。", "，", "！", "？", " ", ",", ".", "!", "?"]
    best_split = -1
    # Pick the break point closest to the middle of the text.
    mid = len(text) // 2
    for i in range(len(text)):
        if text[i] in split_chars:
            if abs(i - mid) < abs(best_split - mid):
                best_split = i
    if best_split != -1 and best_split < len(text) - 1:
        return text[:best_split+1] + "\n" + text[best_split+1:]
    # No punctuation found: force a 40/60 split (short top, long bottom).
    split_idx = int(len(text) * 0.4)
    return text[:split_idx] + "\n" + text[split_idx:]
def add_multiple_subtitles(
    video_path: str,
    subtitles: List[Dict[str, Any]],
    output_path: str,
    default_style: Dict[str, Any] = None
) -> str:
    """
    Burn several timed subtitles into a video in one ffmpeg pass.

    Args:
        video_path: Input video path.
        subtitles: List of {text, start, duration, style} dicts; each
            item's style overrides default_style.
        output_path: Output file path.
        default_style: Base drawtext style shared by all subtitles.

    Returns:
        The output file path (a plain copy of the input when subtitles is
        empty).
    """
    if not subtitles:
        # Nothing to draw: copy the input through unchanged.
        import shutil
        shutil.copy(video_path, output_path)
        return output_path
    default_style = default_style or {}
    # Force a known-complete CJK font: project NotoSansSC first, then the
    # Droid fallback, finally whatever _get_font_path() detects.
    # NOTE(review): the absolute /root/video-flow path is deployment-specific.
    font = "/root/video-flow/assets/fonts/NotoSansSC-Regular.otf"
    if not (os.path.exists(font) and os.path.getsize(font) > 1024 * 100):  # files over ~100 KB are considered real fonts
        font = "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf"
    if not (os.path.exists(font) and os.path.getsize(font) > 1024 * 100):
        font = _get_font_path()
    print(f"[SubDebug] Using font for subtitles: {font}", flush=True)
    # Build one drawtext filter per subtitle.
    filters = []
    for sub in subtitles:
        raw_text = sub.get("text", "")
        # Debug dump of the raw text (repr + hex) to diagnose odd glyphs.
        print(f"[SubDebug] Subtitle text repr: {repr(raw_text)}", flush=True)
        print(f"[SubDebug] Subtitle text hex: {' '.join(hex(ord(c)) for c in raw_text)}", flush=True)
        text = _sanitize_text(raw_text)
        # Hard-wrap long lines before rendering.
        text = wrap_text(text)
        start = sub.get("start", 0)
        duration = sub.get("duration", 3)
        style = {**default_style, **sub.get("style", {})}
        fontsize = style.get("fontsize", 48)
        fontcolor = style.get("fontcolor", "white")
        borderw = style.get("borderw", 3)
        bordercolor = style.get("bordercolor", "black")
        x = style.get("x", "(w-text_w)/2")
        y = style.get("y", "h-200")
        # Background box enabled by default for readability.
        box = style.get("box", 1)
        boxcolor = style.get("boxcolor", "black@0.5")
        boxborderw = style.get("boxborderw", 10)
        # Escape drawtext specials: backslash, single quote, colon, percent.
        escaped_text = text.replace("\\", "\\\\").replace("'", "\\'").replace(":", "\\:").replace("%", "\\%")
        drawtext = (
            f"drawtext=text='{escaped_text}':"
            f"fontfile='{font}':"
            f"fontsize={fontsize}:"
            f"fontcolor={fontcolor}:"
            f"borderw={borderw}:"
            f"bordercolor={bordercolor}:"
            f"box={box}:boxcolor={boxcolor}:boxborderw={boxborderw}:"
            f"x={x}:y={y}:"
            f"enable='between(t,{start},{start + duration})'"
        )
        filters.append(drawtext)
    # Chain the drawtext filters with commas into one -vf expression.
    vf = ",".join(filters)
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-vf", vf,
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        "-pix_fmt", "yuv420p",
        output_path
    ]
    _run_ffmpeg(cmd)
    logger.info(f"Added {len(subtitles)} subtitles")
    return output_path
def overlay_image(
    video_path: str,
    image_path: str,
    output_path: str,
    position: Tuple[int, int] = None,
    start: float = 0,
    duration: float = None,
    fade_in: float = 0,
    fade_out: float = 0
) -> str:
    """
    Overlay a transparent PNG (fancy text, watermark, etc.) onto a video.

    Args:
        video_path: input video path
        image_path: PNG image path (alpha channel supported)
        output_path: output file path
        position: (x, y) placement; centered when None
        start: overlay start time in seconds
        duration: display duration in seconds; until end of video when None
        fade_in: fade-in length in seconds
        fade_out: fade-out length in seconds

    Returns:
        The output file path.
    """
    clip_info = get_video_info(video_path)
    if duration is None:
        duration = clip_info["duration"] - start

    # Placement expression: explicit coordinates, otherwise centered.
    if position:
        pos_str = f"x={position[0]}:y={position[1]}"
    else:
        pos_str = "x=(W-w)/2:y=(H-h)/2"

    # Time window during which the overlay is visible.
    enable = f"enable='between(t,{start},{start + duration})'"
    overlay_filter = f"overlay={pos_str}:{enable}"

    # With fades, pre-process the image stream's alpha before overlaying.
    if fade_in > 0 or fade_out > 0:
        fade_steps = []
        if fade_in > 0:
            fade_steps.append(f"fade=t=in:st={start}:d={fade_in}:alpha=1")
        if fade_out > 0:
            fade_steps.append(
                f"fade=t=out:st={start + duration - fade_out}:d={fade_out}:alpha=1"
            )
        img_filter = ",".join(fade_steps)
        filter_complex = f"[1:v]{img_filter}[img];[0:v][img]{overlay_filter}[outv]"
    else:
        filter_complex = f"[0:v][1:v]{overlay_filter}[outv]"

    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-i", image_path,
        "-filter_complex", filter_complex,
        "-map", "[outv]",
        "-map", "0:a?",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        "-pix_fmt", "yuv420p",
        output_path
    ]
    _run_ffmpeg(cmd)
    logger.info(f"Overlaid image at {position or 'center'}, {start}s-{start+duration}s")
    return output_path
def overlay_multiple_images(
    video_path: str,
    images: List[Dict[str, Any]],
    output_path: str
) -> str:
    """
    Overlay several transparent PNG images onto a video in a single pass.

    Args:
        video_path: input video path
        images: overlay specs, each {path, x, y, start, duration}
        output_path: output file path

    Returns:
        The output file path.
    """
    # Nothing to overlay: copy the source straight through.
    if not images:
        import shutil
        shutil.copy(video_path, output_path)
        return output_path

    inputs = ["-i", video_path]
    for spec in images:
        inputs += ["-i", spec["path"]]

    # Chain the overlays: each step consumes the previous step's label.
    filter_parts = []
    current = "0:v"
    last_index = len(images) - 1
    for idx, spec in enumerate(images):
        x = spec.get("x", "(W-w)/2")
        y = spec.get("y", "(H-h)/2")
        start = spec.get("start", 0)
        duration = spec.get("duration", 999)
        enable = f"enable='between(t,{start},{start + duration})'"
        out_label = "outv" if idx == last_index else f"tmp{idx}"
        filter_parts.append(
            f"[{current}][{idx+1}:v]overlay=x={x}:y={y}:{enable}[{out_label}]"
        )
        current = out_label

    cmd = [FFMPEG_PATH, "-y"] + inputs + [
        "-filter_complex", ";".join(filter_parts),
        "-map", "[outv]",
        "-map", "0:a?",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        "-pix_fmt", "yuv420p",
        output_path
    ]
    _run_ffmpeg(cmd)
    logger.info(f"Overlaid {len(images)} images")
    return output_path
def mix_audio(
    video_path: str,
    audio_path: str,
    output_path: str,
    audio_volume: float = 1.0,
    video_volume: float = 0.1,
    audio_start: float = 0
) -> str:
    """
    Mix an extra audio track (voiceover, BGM, ...) into a video.

    Args:
        video_path: input video path
        audio_path: audio file to mix in
        output_path: output file path
        audio_volume: volume of the new audio (0-1)
        video_volume: volume of the original video audio (0-1)
        audio_start: offset in seconds before the new audio starts

    Returns:
        The output file path.

    Notes:
        If the source video has no audio stream, the amix graph fails;
        the fallback then attaches the new audio as the only track.
        (Removed a dead get_video_info() call that spawned an unused
        ffprobe: its result was never read.)
    """
    logger.info(f"Mixing audio: {audio_path}")
    # adelay takes milliseconds; delay both channels equally.
    delay_ms = int(audio_start * 1000)
    filter_complex = (
        f"[0:a]volume={video_volume}[va];"
        f"[1:a]adelay={delay_ms}|{delay_ms},volume={audio_volume}[aa];"
        f"[va][aa]amix=inputs=2:duration=longest:dropout_transition=0:normalize=0[outa]"
    )
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-i", audio_path,
        "-filter_complex", filter_complex,
        "-map", "0:v",
        "-map", "[outa]",
        "-c:v", "copy",
        "-c:a", "aac",
        "-b:a", "192k",
        output_path
    ]
    try:
        _run_ffmpeg(cmd)
    except subprocess.CalledProcessError:
        # Source video has no audio track: attach the new audio directly.
        logger.warning("Video has no audio track, adding audio directly")
        cmd = [
            FFMPEG_PATH, "-y",
            "-i", video_path,
            "-i", audio_path,
            "-map", "0:v",
            "-map", "1:a",
            "-c:v", "copy",
            "-c:a", "aac",
            "-b:a", "192k",
            output_path
        ]
        _run_ffmpeg(cmd)
    logger.info(f"Audio mixed: {output_path}")
    return output_path
def add_bgm(
    video_path: str,
    bgm_path: str,
    output_path: str,
    bgm_volume: float = 0.06,
    loop: bool = True,
    ducking: bool = True,
    duck_gain_db: float = -6.0,
    fade_in: float = 1.0,
    fade_out: float = 1.0
) -> str:
    """
    Add background music (looped automatically to match the video length).

    Args:
        video_path: input video path
        bgm_path: BGM file path
        output_path: output file path
        bgm_volume: BGM volume applied after the fades
        loop: loop the BGM to cover the whole video
        ducking: duck the BGM under the main audio via sidechaincompress
        duck_gain_db: intended duck amount in dB
            (NOTE(review): currently unused by the filter graph -- confirm)
        fade_in: BGM fade-in length in seconds
        fade_out: BGM fade-out length in seconds

    Returns:
        The output file path.
    """
    info = get_video_info(video_path)
    video_duration = info["duration"]
    if loop:
        # Loop the BGM indefinitely, then trim it to the video duration
        # and apply fades before setting the final volume.
        bgm_chain = (
            f"[1:a]aloop=-1:size=2e+09,asetpts=N/SR/TB,"
            f"atrim=0:{video_duration},"
            f"afade=t=in:st=0:d={fade_in},"
            f"afade=t=out:st={max(video_duration - fade_out, 0)}:d={fade_out},"
            f"volume={bgm_volume}[bgm]"
        )
    else:
        bgm_chain = (
            f"[1:a]"
            f"afade=t=in:st=0:d={fade_in},"
            f"afade=t=out:st={max(video_duration - fade_out, 0)}:d={fade_out},"
            f"volume={bgm_volume}[bgm]"
        )
    if ducking:
        # Use conservative sidechaincompress parameters to avoid
        # "unsupported option" failures on older ffmpeg builds.
        filter_complex = (
            f"{bgm_chain};"
            f"[0:a][bgm]sidechaincompress=threshold=0.1:ratio=4:attack=5:release=250:makeup=1:mix=1:level_in=1:level_sc=1[outa]"
        )
    else:
        filter_complex = f"{bgm_chain};[0:a][bgm]amix=inputs=2:duration=first[outa]"
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-stream_loop", "-1" if loop else "0",
        "-i", bgm_path,
        "-filter_complex", filter_complex,
        "-map", "0:v",
        "-map", "[outa]",
        "-c:v", "copy",
        "-c:a", "aac",
        "-b:a", "192k",
        "-t", str(video_duration),
        output_path
    ]
    try:
        _run_ffmpeg(cmd)
    except subprocess.CalledProcessError:
        # If sidechain fails, fall back to amix: keeps the original audio
        # plus low-volume BGM, just without ducking.
        logger.warning("Sidechain failed, fallback to simple amix for BGM")
        filter_complex = f"{bgm_chain};[0:a][bgm]amix=inputs=2:duration=first[outa]"
        cmd = [
            FFMPEG_PATH, "-y",
            "-i", video_path,
            "-stream_loop", "-1" if loop else "0",
            "-i", bgm_path,
            "-filter_complex", filter_complex,
            "-map", "0:v",
            "-map", "[outa]",
            "-c:v", "copy",
            "-c:a", "aac",
            "-b:a", "192k",
            "-t", str(video_duration),
            output_path
        ]
        _run_ffmpeg(cmd)
    logger.info(f"BGM added: {output_path}")
    return output_path
def trim_video(
    video_path: str,
    output_path: str,
    start: float = 0,
    duration: float = None,
    end: float = None
) -> str:
    """
    Trim a video.

    Args:
        video_path: input video path
        output_path: output file path
        start: start time in seconds
        duration: clip length in seconds (takes precedence over ``end``)
        end: absolute end time in seconds; alternative to ``duration``

    Returns:
        The output file path.
    """
    cmd = [
        FFMPEG_PATH, "-y",
        "-i", video_path,
        "-ss", str(start)
    ]
    # Explicit None checks: the original truthiness test also skipped 0,
    # and its log line computed `start + duration` with duration=None,
    # raising TypeError when neither duration nor end was given.
    if duration is not None:
        cmd.extend(["-t", str(duration)])
    elif end is not None:
        cmd.extend(["-to", str(end)])
    cmd.extend([
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "copy",
        output_path
    ])
    _run_ffmpeg(cmd)
    if duration is not None:
        stop_label = start + duration
    elif end is not None:
        stop_label = end
    else:
        stop_label = "EOF"  # no bound given: ffmpeg keeps everything after start
    logger.info(f"Trimmed video: {start}s - {stop_label}s")
    return output_path
def speed_up_video(
video_path: str,
output_path: str,
speed: float = 1.5
) -> str:
"""
加速/减速视频
Args:
video_path: 输入视频路径
output_path: 输出路径
speed: 速度倍率(>1 加速,<1 减速)
"""
# setpts 控制视频速度atempo 控制音频速度
video_filter = f"setpts={1/speed}*PTS"
# atempo 只支持 0.5-2.0,超出需要链式处理
if speed > 2.0:
audio_filter = "atempo=2.0,atempo=" + str(speed / 2.0)
elif speed < 0.5:
audio_filter = "atempo=0.5,atempo=" + str(speed / 0.5)
else:
audio_filter = f"atempo={speed}"
cmd = [
FFMPEG_PATH, "-y",
"-i", video_path,
"-vf", video_filter,
"-af", audio_filter,
"-c:v", "libx264",
"-preset", "fast",
"-crf", "23",
"-c:a", "aac",
output_path
]
_run_ffmpeg(cmd)
logger.info(f"Speed changed to {speed}x: {output_path}")
return output_path

491
modules/image_gen.py Normal file
View File

@@ -0,0 +1,491 @@
"""
连贯生图模块 (Volcengine Doubao)
负责根据分镜脚本和原始素材生成一系列连贯的分镜图片
"""
import base64
import logging
import os
import time
import requests
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from PIL import Image
import io
from modules import storage
import config
logger = logging.getLogger(__name__)
class ImageGenerator:
    """Coherent storyboard image generator (Volcengine provider).

    Turns per-scene visual prompts plus reference images into a series of
    visually consistent storyboard stills.  Three backends are supported:
    Volcengine Doubao, Gemini (Wuyin Keji relay), and Shubiaobiao.
    """

    def __init__(self):
        # NOTE(review): self.api_key and self.model are stored, but the
        # request code below reads config.VOLC_API_KEY directly -- confirm.
        self.api_key = config.VOLC_API_KEY
        # Endpoint: https://ark.cn-beijing.volces.com/api/v3/images/generations
        self.endpoint = f"https://ark.cn-beijing.volces.com/api/v3/images/generations"
        self.model = config.IMAGE_MODEL_ID

    def _encode_image(self, image_path: str) -> str:
        """Read an image, downscale to <=1024px, and return base64 JPEG.

        Returns an empty string when the file cannot be processed.
        """
        try:
            with Image.open(image_path) as img:
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                max_size = 1024
                if max(img.size) > max_size:
                    img.thumbnail((max_size, max_size), Image.LANCZOS)
                buffer = io.BytesIO()
                img.save(buffer, format="JPEG", quality=80)
                return base64.b64encode(buffer.getvalue()).decode('utf-8')
        except Exception as e:
            logger.error(f"Error processing image {image_path}: {e}")
            return ""

    def generate_single_scene_image(
        self,
        scene: Dict[str, Any],
        original_image_path: Any,
        previous_image_path: Optional[str] = None,
        model_provider: str = "shubiaobiao",  # "shubiaobiao", "gemini", "doubao"
        visual_anchor: str = ""  # visual anchor, force-prepended to the prompt
    ) -> Optional[str]:
        """
        Generate a single storyboard image (public entry point).

        Args:
            scene: scene dict; reads "id" and "visual_prompt"
            original_image_path: product reference image path(s) -- str or list
            previous_image_path: previous scene's image, for continuity
            model_provider: which backend to dispatch to
            visual_anchor: product-appearance anchor text for consistency

        Returns:
            Local path of the generated image.

        Raises:
            Re-raises any backend error; PermissionError is logged as critical.
        """
        scene_id = scene["id"]
        visual_prompt = scene.get("visual_prompt", "")
        # Force-prepend the visual anchor so the product's look stays
        # consistent across generated scenes.
        if visual_anchor and visual_anchor not in visual_prompt:
            visual_prompt = f"[{visual_anchor}] {visual_prompt}"
            logger.info(f"Scene {scene_id}: Prepended visual_anchor to prompt")
        logger.info(f"Generating image for Scene {scene_id} (Provider: {model_provider})...")
        input_images = []
        # Handle original_image_path (can be str or list)
        if isinstance(original_image_path, list):
            input_images.extend(original_image_path)
        elif isinstance(original_image_path, str) and original_image_path:
            input_images.append(original_image_path)
        if previous_image_path:
            input_images.append(previous_image_path)
        try:
            output_path = self._generate_single_image(
                prompt=visual_prompt,
                reference_images=input_images,
                output_filename=f"scene_{scene_id}_{int(time.time())}.png",
                provider=model_provider
            )
            if output_path:
                return output_path
            else:
                raise RuntimeError(f"Image generation returned empty for Scene {scene_id}")
        except PermissionError as e:
            logger.error(f"Critical API Error for Scene {scene_id}: {e}")
            raise e
        except Exception as e:
            logger.error(f"Image generation failed for Scene {scene_id}: {e}")
            raise e

    def generate_group_images_doubao(
        self,
        scenes: List[Dict[str, Any]],
        reference_images: List[str],
        visual_anchor: str = ""  # visual anchor for cross-scene consistency
    ) -> Dict[int, str]:
        """
        Doubao group (batch) image generation: concatenates all scene
        prompts into one request and generates the whole set at once.

        Args:
            scenes: scene dicts (reads "id" and "visual_prompt")
            reference_images: local reference image paths (uploaded first)
            visual_anchor: anchor text placed in the global prompt section

        Returns:
            Mapping of scene id -> local image path.  Scenes the API did
            not return an image for are simply missing from the mapping.
        """
        logger.info("Starting Doubao Group Image Generation...")
        # 1. Join the per-scene prompts.
        # Format: "Global: [Visual Anchor] ... | S1: ... | S2: ..."
        scene_prompts = []
        for scene in scenes:
            # Per-scene visual prompt
            p = scene.get("visual_prompt", "")
            scene_prompts.append(f"S{scene['id']}:{p}")
        combined_scenes_text = " | ".join(scene_prompts)
        # Build the combined prompt -- visual_anchor goes in the Global part.
        global_context = f"[{visual_anchor}] Consistent product appearance & style." if visual_anchor else "Consistent product appearance & style."
        combined_prompt = (
            f"Global: {global_context}\n"
            f"{combined_scenes_text}\n"
            "Req: 1 img per scene. Follow specific angles."
        )
        logger.info(f"Visual Anchor applied to group prompt: {visual_anchor[:50]}..." if visual_anchor else "No visual_anchor")
        # Record the prompt length for reference.
        logger.info(f"Doubao Group Prompt Length: {len(combined_prompt)} chars")
        # 2. Build the payload.
        payload = {
            "model": config.DOUBAO_IMG_MODEL,
            "prompt": combined_prompt,
            "sequential_image_generation": "auto",  # enable group generation
            "sequential_image_generation_options": {
                "max_images": len(scenes)  # cap the number of images
            },
            "response_format": "url",
            "size": "1440x2560",
            "stream": False,
            "watermark": False
        }
        # 3. Upload reference images and attach their URLs.
        img_urls = []
        if reference_images:
            for ref_path in reference_images:
                if os.path.exists(ref_path):
                    try:
                        url = storage.upload_file(ref_path)
                        if url: img_urls.append(url)
                    except Exception as e:
                        logger.warning(f"Failed to upload ref image {ref_path}: {e}")
        if img_urls:
            payload["image_urls"] = img_urls
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {config.VOLC_API_KEY}"
        }
        try:
            logger.info(f"Submitting Doubao Group Request (Scenes: {len(scenes)})...")
            resp = requests.post(self.endpoint, json=payload, headers=headers, timeout=240)
            resp.raise_for_status()
            data = resp.json()
            results = {}
            if "data" in data:
                items = data["data"]
                logger.info(f"Doubao returned {len(items)} images.")
                # Map the returned images back onto the scenes,
                # assuming the API preserves scene order.
                for i, item in enumerate(items):
                    if i < len(scenes):
                        scene_id = scenes[i]["id"]
                        image_url = item.get("url")
                        if image_url:
                            # Download
                            img_resp = requests.get(image_url, timeout=60)
                            output_path = config.TEMP_DIR / f"scene_{scene_id}_{int(time.time())}.png"
                            with open(output_path, "wb") as f:
                                f.write(img_resp.content)
                            results[scene_id] = str(output_path)
            return results
        except Exception as e:
            logger.error(f"Doubao Group Generation Failed: {e}")
            raise e

    def _generate_single_image(
        self,
        prompt: str,
        reference_images: List[str],
        output_filename: str,
        provider: str = "shubiaobiao"
    ) -> Optional[str]:
        """Unified dispatch to the provider-specific implementations."""
        if provider == "doubao":
            return self._generate_single_image_doubao(prompt, reference_images, output_filename)
        elif provider == "gemini":
            return self._generate_single_image_gemini(prompt, reference_images, output_filename)
        else:
            return self._generate_single_image_shubiao(prompt, reference_images, output_filename)

    def _generate_single_image_doubao(
        self,
        prompt: str,
        reference_images: List[str],
        output_filename: str
    ) -> Optional[str]:
        """Generate one image via Volcengine Doubao (Image API).

        Reference images are uploaded to R2 first and passed as URLs.
        Returns the local path of the downloaded image.
        """
        # 1. Upload all reference images to R2
        img_urls = []
        if reference_images:
            for ref_path in reference_images:
                if os.path.exists(ref_path):
                    try:
                        url = storage.upload_file(ref_path)
                        if url:
                            img_urls.append(url)
                            logger.info(f"Uploaded Doubao ref image: {url}")
                    except Exception as e:
                        logger.warning(f"Failed to upload Doubao ref image {ref_path}: {e}")
        payload = {
            "model": config.DOUBAO_IMG_MODEL,
            "prompt": prompt,
            "sequential_image_generation": "disabled",
            "response_format": "url",
            "size": "1440x2560",
            "stream": False,
            "watermark": False
        }
        if img_urls:
            payload["image_urls"] = img_urls
            logger.info(f"Doubao Image Payload: prompt='{prompt[:20]}...', image_urls={len(img_urls)}")
        else:
            logger.info(f"Doubao Image Payload: prompt='{prompt[:20]}...', no reference images")
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {config.VOLC_API_KEY}"
        }
        try:
            logger.info(f"Submitting to Doubao Image: {self.endpoint}")
            resp = requests.post(self.endpoint, json=payload, headers=headers, timeout=180)
            if resp.status_code != 200:
                msg = f"Doubao Image Failed ({resp.status_code}): {resp.text}"
                logger.error(msg)
                raise RuntimeError(msg)
            data = resp.json()
            if "data" in data and len(data["data"]) > 0:
                image_url = data["data"][0].get("url")
                if image_url:
                    # Download the generated image to the temp dir.
                    img_resp = requests.get(image_url, timeout=60)
                    img_resp.raise_for_status()
                    output_path = config.TEMP_DIR / output_filename
                    with open(output_path, "wb") as f:
                        f.write(img_resp.content)
                    return str(output_path)
            raise RuntimeError(f"No image URL in Doubao response: {data}")
        except Exception as e:
            logger.error(f"Doubao Gen Failed: {e}")
            raise e

    def _generate_single_image_shubiao(
        self,
        prompt: str,
        reference_images: List[str],
        output_filename: str
    ) -> Optional[str]:
        """Generate one image via api2img.shubiaobiao.com
        (synchronous response carrying base64 image data)."""
        # Prepare reference images as inline base64 parts.
        parts = [{"text": prompt}]
        # Strictly de-duplicate reference images, preserving order.
        valid_refs = []
        if reference_images:
            for p in reference_images:
                if p and os.path.exists(p) and p not in valid_refs:
                    valid_refs.append(p)
        logger.info(f"[Shubiaobiao] Input reference images ({len(valid_refs)}): {valid_refs}")
        if valid_refs:
            for ref_path in valid_refs:
                try:
                    encoded = self._encode_image(ref_path)
                    if encoded:
                        parts.append({
                            "inlineData": {
                                "mimeType": "image/jpeg",
                                "data": encoded
                            }
                        })
                except Exception as e:
                    logger.error(f"Failed to encode image {ref_path}: {e}")
        logger.info(f"[Shubiaobiao] Final payload parts count: {len(parts)} (1 prompt + {len(parts)-1} images)")
        payload = {
            "contents": [{
                "role": "user",
                "parts": parts
            }],
            "generationConfig": {
                "responseModalities": ["IMAGE"],
                "imageConfig": {
                    "aspectRatio": "9:16",
                    "imageSize": "2K"
                }
            }
        }
        endpoint = f"{config.SHUBIAOBIAO_IMG_BASE_URL}/v1beta/models/{config.SHUBIAOBIAO_IMG_MODEL_NAME}:generateContent"
        headers = {
            "x-goog-api-key": config.SHUBIAOBIAO_IMG_KEY,
            "Content-Type": "application/json"
        }
        try:
            logger.info(f"Submitting to Shubiaobiao Img: {endpoint}")
            resp = requests.post(endpoint, json=payload, headers=headers, timeout=120)
            if resp.status_code != 200:
                msg = f"Shubiaobiao 提交失败 ({resp.status_code}): {resp.text}"
                logger.error(msg)
                raise RuntimeError(msg)
            data = resp.json()
            # Locate the base64 image in the response parts.
            img_b64 = None
            candidates = data.get("candidates") or []
            if candidates:
                content_parts = candidates[0].get("content", {}).get("parts", [])
                for part in content_parts:
                    inline = part.get("inlineData") if isinstance(part, dict) else None
                    if inline and inline.get("data"):
                        img_b64 = inline["data"]
                        break
            if not img_b64:
                msg = f"Shubiaobiao 响应缺少图片数据: {data}"
                logger.error(msg)
                raise RuntimeError(msg)
            output_path = config.TEMP_DIR / output_filename
            with open(output_path, "wb") as f:
                f.write(base64.b64decode(img_b64))
            logger.info(f"Shubiaobiao Generation Success: {output_path}")
            return str(output_path)
        except Exception as e:
            logger.error(f"Shubiaobiao Generation Exception: {e}")
            raise

    def _generate_single_image_gemini(
        self,
        prompt: str,
        reference_images: List[str],
        output_filename: str
    ) -> Optional[str]:
        """Generate one image via Gemini (Wuyin Keji / NanoBanana-Pro):
        submit a task, then poll until it succeeds, fails, or times out."""
        # 1. Build the payload.
        payload = {
            "prompt": prompt,
            "aspectRatio": "9:16",
            "imageSize": "2K"
        }
        # Reference images enable image-to-image generation.
        if reference_images:
            valid_paths = []
            seen = set()
            for p in reference_images:
                if p and os.path.exists(p) and p not in seen:
                    valid_paths.append(p)
                    seen.add(p)
            if valid_paths:
                img_urls = []
                for ref_path in valid_paths:
                    try:
                        url = storage.upload_file(ref_path)
                        if url:
                            img_urls.append(url)
                            logger.info(f"Uploaded ref image: {url}")
                    except Exception as e:
                        logger.warning(f"Error uploading ref image {ref_path}: {e}")
                if img_urls:
                    payload["img_url"] = img_urls
                    logger.info(f"Using {len(img_urls)} reference images for Gemini Img2Img")
        headers = {
            "Authorization": config.GEMINI_IMG_KEY,
            # NOTE(review): "charset:utf-8" looks like a typo for
            # "charset=utf-8" -- confirm the server tolerates it.
            "Content-Type": "application/json;charset:utf-8"
        }
        # 2. Submit the task.
        try:
            logger.info(f"Submitting to Gemini: {config.GEMINI_IMG_API_URL}")
            resp = requests.post(config.GEMINI_IMG_API_URL, json=payload, headers=headers, timeout=30)
            if resp.status_code != 200:
                msg = f"Gemini 提交失败 ({resp.status_code}): {resp.text}"
                logger.error(msg)
                raise RuntimeError(msg)
            data = resp.json()
            if data.get("code") != 200:
                msg = f"Gemini 返回错误: {data}"
                logger.error(msg)
                raise RuntimeError(msg)
            task_id = data.get("data", {}).get("id")
            if not task_id:
                raise RuntimeError(f"Gemini 响应缺少 task id: {data}")
            logger.info(f"Gemini Task Submitted, ID: {task_id}")
            # 3. Poll task status (up to 60 tries, 2s apart).
            max_retries = 60
            for i in range(max_retries):
                time.sleep(2)
                poll_url = f"{config.GEMINI_IMG_DETAIL_URL}?key={config.GEMINI_IMG_KEY}&id={task_id}"
                try:
                    poll_resp = requests.get(poll_url, headers=headers, timeout=30)
                except requests.Timeout:
                    continue
                except Exception as e:
                    # Transient polling error: retry on the next iteration.
                    continue
                if poll_resp.status_code != 200:
                    continue
                poll_data = poll_resp.json()
                if poll_data.get("code") != 200:
                    raise RuntimeError(f"Gemini 轮询返回错误: {poll_data}")
                result_data = poll_data.get("data", {}) or {}
                status = result_data.get("status")  # 0: queued, 1: generating, 2: success, 3: failed
                if status == 2:
                    image_url = result_data.get("image_url")
                    if not image_url:
                        raise RuntimeError("Gemini 成功但缺少 image_url")
                    logger.info(f"Gemini Generation Success: {image_url}")
                    img_resp = requests.get(image_url, timeout=60)
                    img_resp.raise_for_status()
                    output_path = config.TEMP_DIR / output_filename
                    with open(output_path, "wb") as f:
                        f.write(img_resp.content)
                    return str(output_path)
                if status == 3:
                    fail_reason = result_data.get("fail_reason", "Unknown")
                    raise RuntimeError(f"Gemini 生成失败: {fail_reason}")
            raise RuntimeError("Gemini 生成超时")
        except Exception as e:
            logger.error(f"Gemini Generation Exception: {e}")
            raise

60
modules/ingest.py Normal file
View File

@@ -0,0 +1,60 @@
"""
MatchMe Studio - Ingest Module (Video Processing)
"""
import cv2
import os
import logging
from pathlib import Path
from typing import List, Tuple
import config
from modules import storage
logger = logging.getLogger(__name__)
def process_uploaded_video(video_path: str) -> Tuple[List[str], str]:
    """
    Process an uploaded video:
    1. Upload the raw video to R2.
    2. Extract 3 keyframes (at 10%, 50%, 90% of the frame count).
    3. Upload each keyframe to R2.

    Args:
        video_path: local path of the uploaded video

    Returns:
        (frame_urls, video_url): R2 URLs of the extracted keyframes and
        the R2 URL of the raw video.  (Fixed: the old docstring claimed
        local frame paths were returned, but the function has always
        returned the uploaded frame URLs.)

    Raises:
        FileNotFoundError: if the video does not exist
        RuntimeError: if the raw video upload fails
        IOError: if OpenCV cannot open the video
    """
    if not Path(video_path).exists():
        raise FileNotFoundError(f"Video not found: {video_path}")
    logger.info(f"Processing video: {video_path}")

    # 1. Upload the raw video first so downstream steps can reference it.
    video_url = storage.upload_file(video_path)
    if not video_url:
        raise RuntimeError("Failed to upload video to R2")

    # 2. Extract and upload keyframes.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise IOError(f"Cannot open video: {video_path}")
    frame_urls = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_indices = [
            int(total_frames * 0.1),
            int(total_frames * 0.5),
            int(total_frames * 0.9)
        ]
        for i, idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                # Seek/decode failure: skip this frame rather than abort.
                continue
            frame_name = f"frame_{Path(video_path).stem}_{i}.jpg"
            frame_path = config.TEMP_DIR / frame_name
            cv2.imwrite(str(frame_path), frame)
            # Upload frame to R2 immediately
            frame_url = storage.upload_file(str(frame_path))
            if frame_url:
                frame_urls.append(frame_url)
    finally:
        # Release the capture even if extraction or upload raises
        # (the original leaked it on any exception).
        cap.release()
    logger.info(f"Extracted and uploaded {len(frame_urls)} frames")
    return frame_urls, video_url

151
modules/project.py Normal file
View File

@@ -0,0 +1,151 @@
"""
MatchMe Studio - Project State Management (R2 Persistence)
"""
import json
import logging
import uuid
from datetime import datetime
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, asdict, field
import config
from modules import storage
logger = logging.getLogger(__name__)
@dataclass
class Scene:
    # One storyboard scene of a project.
    # NOTE(review): field semantics below are inferred from names -- the
    # fields are not read elsewhere in this file; confirm against the
    # script-generation pipeline.
    id: int  # scene index
    duration: int = 5  # scene length in seconds
    timeline: str = ""  # position of the scene on the overall timeline
    keyframe: Dict[str, str] = field(default_factory=dict)  # keyframe description(s)
    camera_movement: str = ""
    story_beat: str = ""
    voiceover: str = ""
    rhythm: Dict[str, Any] = field(default_factory=dict)
    image_url: str = ""  # generated still for this scene
    video_url: str = ""  # generated clip for this scene
@dataclass
class Project:
    # Full pipeline state for one video-generation project.
    # Persisted to R2 as JSON via save_project()/load_project().
    id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])  # short unique id
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())
    status: str = "draft"  # draft | analyzing | scripting | imaging | video | rendering | done
    # Step 0: Input
    input_mode: str = ""  # text | images | video
    prompt: str = ""
    image_urls: List[str] = field(default_factory=list)
    video_url: str = ""
    asr_text: str = ""  # transcript of the uploaded video, if any
    # Step 1: Analysis
    analysis: str = ""
    questions: List[Dict[str, Any]] = field(default_factory=list)
    answers: Dict[str, str] = field(default_factory=dict)
    # Step 2: Script
    hook: str = ""
    scenes: List[Dict[str, Any]] = field(default_factory=list)  # scene dicts
    cta: str = ""
    # Step 6: Final
    final_video_url: str = ""
    bgm_url: str = ""
def save_project(project: Project) -> str:
    """
    Persist the project state to R2 as a JSON document.

    The JSON is first written to a temp file, then uploaded to
    ``projects/<id>.json`` in the configured bucket.

    Returns:
        The project id.

    Raises:
        Re-raises any upload error after logging it.
    """
    payload = json.dumps(asdict(project), ensure_ascii=False, indent=2)

    # Stage the document on disk for boto3's upload_file.
    temp_path = config.TEMP_DIR / f"project_{project.id}.json"
    with open(temp_path, "w", encoding="utf-8") as f:
        f.write(payload)

    object_name = f"projects/{project.id}.json"
    s3 = storage.get_s3_client()
    try:
        s3.upload_file(
            str(temp_path),
            config.R2_BUCKET_NAME,
            object_name,
            ExtraArgs={'ContentType': 'application/json'}
        )
    except Exception as e:
        logger.error(f"Failed to save project: {e}")
        raise
    logger.info(f"Project {project.id} saved to R2")
    return project.id
def load_project(project_id: str) -> Optional[Project]:
    """
    Load a project state JSON from R2.

    Returns:
        The reconstructed Project, or None when the download, parse,
        or reconstruction fails (failure is logged as a warning).
    """
    object_name = f"projects/{project_id}.json"
    temp_path = config.TEMP_DIR / f"project_{project_id}.json"
    s3 = storage.get_s3_client()
    try:
        s3.download_file(config.R2_BUCKET_NAME, object_name, str(temp_path))
        with open(temp_path, "r", encoding="utf-8") as f:
            raw = json.load(f)
        # Rebuild field by field so unknown keys in older/newer JSON
        # documents are ignored.
        project = Project(
            id=raw.get("id", project_id),
            created_at=raw.get("created_at", ""),
            status=raw.get("status", "draft"),
            input_mode=raw.get("input_mode", ""),
            prompt=raw.get("prompt", ""),
            image_urls=raw.get("image_urls", []),
            video_url=raw.get("video_url", ""),
            asr_text=raw.get("asr_text", ""),
            analysis=raw.get("analysis", ""),
            questions=raw.get("questions", []),
            answers=raw.get("answers", {}),
            hook=raw.get("hook", ""),
            scenes=raw.get("scenes", []),
            cta=raw.get("cta", ""),
            final_video_url=raw.get("final_video_url", ""),
            bgm_url=raw.get("bgm_url", "")
        )
    except Exception as e:
        logger.warning(f"Failed to load project {project_id}: {e}")
        return None
    logger.info(f"Project {project_id} loaded from R2")
    return project
def create_project() -> Project:
    """Create and return a fresh Project with a unique id."""
    new_project = Project()
    logger.info(f"Created new project: {new_project.id}")
    return new_project

390
modules/script_gen.py Normal file
View File

@@ -0,0 +1,390 @@
"""
脚本生成模块 (Gemini-3-Pro)
负责解析商品信息,生成分镜脚本
"""
import base64
import json
import logging
import os
import requests
from typing import Dict, Any, List, Optional
from pathlib import Path
import config
from modules.db_manager import db
logger = logging.getLogger(__name__)
class ScriptGenerator:
    """Storyboard script generator.

    Builds a multimodal prompt (product info + reference images) and asks
    an LLM (Gemini-3-Pro via Shubiaobiao, or Volcengine Doubao) for a
    structured JSON shooting script.
    """

    def __init__(self):
        self.api_key = config.SHUBIAOBIAO_KEY
        # NOTE: the API path may need adjusting for gemini-3-pro-preview.
        # Per the demo: https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent
        # We assume the base URL is v1beta/models/ for now.
        self.endpoint = "https://api.shubiaobiao.cn/v1beta/models/gemini-3-pro-preview:generateContent"
        # Default System Prompt, used unless a custom one exists in the DB.
        self.default_system_prompt = """
你是一个专业的抖音电商短视频导演。请根据提供的商品信息和图片,设计一个高转化率的商品详情页首图视频脚本。
## 目标
- 提升商品详情页的 GPM 和下单转化率
- 视频时长 9-12 秒 (由 3-4 个分镜组成)
- **每个分镜时长固定为 3 秒** (duration: 3),不要超过 3 秒
- 必须包含:目标人群分析、卖点提炼、分镜设计
## 分镜设计原则
1. **单分镜单主体**:每个分镜聚焦一个视觉主体或动作,避免复杂运镜,因为 AI 生视频在长时间(>3秒容易出现画面异常。
2. **旁白跨分镜**:一段完整的旁白/卖点可以跨越多个分镜。在 voiceover_timeline 中,通过 start_time 和 duration (秒) 控制旁白的绝对时间位置,无需与分镜一一对应。
3. **节奏感**:分镜之间保持视觉连贯,通过景别变化(特写 -> 中景 -> 全景)制造节奏。
4. **语速控制**:旁白语速约 4 字/秒12字旁白约需 3 秒。
## 输出格式要求 (JSON)
必须严格遵守以下 JSON 结构:
{
  "product_name": "商品名称",
  "visual_anchor": "商品视觉锚点:材质+颜色+形状+包装特征(用于保持生图一致性)",
  "selling_points": ["卖点1", "卖点2"],
  "target_audience": "目标人群描述",
  "video_style": "视频风格关键词",
  "bgm_style": "BGM风格关键词",
  "voiceover_timeline": [
    {
      "id": 1,
      "text": "旁白文案片段1可横跨多个分镜",
      "subtitle": "字幕文案1 (简短有力)",
      "start_time": 0.0,
      "duration": 3.0
    },
    {
      "id": 2,
      "text": "旁白文案片段2",
      "subtitle": "字幕文案2",
      "start_time": 3.5,
      "duration": 2.5
    }
  ],
  "scenes": [
    {
      "id": 1,
      "duration": 3,
      "visual_prompt": "详细的画面描述用于AI生图包含主体、背景、构图、光影。英文描述。",
      "video_prompt": "详细的动效描述用于AI图生视频。英文描述。",
      "fancy_text": {
        "text": "花字文案 (最多6字)",
        "style": "highlight",
        "position": "center",
        "start_time": 0.5,
        "duration": 2.0
      }
    }
  ]
}
## 注意事项
1. **visual_prompt**:
   - 必须是英文。
   - 描述要具体,例如 "Close-up shot of a hair clip, soft lighting, minimalist background".
   - **CRITICAL**: 禁止 AI 额外生成装饰性文字、标语、水印。但必须保留商品包装自带的文字和 Logo这是商品真实外观的一部分
   - 正确写法: "Product front view, keep original packaging design --no added text --no watermarks"
   - **EMPHASIS**: Strictly follow the appearance of the product in the reference images.
2. **video_prompt**: 必须是英文,描述动作,例如 "Slow zoom in, the hair clip rotates slightly"。注意保持动作简单,避免复杂运镜和人体动作。
3. **voiceover_timeline**:
   - 这是整个视频的旁白和字幕时间轴,独立于分镜。
   - `start_time` 是旁白开始的绝对时间 (秒)`duration` 是旁白持续时长 (秒)。
   - **一段旁白可以横跨多个分镜**,例如:总时长 9 秒 (3 个分镜),一段旁白从 start_time=0duration=5则覆盖前两个分镜。
   - 两段旁白之间留 0.3-0.5 秒间隙(气口)。
4. **fancy_text**:
   - 花字要精简(最多 6 字),突出卖点。
   - **Style Selection**:
     - `highlight`: 默认样式,适合通用卖点 (Yellow/Black)。
     - `warning`: 强调痛点或食欲 (Red/White)。
     - `price`: 价格显示 (Big Red)。
     - `bubble`: 旁白补充或用户评价 (Bubble)。
     - `minimal`: 高级感,适合时尚类 (Thin/White)。
     - `tech`: 数码类 (Cyan/Glow)。
   - `position` 默认 `center`,可选 top/bottom/top-left/bottom-right 等。
5. **场景连贯性**: 确保分镜之间的逻辑和视觉风格连贯。每个分镜 duration 必须为 3。
"""

    def _encode_image(self, image_path: str) -> str:
        """Read an image file and return its raw bytes base64-encoded."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def generate_script(
        self,
        product_name: str,
        product_info: Dict[str, Any],
        image_paths: List[str] = None,
        model_provider: str = "shubiaobiao"  # "shubiaobiao" or "doubao"
    ) -> Dict[str, Any]:
        """
        Generate a storyboard script.

        Args:
            product_name: product display name
            product_info: product attributes; "style_hint" and
                "uploaded_images" keys get special handling
            image_paths: optional product reference images (multimodal input)
            model_provider: which LLM backend to use

        Returns:
            The parsed script dict (with a "_debug" key carrying prompts
            and raw output), or None on any failure.
        """
        logger.info(f"Generating script for: {product_name} (Provider: {model_provider})")
        # 1. Build the prompts (a DB-stored prompt overrides the default).
        system_prompt = db.get_config("prompt_script_gen", self.default_system_prompt)
        user_prompt = self._build_user_prompt(product_name, product_info)
        # Branch for Doubao
        if model_provider == "doubao":
            return self._generate_script_doubao(system_prompt, user_prompt, image_paths)
        # ... Existing Shubiaobiao Logic ...
        # Debug: report whether a custom prompt is in effect.
        if system_prompt != self.default_system_prompt:
            logger.info("Using CUSTOM system prompt from database")
        else:
            logger.info("Using DEFAULT system prompt")
        # 2. Build the request payload (Gemini/Shubiaobiao format).
        contents = []
        # User message parts
        user_parts = [{"text": user_prompt}]
        # Attach images (multimodal input).
        if image_paths:
            for path in image_paths[:10]:  # cap at 10; Gemini-3-Pro supports multiple images
                if Path(path).exists():
                    try:
                        b64_img = self._encode_image(path)
                        user_parts.append({
                            "inline_data": {
                                "mime_type": "image/jpeg",  # assumed JPG/PNG
                                "data": b64_img
                            }
                        })
                    except Exception as e:
                        logger.warning(f"Failed to encode image {path}: {e}")
        contents.append({
            "role": "user",
            "parts": user_parts
        })
        # System instruction: Gemini accepts either a system instruction or
        # the system text placed ahead of the user prompt.  Note that
        # `contents` already references this same list object, so inserting
        # here still puts the system prompt first in the payload.
        user_parts.insert(0, {"text": system_prompt})
        payload = {
            "contents": contents,
            "generationConfig": {
                "response_mime_type": "application/json",
                "temperature": 0.7
            }
        }
        headers = {
            "x-goog-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        # 3. Call the API.
        try:
            response = requests.post(self.endpoint, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            result = response.json()
            # 4. Parse the result.
            if "candidates" in result and result["candidates"]:
                content_text = result["candidates"][0]["content"]["parts"][0]["text"]
                # Extract the JSON part (handles Markdown code fences or plain text).
                script_json = self._extract_json_from_response(content_text)
                if script_json is None:
                    logger.error(f"Failed to extract JSON from response: {content_text[:500]}...")
                    return None
                final_script = self._validate_and_fix_script(script_json)
                # Attach debug info (includes the raw model output).
                final_script["_debug"] = {
                    "system_prompt": system_prompt,
                    "user_prompt": user_prompt,
                    "raw_output": content_text,
                    "provider": "shubiaobiao"
                }
                return final_script
            else:
                logger.error(f"No candidates in response: {result}")
                return None
        except Exception as e:
            logger.error(f"Script generation failed: {e}")
            if 'response' in locals():
                logger.error(f"Response content: {response.text}")
            return None

    def _generate_script_doubao(
        self,
        system_prompt: str,
        user_prompt: str,
        image_paths: List[str]
    ) -> Dict[str, Any]:
        """Doubao script generation (multimodal, OpenAI-compatible API)."""
        # User Provided: https://ark.cn-beijing.volces.com/api/v3/responses
        # But for 'responses' API, structure is specific. Let's try to match user's curl format exactly but adapting content.
        # User curl uses "input": [{"role": "user", "content": [{"type": "input_image"...}, {"type": "input_text"...}]}]
        endpoint = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"  # Recommend standard Chat API first as 'responses' is usually non-standard or older
        # However, user explicitly provided /responses curl. Let's try to stick to standard Chat Completions first because Doubao Pro 1.5 is OpenAI compatible.
        # If that fails or if user insists on the specific structure, we can adapt.
        # Volcengine 'ep-...' models are usually served via standard /chat/completions.
        # Let's try standard OpenAI format which Doubao supports perfectly.
        messages = [
            {"role": "system", "content": system_prompt}
        ]
        user_content = []
        # Add Images (Doubao Vision supports image_url)
        if image_paths:
            for path in image_paths[:5]:  # Limit
                if os.path.exists(path):
                    # Standard OpenAI format supports base64 data URLs:
                    # "image_url": {"url": "data:image/jpeg;base64,..."}
                    try:
                        b64_img = self._encode_image(path)
                        user_content.append({
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{b64_img}"
                            }
                        })
                    except Exception as e:
                        logger.warning(f"Failed to encode image for Doubao: {e}")
        # Add Text
        user_content.append({"type": "text", "text": user_prompt})
        messages.append({
            "role": "user",
            "content": user_content
        })
        payload = {
            "model": config.DOUBAO_SCRIPT_MODEL,
            "messages": messages,
            "stream": False,
            # "response_format": {"type": "json_object"}  # Try enabling JSON mode if supported
        }
        headers = {
            "Authorization": f"Bearer {config.VOLC_API_KEY}",
            "Content-Type": "application/json"
        }
        try:
            # Try standard chat/completions first
            resp = requests.post(endpoint, headers=headers, json=payload, timeout=120)
            if resp.status_code != 200:
                # If 404, maybe endpoint is wrong, try the user's 'responses' endpoint?
                # But 'responses' usually implies a different payload structure.
                logger.warning(f"Doubao Chat API failed ({resp.status_code}), trying legacy/custom endpoint...")
                # Fallback to user provided structure if needed (implement later if this fails)
                resp.raise_for_status()
            result = resp.json()
            content_text = result["choices"][0]["message"]["content"]
            script_json = self._extract_json_from_response(content_text)
            if script_json is None:
                logger.error(f"Failed to extract JSON from Doubao response: {content_text[:500]}...")
                return None
            final_script = self._validate_and_fix_script(script_json)
            final_script["_debug"] = {
                "system_prompt": system_prompt,
                "user_prompt": user_prompt,
                "raw_output": content_text,
                "provider": "doubao"
            }
            return final_script
        except Exception as e:
            logger.error(f"Doubao script generation failed: {e}")
            if 'resp' in locals():
                logger.error(f"Response: {resp.text}")
            return None

    def _extract_json_from_response(self, text: str) -> Optional[Dict]:
        """
        Extract a JSON object from an API response.

        Supports:
        1. A pure JSON response
        2. JSON wrapped in a Markdown code block (```json ... ```)
        3. JSON embedded in free text (from the first '{' to the last '}')
        """
        import re
        # Method 1: try parsing directly (pure JSON case).
        try:
            return json.loads(text.strip())
        except json.JSONDecodeError:
            pass
        # Method 2: extract a ```json ... ``` code block.
        json_block_match = re.search(r'```json\s*([\s\S]*?)\s*```', text)
        if json_block_match:
            try:
                return json.loads(json_block_match.group(1))
            except json.JSONDecodeError as e:
                logger.warning(f"JSON block found but parse failed: {e}")
        # Method 3: extract a plain ``` ... ``` code block (no json tag).
        code_block_match = re.search(r'```\s*([\s\S]*?)\s*```', text)
        if code_block_match:
            try:
                return json.loads(code_block_match.group(1))
            except json.JSONDecodeError:
                pass
        # Method 4: take everything between the first '{' and the last '}'.
        first_brace = text.find('{')
        last_brace = text.rfind('}')
        if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
            try:
                return json.loads(text[first_brace:last_brace + 1])
            except json.JSONDecodeError as e:
                logger.warning(f"Brace extraction failed: {e}")
        return None

    def _build_user_prompt(self, product_name: str, product_info: Dict[str, Any]) -> str:
        """Render product info (plus optional merchant style hint) into
        the user prompt text."""
        # Pull out the merchant's style preference.
        style_hint = product_info.get("style_hint", "")
        # Drop fields that should not appear in the prompt.
        filtered_info = {k: v for k, v in product_info.items() if k not in ["uploaded_images", "style_hint"]}
        info_str = "\n".join([f"- {k}: {v}" for k, v in filtered_info.items()])
        prompt = f"""
商品名称:{product_name}
商品信息:
{info_str}
"""
        if style_hint:
            prompt += f"""
## 商家特别要求
{style_hint}
"""
        prompt += "\n请根据以上信息设计视频脚本。"
        return prompt

    def _validate_and_fix_script(self, script: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and repair the script structure (minimal check: make
        sure the required "scenes" key exists)."""
        if "scenes" not in script:
            script["scenes"] = []
        return script

84
modules/storage.py Normal file
View File

@@ -0,0 +1,84 @@
"""
MatchMe Studio - Storage Module (R2)
"""
import os
import logging
import time
import uuid
import boto3
from botocore.exceptions import NoCredentialsError
from pathlib import Path
from typing import Optional
import config
logger = logging.getLogger(__name__)
def get_s3_client():
    """Create a boto3 S3 client bound to the Cloudflare R2 endpoint.

    Logs and re-raises any construction failure (bad credentials/config).
    """
    try:
        r2_settings = {
            "endpoint_url": config.R2_ENDPOINT,
            "aws_access_key_id": config.R2_ACCESS_KEY,
            "aws_secret_access_key": config.R2_SECRET_KEY,
            "region_name": "auto",
        }
        return boto3.client("s3", **r2_settings)
    except Exception as e:
        logger.error(f"Failed to create R2 client: {e}")
        raise
def upload_file(file_path: str) -> Optional[str]:
    """Upload a local file to R2 and return its public URL.

    The object is stored under a random UUID-based name because the
    original filename may contain Chinese/special characters that break
    URLs. Returns None when the file is missing or the upload fails.
    """
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return None
    original_name = Path(file_path).name
    ext = Path(file_path).suffix.lower() or ".bin"
    object_name = f"{uuid.uuid4().hex}{ext}"
    s3 = get_s3_client()
    try:
        logger.info(f"Uploading {original_name} to R2 as {object_name}...")
        # Derive Content-Type from the extension via the stdlib table
        # (covers png/jpeg/mp4/mp3 and many more); fall back to the
        # generic binary type for unknown extensions.
        import mimetypes
        content_type, _ = mimetypes.guess_type(object_name)
        if content_type is None:
            content_type = "application/octet-stream"
        s3.upload_file(
            file_path,
            config.R2_BUCKET_NAME,
            object_name,
            ExtraArgs={'ContentType': content_type}
        )
        public_url = f"{config.R2_PUBLIC_URL}/{object_name}"
        logger.info(f"Upload successful: {public_url}")
        return public_url
    except Exception as e:
        logger.error(f"R2 Upload Failed: {e}")
        return None
def cleanup_temp(max_age_seconds: int = 3600):
    """Best-effort removal of files in TEMP_DIR older than *max_age_seconds*."""
    logger.info("Running cleanup_temp...")
    if not config.TEMP_DIR.exists():
        return
    # Anything last modified before this instant is considered stale.
    cutoff = time.time() - max_age_seconds
    for entry in config.TEMP_DIR.iterdir():
        try:
            if entry.is_file() and entry.stat().st_mtime < cutoff:
                entry.unlink()
        except Exception as e:
            logger.warning(f"Failed to delete {entry}: {e}")

76
modules/styles.py Normal file
View File

@@ -0,0 +1,76 @@
"""
花字样式预设库
供 Design Agent 和 Renderer 使用
"""
STYLES = {
    # 1. Eye-catching emphasis (bright yellow highlight).
    "highlight": {
        "font_size": 60,
        "font_color": "#FFE66D",  # bright yellow
        "stroke": {"color": "#000000", "width": 4},
        "shadow": {"color": "#000000", "blur": 8, "offset": [4, 4], "opacity": 0.6}
    },
    # 2. Warning / pain point (white text on a red box).
    "warning": {
        "font_size": 55,
        "font_color": "#FFFFFF",
        "stroke": {"color": "#FF0000", "width": 0},  # no stroke
        "background": {
            "type": "box",
            "color": "#FF4D4F",  # red box
            "corner_radius": 12,
            "padding": [15, 25, 15, 25]  # t, r, b, l
        },
        "shadow": {"color": "#990000", "blur": 0, "offset": [0, 6], "opacity": 0.4}  # hard drop shadow for depth
    },
    # 3. Price / promotion (oversized red digits with glow).
    "price": {
        "font_size": 90,
        "font_color": "#FF2E2E",  # vivid red
        "stroke": {"color": "#FFFFFF", "width": 6},  # white outline
        "shadow": {"color": "#FF9999", "blur": 15, "offset": [0, 0], "opacity": 0.8}  # glow effect
    },
    # 4. Dialogue bubble (dark text on a rounded white box).
    "bubble": {
        "font_size": 48,
        "font_color": "#333333",
        "background": {
            "type": "box",
            "color": "#FFFFFF",
            "corner_radius": 40,  # large radius
            "padding": [20, 40, 20, 40]
        },
        "shadow": {"color": "#000000", "blur": 10, "offset": [2, 5], "opacity": 0.2}
    },
    # 5. Fashion / minimal (thin white text, light outline).
    "minimal": {
        "font_size": 65,
        "font_color": "#FFFFFF",
        "stroke": {"color": "#000000", "width": 2},
        "shadow": {"color": "#000000", "blur": 2, "offset": [2, 2], "opacity": 0.8},
        "font_family": "NotoSansSC-Regular.otf"  # may be absent; renderer falls back
    },
    # 6. Tech / futuristic (cyan with strong glow).
    "tech": {
        "font_size": 60,
        "font_color": "#00FFFF",
        "stroke": {"color": "#003333", "width": 3},
        "shadow": {"color": "#00FFFF", "blur": 20, "offset": [0, 0], "opacity": 0.9}
    }
}


def get_style(style_name: str) -> dict:
    """Look up a preset by name; unknown names fall back to "highlight"."""
    try:
        return STYLES[style_name]
    except KeyError:
        return STYLES["highlight"]

251
modules/text_renderer.py Normal file
View File

@@ -0,0 +1,251 @@
"""
通用文本渲染引擎
支持原子化设计参数,供上游 Design Agent 灵活调用
"""
import os
import hashlib
import logging
from pathlib import Path
from typing import Dict, Any, List, Tuple, Union, Optional
from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageColor
import config
from modules.styles import get_style
logger = logging.getLogger(__name__)
# On-disk cache for rendered text PNGs, keyed by md5(text + style).
CACHE_DIR = config.TEMP_DIR / "text_renderer_cache"
CACHE_DIR.mkdir(exist_ok=True)
class TextRenderer:
    """
    Generic text renderer.

    Renders a text string to a transparent PNG from atomic style
    parameters (font, stroke, shadow, background), for compositing
    as "fancy text" overlays.
    """
    def __init__(self):
        # Resolve a default font up-front so renders without an explicit
        # font_family reuse this path instead of re-probing the filesystem.
        self.default_font_path = self._resolve_font_path(None)
    def _resolve_font_path(self, font_family: Optional[str]) -> Optional[str]:
        """Resolve a usable font file path with multi-level fallback.

        Order: the given value as a path → assets/fonts lookups (with and
        without .ttf/.otf suffixes appended) → bundled project fonts →
        common macOS/Windows system fonts. Returns None when nothing
        usable is found (caller then falls back to Pillow's default font).
        """
        candidates = []
        if font_family:
            # 1. Try the value as a path on its own.
            candidates.append(font_family)
            # 2. Look it up under assets/fonts.
            candidates.append(str(config.FONTS_DIR / font_family))
            if not font_family.endswith(".ttf") and not font_family.endswith(".otf"):
                candidates.append(str(config.FONTS_DIR / f"{font_family}.ttf"))
                candidates.append(str(config.FONTS_DIR / f"{font_family}.otf"))
        # 3. Bundled project fonts.
        candidates.extend([
            str(config.FONTS_DIR / "SmileySans-Oblique.ttf"),
            str(config.FONTS_DIR / "AlibabaPuHuiTi-Bold.ttf"),
            str(config.FONTS_DIR / "AlibabaPuHuiTi-Regular.ttf"),
            str(config.FONTS_DIR / "NotoSansSC-Bold.otf"),  # if present
        ])
        # 4. System font fallbacks.
        candidates.extend([
            "/System/Library/Fonts/PingFang.ttc",
            "/System/Library/Fonts/STHeiti Medium.ttc",
            "C:/Windows/Fonts/msyh.ttc",
            "C:/Windows/Fonts/simhei.ttf",
        ])
        for path in candidates:
            if path and os.path.exists(path):
                # Sanity check: a real font file should be well over 10KB.
                try:
                    if os.path.getsize(path) > 10000:
                        return path
                except:
                    continue
        logger.warning("No valid font found, using default load_default()")
        return None
    def _get_font(self, font_path: Optional[str], size: int) -> ImageFont.FreeTypeFont:
        # Load the TrueType font at the requested size; fall back to
        # Pillow's built-in bitmap font (which ignores `size`) on failure.
        try:
            if font_path:
                return ImageFont.truetype(font_path, size)
        except Exception as e:
            logger.warning(f"Failed to load font {font_path}: {e}")
        return ImageFont.load_default()
    def _parse_color(self, color: Union[str, Tuple]) -> Tuple[int, int, int, int]:
        """Normalize a color spec to an RGBA 4-tuple.

        Accepts "#RRGGBB" strings and 3/4-tuples. Anything else — including
        named-color or 'rgba(...)' strings — falls through to opaque black.
        """
        if isinstance(color, str):
            if color.startswith("#"):
                rgb = ImageColor.getrgb(color)
                return rgb + (255,)
            # TODO: support the 'rgba(r,g,b,a)' string format
        if isinstance(color, tuple):
            if len(color) == 3:
                return color + (255,)
            return color
        return (0, 0, 0, 255)
    def render(self, text: str, style: Union[Dict[str, Any], str], cache: bool = True) -> str:
        """
        Render *text* with *style* and return the output PNG path.

        *style* may be a preset name (resolved via get_style) or a dict:
        {
            "font_family": str,
            "font_size": int,
            "font_color": str,
            "stroke": [{"color": str, "width": int}, ...],
            "shadow": {"color": str, "blur": int, "offset": [x, y], "opacity": float},
            "background": {
                "type": "box", "color": str/list, "corner_radius": int, "padding": [t, r, b, l]
            }
        }
        Results are cached on disk keyed by md5(text + style) when *cache* is True.
        """
        # 0. Resolve style presets given by name.
        if isinstance(style, str):
            style = get_style(style)
        # 1. Cache lookup.
        cache_key = hashlib.md5(f"{text}_{str(style)}".encode()).hexdigest()
        if cache:
            cache_path = CACHE_DIR / f"{cache_key}.png"
            if cache_path.exists():
                return str(cache_path)
        # 2. Basic font parameters.
        font_path = self._resolve_font_path(style.get("font_family"))
        font_size = style.get("font_size", 60)
        font = self._get_font(font_path, font_size)
        font_color = self._parse_color(style.get("font_color", "#FFFFFF"))
        # 3. Measure the text.
        dummy_draw = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
        bbox = dummy_draw.textbbox((0, 0), text, font=font)
        text_w = bbox[2] - bbox[0]
        text_h = bbox[3] - bbox[1]
        # 4. Compute total canvas size (padding + stroke + shadow margins).
        strokes = style.get("stroke", [])
        if isinstance(strokes, dict): strokes = [strokes]  # legacy single-stroke format
        max_stroke = 0
        for s in strokes:
            max_stroke = max(max_stroke, s.get("width", 0))
        shadow = style.get("shadow", {})
        shadow_blur = shadow.get("blur", 0)
        shadow_offset = shadow.get("offset", [0, 0])
        bg = style.get("background", {})
        padding = bg.get("padding", [0, 0, 0, 0])
        if isinstance(padding, int): padding = [padding] * 4
        if len(padding) == 2: padding = [padding[0], padding[1], padding[0], padding[1]]  # v, h -> t, r, b, l
        # Content box = text + padding.
        content_w = text_w + padding[1] + padding[3]
        content_h = text_h + padding[0] + padding[2]
        # Extra margin so stroke and (offset, blurred) shadow are never clipped.
        extra_margin = max_stroke + shadow_blur + max(abs(shadow_offset[0]), abs(shadow_offset[1])) + 10
        canvas_w = content_w + extra_margin * 2
        canvas_h = content_h + extra_margin * 2
        # 5. Create the transparent canvas.
        img = Image.new("RGBA", (int(canvas_w), int(canvas_h)), (0, 0, 0, 0))
        draw = ImageDraw.Draw(img)
        # Anchor: center of the canvas.
        center_x = canvas_w // 2
        center_y = canvas_h // 2
        # 6. Draw order: shadow -> background -> stroke -> text.
        # --- Shadow (for the whole block) ---
        if shadow:
            shadow_color = self._parse_color(shadow.get("color", "#000000"))
            opacity = shadow.get("opacity", 0.5)
            shadow_color = (shadow_color[0], shadow_color[1], shadow_color[2], int(255 * opacity))
            # Scratch layer: draw the silhouette that will become the shadow.
            shadow_layer = Image.new("RGBA", (int(canvas_w), int(canvas_h)), (0, 0, 0, 0))
            shadow_draw = ImageDraw.Draw(shadow_layer)
            # With a background the shadow follows its shape; otherwise the text's.
            if bg and bg.get("type") != "none":
                self._draw_background(shadow_draw, bg, center_x, center_y, content_w, content_h, shadow_color)
            else:
                # Text-shaped shadow.
                txt_x = center_x - text_w / 2
                txt_y = center_y - text_h / 2
                shadow_draw.text((txt_x, txt_y), text, font=font, fill=shadow_color)
                # Stroke shadow: a faithful stroked shadow would need many
                # offset draws; skipped for cost — only the text is shadowed.
                for s in strokes:
                    width = s.get("width", 0)
            # Blur the shadow layer.
            if shadow_blur > 0:
                shadow_layer = shadow_layer.filter(ImageFilter.GaussianBlur(shadow_blur))
            # Apply the offset, then composite beneath the main image.
            final_shadow = Image.new("RGBA", (int(canvas_w), int(canvas_h)), (0, 0, 0, 0))
            final_shadow.paste(shadow_layer, (int(shadow_offset[0]), int(shadow_offset[1])), mask=shadow_layer)
            img = Image.alpha_composite(final_shadow, img)
            draw = ImageDraw.Draw(img)  # re-bind draw to the composited image
        # --- Background ---
        if bg and bg.get("type") in ["box", "circle"]:
            bg_color = self._parse_color(bg.get("color", "#000000"))
            # TODO: support gradient backgrounds
            self._draw_background(draw, bg, center_x, center_y, content_w, content_h, bg_color)
        # --- Stroke (text only), drawn from the outside in ---
        txt_x = center_x - text_w / 2
        txt_y = center_y - text_h / 2
        for s in reversed(strokes):
            color = self._parse_color(s.get("color", "#000000"))
            width = s.get("width", 0)
            if width > 0:
                # Uses Pillow's native stroke_width rather than offset-draw simulation.
                draw.text((txt_x, txt_y), text, font=font, fill=color, stroke_width=width, stroke_fill=color)
        # --- Text fill ---
        draw.text((txt_x, txt_y), text, font=font, fill=font_color)
        # 7. Trim surplus transparent margins.
        bbox = img.getbbox()
        if bbox:
            img = img.crop(bbox)
        # 8. Save into the cache directory (even when cache lookup was disabled).
        output_path = str(CACHE_DIR / f"{cache_key}.png")
        img.save(output_path)
        logger.info(f"Rendered text: {text} -> {output_path}")
        return output_path
    def _draw_background(self, draw, bg, cx, cy, w, h, color):
        """Draw the background shape (rounded box or ellipse) centered at (cx, cy)."""
        corner_radius = bg.get("corner_radius", 0)
        x0 = cx - w / 2
        y0 = cy - h / 2
        x1 = cx + w / 2
        y1 = cy + h / 2
        if bg.get("type") == "box":
            draw.rounded_rectangle([x0, y0, x1, y1], radius=corner_radius, fill=color)
        elif bg.get("type") == "circle":
            draw.ellipse([x0, y0, x1, y1], fill=color)
# Module-level singleton: importers share one renderer (and its resolved default font).
renderer = TextRenderer()

177
modules/utils.py Normal file
View File

@@ -0,0 +1,177 @@
"""
Gloda Video Factory - Utility Functions
Handles font management, Auto-QC, and helper effects.
"""
import os
import logging
from pathlib import Path
from typing import Optional, Tuple
import urllib.request
import math
import numpy as np
from PIL import Image
from moviepy.editor import ImageClip, VideoFileClip, AudioFileClip
import config
logger = logging.getLogger(__name__)
# Font download sources (GitHub raw URLs for the Roboto / Noto Sans SC projects).
ROBOTO_BOLD_URL = "https://github.com/googlefonts/roboto/raw/main/src/hinted/Roboto-Bold.ttf"
NOTO_SC_BOLD_URL = "https://raw.githubusercontent.com/google/fonts/main/ofl/notosanssc/NotoSansSC-Bold.ttf"
# Local cache locations under the project fonts directory.
FONT_PATH_EN = config.FONTS_DIR / "Roboto-Bold.ttf"
FONT_PATH_CN = config.FONTS_DIR / "NotoSansSC-Bold.ttf"
def ensure_fonts() -> Path:
    """Make sure the EN and CN fonts exist locally, downloading if missing.

    Download failures are logged but not raised. Returns the CN font path
    when it exists (it covers mixed CN/EN text), otherwise the EN path.
    """
    config.FONTS_DIR.mkdir(parents=True, exist_ok=True)
    wanted = (
        (FONT_PATH_EN, ROBOTO_BOLD_URL, "Roboto-Bold", "EN"),
        (FONT_PATH_CN, NOTO_SC_BOLD_URL, "NotoSansSC-Bold", "CN"),
    )
    for path, url, name, tag in wanted:
        if path.exists():
            continue
        logger.info(f"Downloading {name} font...")
        try:
            urllib.request.urlretrieve(url, path)
        except Exception as e:
            logger.error(f"Failed to download {tag} font: {e}")
    # Prefer the CN font as the default for mixed-language text.
    if FONT_PATH_CN.exists():
        return FONT_PATH_CN
    return FONT_PATH_EN
def check_imagemagick() -> bool:
    """Return True if an ImageMagick binary is available on PATH.

    ImageMagick 6 installs `convert`; ImageMagick 7 replaces it with the
    unified `magick` binary, so accept either. Logs a warning when neither
    is found, since text overlays depend on it.
    """
    import shutil
    if shutil.which("convert") or shutil.which("magick"):
        return True
    logger.warning("ImageMagick not found. Text overlays may fail.")
    return False
def verify_assets(video_path: str, audio_path: str) -> Tuple[bool, str]:
    """
    Auto-QC: Verify generated assets quality.
    Checks:
    1. File size sanity check
    2. Duration matching (+/- 2s tolerance)
    3. Audio silence check
    Returns:
        (Passed: bool, Reason: str)
    """
    logger.info(f"Running Auto-QC on:\nVideo: {video_path}\nAudio: {audio_path}")
    try:
        # 1. File size check — tiny outputs almost always mean a failed
        # render (black screen / empty stream).
        vid_size = os.path.getsize(video_path)
        if vid_size < 50 * 1024:  # < 50KB
            return False, f"Video file too small ({vid_size/1024:.1f}KB). Likely error/black screen."
        aud_size = os.path.getsize(audio_path)
        if aud_size < 5 * 1024:  # < 5KB
            return False, f"Audio file too small ({aud_size/1024:.1f}KB)."
        # 2. Duration + silence checks (requires decoding both media files).
        try:
            v_clip = VideoFileClip(video_path)
            a_clip = AudioFileClip(audio_path)
            v_dur = v_clip.duration
            a_dur = a_clip.duration
            # Silence check via RMS over the decoded samples.
            # NOTE(review): this appears to decode the whole track, not just
            # the first 2 seconds — confirm against moviepy's to_soundarray.
            chunk = a_clip.to_soundarray(fps=44100, nbytes=2, buffersize=1000)
            if chunk is not None:
                rms = np.sqrt(np.mean(chunk**2))
                if rms < 0.001:
                    v_clip.close()
                    a_clip.close()
                    return False, "Audio appears to be silent (RMS < 0.001)"
            v_clip.close()
            a_clip.close()
            # Durations must agree within a 2-second tolerance.
            if abs(v_dur - a_dur) > 2.0:
                return False, f"Duration mismatch: Video={v_dur:.1f}s, Audio={a_dur:.1f}s"
        except Exception as e:
            return False, f"Media analysis failed: {str(e)}"
        return True, "QC Passed"
    except Exception as e:
        logger.error(f"Auto-QC Error: {e}")
        return False, f"QC System Error: {e}"
def apply_ken_burns(
    image_path: str,
    duration: float = 5.0,
    zoom_ratio: float = 1.2,
    output_path: Optional[str] = None
) -> str:
    """Render a slow ease-in-out zoom (Ken Burns effect) from a still image.

    Args:
        image_path: Source still image.
        duration: Clip length in seconds.
        zoom_ratio: Final zoom factor relative to the first frame.
        output_path: Target mp4 path; defaults to OUTPUT_DIR/<stem>_ken_burns.mp4.

    Returns:
        Path of the written video file (no audio track).
    """
    # Fix: ImageClip expects a static image (array or file path); a
    # time-dependent frame function must be wrapped in VideoClip instead.
    from moviepy.editor import VideoClip

    if output_path is None:
        base_name = Path(image_path).stem
        output_path = str(config.OUTPUT_DIR / f"{base_name}_ken_burns.mp4")
    logger.info(f"Applying Ken Burns effect to {image_path}")
    img = Image.open(image_path)
    img_width, img_height = img.size
    target_width = config.VIDEO_SETTINGS["width"]
    target_height = config.VIDEO_SETTINGS["height"]
    fps = config.VIDEO_SETTINGS["fps"]
    # Pre-scale the source so that even at maximum zoom the crop window
    # never exceeds the source bounds.
    scale_w = (target_width * zoom_ratio) / img_width
    scale_h = (target_height * zoom_ratio) / img_height
    base_scale = max(scale_w, scale_h)
    new_width = int(img_width * base_scale)
    new_height = int(img_height * base_scale)
    img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
    img_array = np.array(img_resized)

    def make_frame(t):
        # Cosine easing: slow at the ends, fastest mid-way through.
        progress = t / duration
        eased_progress = 0.5 - 0.5 * np.cos(np.pi * progress)
        current_zoom = 1 + (zoom_ratio - 1) * eased_progress
        crop_width = int(target_width / current_zoom * (new_width / target_width))
        crop_height = int(target_height / current_zoom * (new_height / target_height))
        crop_width = min(crop_width, new_width)
        crop_height = min(crop_height, new_height)
        # Center crop, then resample to the output resolution.
        x_start = (new_width - crop_width) // 2
        y_start = (new_height - crop_height) // 2
        cropped = img_array[y_start:y_start + crop_height, x_start:x_start + crop_width]
        cropped_pil = Image.fromarray(cropped)
        resized = cropped_pil.resize((target_width, target_height), Image.Resampling.LANCZOS)
        return np.array(resized)

    clip = VideoClip(make_frame, duration=duration)
    clip = clip.set_fps(fps)
    clip.write_videofile(output_path, fps=fps, codec=config.VIDEO_SETTINGS["codec"], audio=False, logger=None)
    clip.close()
    return output_path

269
modules/video_gen.py Normal file
View File

@@ -0,0 +1,269 @@
"""
图生视频模块 (Volcengine Doubao-SeedDance)
负责将分镜图片转换为视频片段
"""
import logging
import time
import requests
import os
from typing import Dict, Any, List, Optional
from pathlib import Path
import config
from modules import storage
from modules.db_manager import db
logger = logging.getLogger(__name__)
class VideoGenerator:
"""图生视频生成器"""
def __init__(self):
self.api_key = config.VOLC_API_KEY
self.base_url = config.VOLC_BASE_URL
self.model_id = config.VIDEO_MODEL_ID
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}"
}
def submit_scene_video_task(
self,
project_id: str,
scene_id: int,
image_path: str,
prompt: str
) -> str:
"""
提交单场景视频生成任务
Returns: task_id or None
"""
if not image_path or not os.path.exists(image_path):
logger.warning(f"Skipping video generation for Scene {scene_id}: Image not found")
return None
# 上传图片到 R2 获取 URL
logger.info(f"Uploading image for Scene {scene_id}...")
image_url = storage.upload_file(image_path)
if not image_url:
logger.error(f"Failed to upload image for Scene {scene_id}")
return None
logger.info(f"Submitting video task for Scene {scene_id}...")
task_id = self._submit_task(image_url, prompt)
if task_id:
# 立即保存 task_id 到数据库,状态为 processing
db.save_asset(
project_id=project_id,
scene_id=scene_id,
asset_type="video",
status="processing",
task_id=task_id,
local_path=None
)
return task_id
def recover_video_from_task(self, task_id: str, output_path: str) -> bool:
"""
尝试从已有的 task_id 恢复视频 (查询状态并下载)
"""
try:
status, video_url = self._check_task(task_id)
logger.info(f"Recovering task {task_id}: status={status}")
if status == "succeeded" and video_url:
downloaded_path = self._download_video(video_url, os.path.basename(output_path))
if downloaded_path:
# 如果下载的文件名和目标路径不一致 (download_video 使用 filename 参数拼接到 TEMP_DIR)
# 需要移动或确认。 _download_video 返回完整路径。
# 如果 output_path 是绝对路径且不同,则移动。
if os.path.abspath(downloaded_path) != os.path.abspath(output_path):
import shutil
shutil.move(downloaded_path, output_path)
return True
return False
except Exception as e:
logger.error(f"Failed to recover video task {task_id}: {e}")
return False
    def check_task_status(self, task_id: str) -> tuple[str, str]:
        """
        Query the status of a generation task (public wrapper around _check_task).
        Returns: (status, video_url)
        """
        return self._check_task(task_id)
    def generate_scene_videos(
        self,
        project_id: str,
        script: Dict[str, Any],
        scene_images: Dict[int, str]
    ) -> Dict[int, str]:
        """
        Batch-generate per-scene videos (legacy blocking/polling flow).

        Submits one task per scene, then polls them all (5s interval,
        10-minute overall budget), downloading finished clips and recording
        success/failure in the DB as results arrive. Scenes still pending
        at timeout are simply dropped from the result.

        Returns:
            Mapping of scene_id -> local video path for completed scenes.
        """
        generated_videos = {}
        tasks = {}  # scene_id -> task_id
        scenes = script.get("scenes", [])
        # 1. Submit one task per scene.
        for scene in scenes:
            scene_id = scene["id"]
            image_path = scene_images.get(scene_id)
            prompt = scene.get("video_prompt", "High quality video")
            # Use new method signature with project_id
            task_id = self.submit_scene_video_task(project_id, scene_id, image_path, prompt)
            if task_id:
                tasks[scene_id] = task_id
                logger.info(f"Task submitted: {task_id}")
            else:
                logger.error(f"Failed to submit task for Scene {scene_id}")
        # 2. Poll task status until all finish or the budget runs out.
        pending_tasks = list(tasks.keys())
        # Overall polling budget (10 minutes).
        start_time = time.time()
        timeout = 600
        while pending_tasks and (time.time() - start_time < timeout):
            logger.info(f"Polling status for {len(pending_tasks)} tasks...")
            still_pending = []
            for scene_id in pending_tasks:
                task_id = tasks[scene_id]
                status, result_url = self._check_task(task_id)
                if status == "succeeded":
                    logger.info(f"Scene {scene_id} video generated successfully")
                    # Download the finished clip.
                    video_path = self._download_video(result_url, f"scene_{scene_id}_video.mp4")
                    if video_path:
                        generated_videos[scene_id] = video_path
                        # Update DB
                        db.save_asset(
                            project_id=project_id,
                            scene_id=scene_id,
                            asset_type="video",
                            status="completed",
                            local_path=video_path,
                            task_id=task_id
                        )
                elif status == "failed" or status == "cancelled":
                    logger.error(f"Scene {scene_id} task failed/cancelled")
                    db.save_asset(
                        project_id=project_id,
                        scene_id=scene_id,
                        asset_type="video",
                        status="failed",
                        task_id=task_id
                    )
                else:
                    # Still queued/running: keep polling this scene.
                    still_pending.append(scene_id)
            pending_tasks = still_pending
            if pending_tasks:
                time.sleep(5)  # poll interval
        return generated_videos
def _submit_task(self, image_url: str, prompt: str) -> str:
"""提交生成任务"""
url = f"{self.base_url}/contents/generations/tasks"
payload = {
"model": self.model_id,
"content": [
{
"type": "text",
"text": f"{prompt} --resolution 1080p --duration 3 --camerafixed false --watermark false"
},
{
"type": "image_url",
"image_url": {"url": image_url}
}
]
}
try:
response = requests.post(url, headers=self.headers, json=payload, timeout=30)
response.raise_for_status()
data = response.json()
# ID might be at top level or in data object depending on exact API version response
# Document says: { "id": "...", "status": "..." } or similar
task_id = data.get("id")
if not task_id and "data" in data:
task_id = data.get("data", {}).get("id")
return task_id
except Exception as e:
logger.error(f"Task submission failed: {e}")
if 'response' in locals():
logger.error(f"Response: {response.text}")
return None
def _check_task(self, task_id: str) -> tuple[str, str]:
"""
检查任务状态
Returns: (status, content_url)
Status: queued, running, succeeded, failed, cancelled
"""
url = f"{self.base_url}/contents/generations/tasks/{task_id}"
try:
response = requests.get(url, headers=self.headers, timeout=30)
response.raise_for_status()
data = response.json()
# API Response structure:
# { "id": "...", "status": "succeeded", "content": [ { "url": "...", "video_url": "..." } ] }
# Or nested in "data" key
result = data
if "data" in data and "status" not in data: # Check if wrapped in data
result = data["data"]
status = result.get("status")
content_url = None
if status == "succeeded":
if "content" in result:
content = result["content"]
if isinstance(content, list) and len(content) > 0:
item = content[0]
content_url = item.get("video_url") or item.get("url")
elif isinstance(content, dict):
content_url = content.get("video_url") or content.get("url")
return status, content_url
except Exception as e:
logger.error(f"Check task failed: {e}")
return "unknown", None
def _download_video(self, url: str, filename: str) -> str:
"""下载视频到临时目录"""
if not url:
return None
try:
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
output_path = config.TEMP_DIR / filename
with open(output_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
return str(output_path)
except Exception as e:
logger.error(f"Download video failed: {e}")
return None