Files
video-flow/modules/composer.py
Tony Zhang 33a165a615 feat: video-flow initial commit
- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
2025-12-12 19:18:27 +08:00

718 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
视频合成器模块
整合视频拼接、花字叠加、旁白配音的完整流程
"""
import os
import time
import logging
from pathlib import Path
from typing import Dict, Any, List, Optional, Union
import config
from modules import ffmpeg_utils, fancy_text, factory, storage
from modules.text_renderer import renderer
logger = logging.getLogger(__name__)
class VideoComposer:
    """Video composer.

    Orchestrates the full pipeline: clip concatenation, fancy-text overlay,
    subtitles, narration (TTS) and background music mixing.
    """

    def __init__(
        self,
        output_dir: str = None,
        target_size: tuple = (1080, 1920),
        voice_type: str = "sweet_female"
    ):
        """
        Initialize the composer.

        Args:
            output_dir: Output directory; defaults to config.OUTPUT_DIR.
            target_size: Target resolution as (width, height).
            voice_type: Default narration voice.
        """
        self.output_dir = Path(output_dir) if output_dir else config.OUTPUT_DIR
        # Fixed: parents=True also creates missing intermediate directories,
        # so a nested output path no longer raises FileNotFoundError.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.target_size = target_size
        self.voice_type = voice_type
        # Temporary files produced during composition; removed by cleanup().
        self._temp_files = []
def _add_temp(self, path: str):
"""记录临时文件"""
if path:
self._temp_files.append(path)
def cleanup(self):
    """Delete every tracked temporary file, then reset the tracking list.

    Deletion failures are logged as warnings rather than raised, so cleanup
    is always safe to call.
    """
    for path in self._temp_files:
        try:
            if os.path.exists(path):
                os.remove(path)
        except Exception as exc:
            logger.warning(f"Failed to cleanup {path}: {exc}")
    self._temp_files = []
def compose(
    self,
    video_paths: List[str],
    subtitles: List[Dict[str, Any]] = None,
    fancy_texts: List[Dict[str, Any]] = None,
    voiceover_text: str = None,
    voiceover_segments: List[Dict[str, Any]] = None,
    bgm_path: str = None,
    bgm_volume: float = 0.15,
    output_name: str = None,
    upload_to_r2: bool = False
) -> str:
    """
    Full video composition pipeline.

    Args:
        video_paths: Ordered list of shot video paths.
        subtitles: Subtitle configs [{text, start, duration, style}].
        fancy_texts: Fancy-text configs [{text, style, x, y, start, duration}].
        voiceover_text: Full narration text (TTS is generated and mixed in).
        voiceover_segments: Segmented narration configs [{text, start}];
            use either this or voiceover_text (voiceover_text takes priority).
        bgm_path: Background music path.
        bgm_volume: BGM volume.
        output_name: Output file name (without extension).
        upload_to_r2: Whether to upload the result to R2 storage.

    Returns:
        Final video path, or the R2 URL when upload_to_r2 is True.

    Raises:
        ValueError: If video_paths is empty.
    """
    if not video_paths:
        raise ValueError("No video paths provided")
    timestamp = int(time.time())
    output_name = output_name or f"composed_{timestamp}"
    logger.info(f"Starting composition: {len(video_paths)} videos")
    try:
        # Step 1: concatenate the shot videos.
        merged_path = str(config.TEMP_DIR / f"{output_name}_merged.mp4")
        ffmpeg_utils.concat_videos(video_paths, merged_path, self.target_size)
        self._add_temp(merged_path)
        current_video = merged_path
        # Step 1.1: if there is no audio track, lay down a silent one so
        # later filters can always find stream 0:a.
        silent_path = str(config.TEMP_DIR / f"{output_name}_silent.mp4")
        ffmpeg_utils.add_silence_audio(current_video, silent_path)
        self._add_temp(silent_path)
        current_video = silent_path
        # Step 2: add subtitles (white text with black outline, no box,
        # centered in the lower area).
        if subtitles:
            subtitled_path = str(config.TEMP_DIR / f"{output_name}_subtitled.mp4")
            subtitle_style = {
                "font": ffmpeg_utils._get_font_path(),
                "fontsize": 60,
                "fontcolor": "white",
                "borderw": 5,
                "bordercolor": "black",
                "box": 0,  # no background box
                "y": "h-200",  # centered in the lower area
            }
            ffmpeg_utils.add_multiple_subtitles(
                current_video, subtitles, subtitled_path, default_style=subtitle_style
            )
            self._add_temp(subtitled_path)
            current_video = subtitled_path
        # Step 3: overlay fancy text (supports atomic style parameters).
        if fancy_texts:
            overlay_configs = []
            for ft in fancy_texts:
                text = ft.get("text", "")
                style = ft.get("style")
                custom_style = ft.get("custom_style")
                # A dict style means atomic parameters; render directly.
                if isinstance(style, dict):
                    img_path = renderer.render(text, style, cache=False)
                elif custom_style and isinstance(custom_style, dict):
                    # Legacy compatibility: with a custom_style, try the
                    # atomic renderer first.
                    if "font_size" in custom_style:
                        img_path = renderer.render(text, custom_style, cache=False)
                    else:
                        # Fall back to the legacy fancy_text module.
                        img_path = fancy_text.create_fancy_text(
                            text=text,
                            style=style if isinstance(style, str) else "subtitle",
                            custom_style={
                                **(custom_style or {}),
                                "font_name": "/System/Library/Fonts/PingFang.ttc",
                            },
                            cache=False
                        )
                else:
                    # Legacy path (named style or no style at all).
                    img_path = fancy_text.create_fancy_text(
                        text=text,
                        style=style if isinstance(style, str) else "subtitle",
                        custom_style={
                            "font_name": "/System/Library/Fonts/PingFang.ttc",
                        },
                        cache=False
                    )
                overlay_configs.append({
                    "path": img_path,
                    "x": ft.get("x", "(W-w)/2"),
                    "y": ft.get("y", "(H-h)/2"),
                    "start": ft.get("start", 0),
                    "duration": ft.get("duration", 999)
                })
            fancy_path = str(config.TEMP_DIR / f"{output_name}_fancy.mp4")
            ffmpeg_utils.overlay_multiple_images(
                current_video, overlay_configs, fancy_path
            )
            self._add_temp(fancy_path)
            current_video = fancy_path
        # Step 4: generate and mix narration (Volcengine WS first; falls
        # back to Edge on failure — TODO confirm fallback lives in factory).
        if voiceover_text:
            vo_path = factory.generate_voiceover_volcengine(
                text=voiceover_text,
                voice_type=self.voice_type
            )
            self._add_temp(vo_path)
            voiced_path = str(config.TEMP_DIR / f"{output_name}_voiced.mp4")
            ffmpeg_utils.mix_audio(
                current_video, vo_path, voiced_path,
                audio_volume=1.5,
                video_volume=0.2
            )
            self._add_temp(voiced_path)
            current_video = voiced_path
        elif voiceover_segments:
            current_video = self._add_segmented_voiceover(
                current_video, voiceover_segments, output_name
            )
        # Step 5: add BGM with fade in/out (per original note, a failed
        # duck falls back to low-volume mixing).
        if bgm_path:
            bgm_output = str(config.TEMP_DIR / f"{output_name}_bgm.mp4")
            ffmpeg_utils.add_bgm(
                current_video, bgm_path, bgm_output,
                bgm_volume=bgm_volume,
                ducking=False,  # ducking disabled for compatibility; keep BGM low instead
                duck_gain_db=-6.0,
                fade_in=1.0,
                fade_out=1.0
            )
            self._add_temp(bgm_output)
            current_video = bgm_output
        # Step 6: copy the result into the output directory.
        final_path = str(self.output_dir / f"{output_name}.mp4")
        import shutil
        shutil.copy(current_video, final_path)
        logger.info(f"Composition complete: {final_path}")
        # Optionally upload to R2 and return the URL instead of the path.
        if upload_to_r2:
            r2_url = storage.upload_file(final_path)
            logger.info(f"Uploaded to R2: {r2_url}")
            return r2_url
        return final_path
    finally:
        # Remove intermediate files (the final output is kept).
        self.cleanup()
def _add_segmented_voiceover(
    self,
    video_path: str,
    segments: List[Dict[str, Any]],
    output_name: str
) -> str:
    """
    Mix segmented narration into the video.

    Args:
        video_path: Video to mix into.
        segments: Narration configs [{text, start, voice_type?}]; entries
            without text are skipped.
        output_name: Base name used for intermediate files.

    Returns:
        Path of the mixed video, or video_path unchanged when there is
        nothing to mix.
    """
    if not segments:
        return video_path
    # Generate one TTS audio file per segment.
    audio_files = []
    for i, seg in enumerate(segments):
        text = seg.get("text", "")
        if not text:
            continue
        voice = seg.get("voice_type", self.voice_type)
        audio_path = factory.generate_voiceover_volcengine(
            text=text,
            voice_type=voice,
            output_path=str(config.TEMP_DIR / f"{output_name}_seg_{i}.mp3")
        )
        if audio_path:
            audio_files.append({
                "path": audio_path,
                "start": seg.get("start", 0)
            })
            self._add_temp(audio_path)
    if not audio_files:
        return video_path
    # Mix the segments in one at a time, chaining intermediate outputs.
    current = video_path
    for i, af in enumerate(audio_files):
        output = str(config.TEMP_DIR / f"{output_name}_seg_mixed_{i}.mp4")
        ffmpeg_utils.mix_audio(
            current, af["path"], output,
            audio_volume=1.0,
            video_volume=0.2 if i == 0 else 1.0,  # duck the original track only on the first pass
            audio_start=af["start"]
        )
        self._add_temp(output)
        current = output
    return current
def compose_from_script(
    self,
    script: Dict[str, Any],
    video_map: Dict[int, str],
    bgm_path: str = None,
    output_name: str = None
) -> str:
    """
    Compose a video from a normalized shot script and a scene-id -> clip map.

    Args:
        script: Normalized shot script with "scenes" and, optionally, a
            "voiceover_timeline" list.
        video_map: Mapping from scene id to the rendered clip path.
        bgm_path: Background music path.
        output_name: Output file name (without extension).

    Returns:
        Path of the final composed video.

    Raises:
        ValueError: If the script has no scenes, or no scene clip exists.
    """
    scenes = script.get("scenes", [])
    if not scenes:
        raise ValueError("Empty script")
    video_paths = []
    fancy_texts = []
    # 1. Collect clip paths and fancy texts (in scene order).
    total_duration = 0.0
    for scene in scenes:
        scene_id = scene["id"]
        video_path = video_map.get(scene_id)
        if not video_path or not os.path.exists(video_path):
            logger.warning(f"Missing video for scene {scene_id}, skipping")
            continue
        # Probe the actual clip duration; fall back to 5s on failure.
        try:
            info = ffmpeg_utils.get_video_info(video_path)
            duration = float(info.get("duration", 5.0))
        except Exception:  # fixed: was a bare except (also caught SystemExit)
            duration = 5.0
        video_paths.append(video_path)
        # Fancy text (white with black outline, no box, fixed upper-center).
        if "fancy_text" in scene:
            ft = scene["fancy_text"]
            if isinstance(ft, dict):
                text = ft.get("text", "")
                if text:
                    # Fixed style: white text with a black outline, no box.
                    fixed_style = {
                        "font_size": 72,
                        "font_color": "#FFFFFF",
                        "stroke": {"color": "#000000", "width": 5}
                        # no "background" key -> no box behind the text
                    }
                    fancy_texts.append({
                        "text": text,
                        "style": fixed_style,
                        "x": "(W-w)/2",  # horizontally centered
                        "y": "180",  # upper area
                        "start": total_duration + float(ft.get("start_time", 0)),
                        "duration": float(ft.get("duration", duration))
                    })
        total_duration += duration
    if not video_paths:
        # Fixed: every scene was skipped; fail fast with a clear error
        # instead of letting ffmpeg concat fail cryptically on an empty list.
        raise ValueError("No scene videos available for composition")
    timestamp = int(time.time())
    output_name = output_name or f"composed_{timestamp}"
    try:
        # 2. Concatenate the clips.
        merged_path = str(config.TEMP_DIR / f"{output_name}_merged.mp4")
        ffmpeg_utils.concat_videos(video_paths, merged_path, self.target_size)
        self._add_temp(merged_path)
        current_video = merged_path
        # 3. Build the narration track on a silent base of total_duration.
        voiceover_timeline = script.get("voiceover_timeline", [])
        mixed_audio_path = str(config.TEMP_DIR / f"{output_name}_mixed_vo.mp3")
        ffmpeg_utils._run_ffmpeg([
            ffmpeg_utils.FFMPEG_PATH, "-y",
            "-f", "lavfi", "-i", "anullsrc=r=44100:cl=stereo",
            "-t", str(total_duration),
            "-c:a", "mp3",
            mixed_audio_path
        ])
        self._add_temp(mixed_audio_path)
        subtitles = []
        if voiceover_timeline:
            for i, item in enumerate(voiceover_timeline):
                text = item.get("text", "")
                sub_text = item.get("subtitle", text)
                # Two supported formats:
                # new: start_time / duration in seconds (absolute)
                # old: start_ratio / duration_ratio in [0, 1] (relative)
                if "start_time" in item:
                    target_start = float(item.get("start_time", 0))
                    target_duration = float(item.get("duration", 3))
                else:
                    start_ratio = float(item.get("start_ratio", 0))
                    duration_ratio = float(item.get("duration_ratio", 0))
                    target_start = start_ratio * total_duration
                    target_duration = duration_ratio * total_duration
                if not text:
                    continue
                # Generate TTS for this timeline entry.
                tts_path = factory.generate_voiceover_volcengine(
                    text=text,
                    voice_type=self.voice_type,
                    output_path=str(config.TEMP_DIR / f"{output_name}_vo_{i}.mp3")
                )
                self._add_temp(tts_path)
                # Stretch/trim the TTS audio to the target duration.
                adjusted_path = str(config.TEMP_DIR / f"{output_name}_vo_adj_{i}.mp3")
                ffmpeg_utils.adjust_audio_duration(tts_path, target_duration, adjusted_path)
                self._add_temp(adjusted_path)
                # Mix into the accumulated narration track at its offset.
                new_mixed = str(config.TEMP_DIR / f"{output_name}_mixed_{i}.mp3")
                ffmpeg_utils.mix_audio_at_offset(mixed_audio_path, adjusted_path, target_start, new_mixed)
                mixed_audio_path = new_mixed  # update current mixed path
                self._add_temp(new_mixed)
                # Subtitle entry, perfectly synced to the narration.
                subtitles.append({
                    "text": ffmpeg_utils.wrap_text_smart(sub_text),
                    "start": target_start,
                    "duration": target_duration,
                    "style": {}  # default style
                })
        # 4. Mix the assembled narration track into the video.
        voiced_path = str(config.TEMP_DIR / f"{output_name}_voiced.mp4")
        ffmpeg_utils.mix_audio(
            current_video, mixed_audio_path, voiced_path,
            audio_volume=1.5,
            video_volume=0.2  # duck the original audio
        )
        self._add_temp(voiced_path)
        current_video = voiced_path
        # 5. Burn in subtitles (white text, black outline, no box, low-center).
        if subtitles:
            subtitled_path = str(config.TEMP_DIR / f"{output_name}_subtitled.mp4")
            subtitle_style = {
                "font": ffmpeg_utils._get_font_path(),
                "fontsize": 60,
                "fontcolor": "white",
                "borderw": 5,
                "bordercolor": "black",
                "box": 0,  # no background box
                "y": "h-200",  # centered in the lower area
            }
            ffmpeg_utils.add_multiple_subtitles(
                current_video, subtitles, subtitled_path, default_style=subtitle_style
            )
            self._add_temp(subtitled_path)
            current_video = subtitled_path
        # 6. Overlay the fancy texts collected in step 1.
        if fancy_texts:
            fancy_path = str(config.TEMP_DIR / f"{output_name}_fancy.mp4")
            overlay_configs = []
            for ft in fancy_texts:
                # Render the fancy-text image.
                img_path = renderer.render(ft["text"], ft["style"], cache=False)
                overlay_configs.append({
                    "path": img_path,
                    "x": ft["x"],
                    "y": ft["y"],
                    "start": ft["start"],
                    "duration": ft["duration"]
                })
            ffmpeg_utils.overlay_multiple_images(
                current_video, overlay_configs, fancy_path
            )
            self._add_temp(fancy_path)
            current_video = fancy_path
        # 7. Add BGM.
        if bgm_path:
            bgm_output = str(config.TEMP_DIR / f"{output_name}_bgm.mp4")
            ffmpeg_utils.add_bgm(
                current_video, bgm_path, bgm_output,
                bgm_volume=0.15
            )
            self._add_temp(bgm_output)
            current_video = bgm_output
        # 8. Copy the result into the output directory.
        final_path = str(self.output_dir / f"{output_name}.mp4")
        import shutil
        shutil.copy(current_video, final_path)
        logger.info(f"Composition complete: {final_path}")
        return final_path
    finally:
        # Fixed: cleanup now runs even when a step raises, so intermediate
        # files are not leaked on failure (matches compose()'s pattern).
        self.cleanup()
def compose_standard_task(self, task_config: Dict[str, Any]) -> str:
    """
    Run a standard composition task described by a config dict (legacy).
    """
    settings = task_config.get("settings", {})
    self.voice_type = settings.get("voice_type", self.voice_type)

    # 1. Collect segment clip paths, skipping entries without one.
    video_paths = []
    for seg in task_config.get("segments", []):
        seg_path = seg.get("path") or seg.get("video_path")
        if seg_path:
            video_paths.append(seg_path)

    def infer_type(entry):
        # Heuristics mirror the legacy format: style/x/y -> fancy text,
        # start+duration -> subtitle, start alone -> voiceover.
        if "text" not in entry:
            return None
        if "style" in entry or "x" in entry or "y" in entry:
            return "fancy_text"
        if "duration" in entry and "start" in entry:
            return "subtitle"
        if "start" in entry:
            return "voiceover"
        return None

    # 2. Sort timeline items into buckets, inferring the type when absent.
    subtitles = []
    fancy_texts = []
    voiceover_segments = []
    buckets = {
        "subtitle": subtitles,
        "fancy_text": fancy_texts,
        "voiceover": voiceover_segments,
    }
    for item in task_config.get("timeline", []):
        itype = item.get("type") or infer_type(item)
        if itype == "fancy_text" and "x" not in item and "position" in item:
            # Flatten legacy nested position into x/y (in place, as before).
            item["x"] = item["position"].get("x")
            item["y"] = item["position"].get("y")
        target = buckets.get(itype)
        if target is not None:
            target.append(item)

    return self.compose(
        video_paths=video_paths,
        subtitles=subtitles,
        fancy_texts=fancy_texts,
        voiceover_segments=voiceover_segments,
        bgm_path=settings.get("bgm_path"),
        bgm_volume=settings.get("bgm_volume", 0.06),
        output_name=settings.get("output_name"),
        upload_to_r2=settings.get("upload_to_r2", False)
    )
def compose_product_video(
    video_paths: List[str],
    subtitle_configs: List[Dict[str, Any]] = None,
    fancy_text_configs: List[Dict[str, Any]] = None,
    voiceover_text: str = None,
    bgm_path: str = None,
    output_path: str = None,
    voice_type: str = "sweet_female"
) -> str:
    """Convenience helper: compose a product short video in one call."""
    vc = VideoComposer(voice_type=voice_type)
    name = None
    if output_path:
        target = Path(output_path)
        name = target.stem
        # Redirect the composer's output directory next to the requested path.
        vc.output_dir = target.parent
    return vc.compose(
        video_paths=video_paths,
        subtitles=subtitle_configs,
        fancy_texts=fancy_text_configs,
        voiceover_text=voiceover_text,
        bgm_path=bgm_path,
        output_name=name
    )
def quick_compose(
    video_folder: str,
    script: List[Dict[str, Any]],
    output_path: str = None,
    voice_type: str = "sweet_female",
    bgm_path: str = None
) -> str:
    """
    Quick composition: read clips from a folder and pair them with a script.

    Args:
        video_folder: Folder containing the clip files (sorted by name).
        script: Per-shot items; each may carry "video", "duration",
            "subtitle", "subtitle_style", "fancy_text" and "voiceover".
        output_path: Optional output file path.
        voice_type: Narration voice.
        bgm_path: Background music path.

    Returns:
        Path (or URL) of the composed video.
    """
    folder = Path(video_folder)
    video_files = sorted([
        f for f in folder.iterdir()
        if f.suffix.lower() in ['.mp4', '.mov', '.avi', '.mkv']
    ])
    video_paths = []
    subtitles = []
    fancy_texts = []
    voiceovers = []
    current_time = 0
    for i, item in enumerate(script):
        # Resolve the clip: an explicit "video" entry wins, else the i-th file.
        if "video" in item:
            vp = folder / item["video"]
        elif i < len(video_files):
            vp = video_files[i]
        else:
            logger.warning(f"No video for script item {i}")
            continue
        video_paths.append(str(vp))
        # Probe the real clip duration. Fixed: coerce to float — the probed
        # value may be a string, which would break the current_time
        # arithmetic below (compose_from_script already coerces the same way).
        try:
            info = ffmpeg_utils.get_video_info(str(vp))
            duration = float(info.get("duration", 5))
        except Exception:  # fixed: was a bare except
            duration = float(item.get("duration", 5))
        if "subtitle" in item:
            subtitles.append({
                "text": item["subtitle"],
                "start": current_time,
                "duration": duration,
                "style": item.get("subtitle_style", {})
            })
        if "fancy_text" in item:
            ft = item["fancy_text"]
            # A bare string is shorthand for {"text": ...}.
            if isinstance(ft, str):
                ft = {"text": ft}
            fancy_texts.append({
                "text": ft.get("text", ""),
                "style": ft.get("style", "highlight"),
                "custom_style": ft.get("custom_style"),
                "x": ft.get("x", "(W-w)/2"),
                "y": ft.get("y", 200),
                "start": current_time,
                "duration": duration
            })
        if "voiceover" in item:
            voiceovers.append(item["voiceover"])
        current_time += duration
    voiceover_text = "".join(voiceovers) if voiceovers else None
    return compose_product_video(
        video_paths=video_paths,
        subtitle_configs=subtitles if subtitles else None,
        fancy_text_configs=fancy_texts if fancy_texts else None,
        voiceover_text=voiceover_text,
        bgm_path=bgm_path,
        output_path=output_path,
        voice_type=voice_type
    )
# ============================================================
# Example usage
# ============================================================
def example_hairclip_video():
    """Example: compose a hairclip product video from local assets."""
    asset_dir = Path("/Volumes/Tony/video-flow/素材/发夹/合成图拆分镜")
    # Five pre-split shot clips, named 视频-分镜1.mp4 .. 视频-分镜5.mp4.
    video_paths = [str(asset_dir / f"视频-分镜{n}.mp4") for n in range(1, 6)]
    script = [
        {
            "subtitle": "塌马尾 vs 高颅顶",
            "fancy_text": {
                "text": "塌马尾 vs 高颅顶",
                "style": "comparison",
                "y": 150
            },
            "voiceover": "普通马尾和高颅顶马尾的区别,你看出来了吗",
        },
        {
            "subtitle": "3秒出门无需皮筋",
            "fancy_text": {"text": "发量+50%", "style": "bubble", "y": 300},
            "voiceover": "只需要三秒钟,不需要皮筋,发量瞬间增加百分之五十",
        },
        {
            "subtitle": "发量+50%",
            "voiceover": "蓬松的高颅顶效果,让你瞬间变美",
        },
        {
            "subtitle": "狂甩不掉!",
            "fancy_text": {"text": "狂甩不掉!", "style": "warning", "y": 400},
            "voiceover": "而且超级牢固,怎么甩都不会掉",
        },
        {
            "subtitle": "¥3.99 立即抢购",
            "fancy_text": {"text": "3.99", "style": "price", "y": 500},
            "voiceover": "只要三块九毛九,点击下方链接立即购买",
        },
    ]
    output = quick_compose(
        video_folder=str(asset_dir),
        script=script,
        output_path="/Volumes/Tony/video-flow/output/发夹_合成视频.mp4",
        voice_type="sweet_female"
    )
    print(f"视频合成完成: {output}")
    return output
if __name__ == "__main__":
    # Manual entry point: runs the hairclip example end to end
    # (requires the local asset folder and a working ffmpeg setup).
    logging.basicConfig(level=logging.INFO)
    example_hairclip_video()