feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
This commit is contained in:
Tony Zhang
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions

717
modules/composer.py Normal file
View File

@@ -0,0 +1,717 @@
"""
视频合成器模块
整合视频拼接、花字叠加、旁白配音的完整流程
"""
import os
import time
import logging
from pathlib import Path
from typing import Dict, Any, List, Optional, Union
import config
from modules import ffmpeg_utils, fancy_text, factory, storage
from modules.text_renderer import renderer
logger = logging.getLogger(__name__)
class VideoComposer:
    """Video composer.

    Chains ffmpeg-based processing steps — scene concatenation, subtitle
    burn-in, fancy-text (styled caption image) overlays, TTS voiceover
    mixing, and background music — into one output video. Every
    intermediate file is tracked so it can be deleted by cleanup().
    """

    def __init__(
        self,
        output_dir: str = None,
        target_size: tuple = (1080, 1920),
        voice_type: str = "sweet_female"
    ):
        """
        Initialize the composer.

        Args:
            output_dir: Output directory; falls back to config.OUTPUT_DIR.
                        Created if missing (exist_ok, non-recursive).
            target_size: Target resolution as (width, height); default is
                         portrait 1080x1920.
            voice_type: Default narration voice passed to the TTS backend.
        """
        self.output_dir = Path(output_dir) if output_dir else config.OUTPUT_DIR
        self.output_dir.mkdir(exist_ok=True)
        self.target_size = target_size
        self.voice_type = voice_type
        # Intermediate files produced during composition; removed by cleanup().
        self._temp_files = []

    def _add_temp(self, path: str) -> None:
        """Record an intermediate file for later cleanup (ignores falsy paths)."""
        if path:
            self._temp_files.append(path)

    def cleanup(self) -> None:
        """Delete all tracked temp files, best-effort (failures only logged)."""
        for f in self._temp_files:
            try:
                if os.path.exists(f):
                    os.remove(f)
            except Exception as e:
                logger.warning(f"Failed to cleanup {f}: {e}")
        self._temp_files = []

    def compose(
        self,
        video_paths: List[str],
        subtitles: List[Dict[str, Any]] = None,
        fancy_texts: List[Dict[str, Any]] = None,
        voiceover_text: str = None,
        voiceover_segments: List[Dict[str, Any]] = None,
        bgm_path: str = None,
        bgm_volume: float = 0.15,
        output_name: str = None,
        upload_to_r2: bool = False
    ) -> str:
        """
        Full composition pipeline.

        Args:
            video_paths: Ordered list of scene video paths.
            subtitles: Subtitle configs [{text, start, duration, style}].
            fancy_texts: Fancy-text configs [{text, style, x, y, start, duration}].
            voiceover_text: Full narration text (TTS is generated and mixed in).
            voiceover_segments: Per-segment narration [{text, start}];
                mutually exclusive with voiceover_text (voiceover_text wins).
            bgm_path: Background music path.
            bgm_volume: BGM volume multiplier.
            output_name: Output file name (without extension); defaults to
                "composed_<unix-timestamp>".
            upload_to_r2: Whether to upload the result to R2 storage.

        Returns:
            Final local video path, or the R2 URL when upload_to_r2 is True.

        Raises:
            ValueError: If video_paths is empty.
        """
        if not video_paths:
            raise ValueError("No video paths provided")
        timestamp = int(time.time())
        output_name = output_name or f"composed_{timestamp}"
        logger.info(f"Starting composition: {len(video_paths)} videos")
        try:
            # Step 1: concatenate scene videos at the target resolution.
            merged_path = str(config.TEMP_DIR / f"{output_name}_merged.mp4")
            ffmpeg_utils.concat_videos(video_paths, merged_path, self.target_size)
            self._add_temp(merged_path)
            current_video = merged_path
            # Step 1.1: if the merged video has no audio track, add a silent
            # one so downstream audio filters can always find stream 0:a.
            silent_path = str(config.TEMP_DIR / f"{output_name}_silent.mp4")
            ffmpeg_utils.add_silence_audio(current_video, silent_path)
            self._add_temp(silent_path)
            current_video = silent_path
            # Step 2: burn in subtitles (white text, black outline, no box,
            # centered in the lower area of the frame).
            if subtitles:
                subtitled_path = str(config.TEMP_DIR / f"{output_name}_subtitled.mp4")
                subtitle_style = {
                    "font": ffmpeg_utils._get_font_path(),
                    "fontsize": 60,
                    "fontcolor": "white",
                    "borderw": 5,
                    "bordercolor": "black",
                    "box": 0,  # no background box
                    "y": "h-200",  # lower area, horizontally centered
                }
                ffmpeg_utils.add_multiple_subtitles(
                    current_video, subtitles, subtitled_path, default_style=subtitle_style
                )
                self._add_temp(subtitled_path)
                current_video = subtitled_path
            # Step 3: overlay fancy-text images (supports atomic style params).
            if fancy_texts:
                overlay_configs = []
                for ft in fancy_texts:
                    text = ft.get("text", "")
                    style = ft.get("style")
                    custom_style = ft.get("custom_style")
                    # A dict-valued style means atomic render params — use the
                    # new renderer directly.
                    if isinstance(style, dict):
                        img_path = renderer.render(text, style, cache=False)
                    elif custom_style and isinstance(custom_style, dict):
                        # Legacy compatibility: when custom_style is present,
                        # try the atomic renderer first (keyed on "font_size").
                        if "font_size" in custom_style:
                            img_path = renderer.render(text, custom_style, cache=False)
                        else:
                            # Fall back to the legacy fancy_text renderer.
                            img_path = fancy_text.create_fancy_text(
                                text=text,
                                style=style if isinstance(style, str) else "subtitle",
                                custom_style={
                                    **(custom_style or {}),
                                    "font_name": "/System/Library/Fonts/PingFang.ttc",
                                },
                                cache=False
                            )
                    else:
                        # Legacy path: named preset style (or "subtitle" default).
                        img_path = fancy_text.create_fancy_text(
                            text=text,
                            style=style if isinstance(style, str) else "subtitle",
                            custom_style={
                                "font_name": "/System/Library/Fonts/PingFang.ttc",
                            },
                            cache=False
                        )
                    overlay_configs.append({
                        "path": img_path,
                        "x": ft.get("x", "(W-w)/2"),
                        "y": ft.get("y", "(H-h)/2"),
                        "start": ft.get("start", 0),
                        "duration": ft.get("duration", 999)
                    })
                fancy_path = str(config.TEMP_DIR / f"{output_name}_fancy.mp4")
                ffmpeg_utils.overlay_multiple_images(
                    current_video, overlay_configs, fancy_path
                )
                self._add_temp(fancy_path)
                current_video = fancy_path
            # Step 4: generate and mix narration (Volcengine WebSocket TTS
            # first; the factory falls back to Edge TTS on failure).
            if voiceover_text:
                vo_path = factory.generate_voiceover_volcengine(
                    text=voiceover_text,
                    voice_type=self.voice_type
                )
                self._add_temp(vo_path)
                voiced_path = str(config.TEMP_DIR / f"{output_name}_voiced.mp4")
                ffmpeg_utils.mix_audio(
                    current_video, vo_path, voiced_path,
                    audio_volume=1.5,
                    video_volume=0.2
                )
                self._add_temp(voiced_path)
                current_video = voiced_path
            elif voiceover_segments:
                current_video = self._add_segmented_voiceover(
                    current_video, voiceover_segments, output_name
                )
            # Step 5: add BGM with fade in/out (if ducking fails upstream it
            # degrades to a plain low-volume mix).
            if bgm_path:
                bgm_output = str(config.TEMP_DIR / f"{output_name}_bgm.mp4")
                ffmpeg_utils.add_bgm(
                    current_video, bgm_path, bgm_output,
                    bgm_volume=bgm_volume,
                    ducking=False,  # ducking disabled for compatibility; keep BGM quiet instead
                    duck_gain_db=-6.0,
                    fade_in=1.0,
                    fade_out=1.0
                )
                self._add_temp(bgm_output)
                current_video = bgm_output
            # Step 6: publish the final file to the output directory.
            final_path = str(self.output_dir / f"{output_name}.mp4")
            import shutil
            shutil.copy(current_video, final_path)
            logger.info(f"Composition complete: {final_path}")
            # Optional upload to R2 object storage.
            if upload_to_r2:
                r2_url = storage.upload_file(final_path)
                logger.info(f"Uploaded to R2: {r2_url}")
                return r2_url
            return final_path
        finally:
            # Remove intermediates (the copied final output is preserved).
            self.cleanup()

    def _add_segmented_voiceover(
        self,
        video_path: str,
        segments: List[Dict[str, Any]],
        output_name: str
    ) -> str:
        """Mix per-segment narration into the video at each segment's start.

        Args:
            video_path: Input video (must already have an audio track).
            segments: [{text, start, voice_type?}] narration segments.
            output_name: Base name used for temp files.

        Returns:
            Path of the mixed video, or video_path unchanged when no
            segments produced audio.
        """
        if not segments:
            return video_path
        # Generate one TTS audio file per segment.
        audio_files = []
        for i, seg in enumerate(segments):
            text = seg.get("text", "")
            if not text:
                continue
            voice = seg.get("voice_type", self.voice_type)
            audio_path = factory.generate_voiceover_volcengine(
                text=text,
                voice_type=voice,
                output_path=str(config.TEMP_DIR / f"{output_name}_seg_{i}.mp3")
            )
            if audio_path:
                audio_files.append({
                    "path": audio_path,
                    "start": seg.get("start", 0)
                })
                self._add_temp(audio_path)
        if not audio_files:
            return video_path
        # Mix segments in sequence, chaining the intermediate outputs.
        current = video_path
        for i, af in enumerate(audio_files):
            output = str(config.TEMP_DIR / f"{output_name}_seg_mixed_{i}.mp4")
            ffmpeg_utils.mix_audio(
                current, af["path"], output,
                audio_volume=1.0,
                video_volume=0.2 if i == 0 else 1.0,  # attenuate the original track only once
                audio_start=af["start"]
            )
            self._add_temp(output)
            current = output
        return current

    def compose_from_script(
        self,
        script: Dict[str, Any],
        video_map: Dict[int, str],
        bgm_path: str = None,
        output_name: str = None
    ) -> str:
        """
        Compose from a normalized storyboard script plus a scene->video map.

        Args:
            script: Normalized storyboard script; reads "scenes" and the
                optional "voiceover_timeline".
            video_map: Mapping of scene id to rendered video path; scenes
                with missing files are skipped with a warning.
            bgm_path: Background music path.
            output_name: Output file name (without extension).

        Returns:
            Path of the final composed video.

        Raises:
            ValueError: If the script contains no scenes.
        """
        scenes = script.get("scenes", [])
        if not scenes:
            raise ValueError("Empty script")
        video_paths = []
        fancy_texts = []
        # 1. Collect video paths and fancy texts in storyboard order, while
        #    accumulating the running timeline offset.
        total_duration = 0.0
        for scene in scenes:
            scene_id = scene["id"]
            video_path = video_map.get(scene_id)
            if not video_path or not os.path.exists(video_path):
                logger.warning(f"Missing video for scene {scene_id}, skipping")
                continue
            # Probe the actual clip duration; fall back to 5s on any error.
            try:
                info = ffmpeg_utils.get_video_info(video_path)
                duration = float(info.get("duration", 5.0))
            except:  # noqa: E722 — best-effort probe; NOTE(review): bare except also catches KeyboardInterrupt
                duration = 5.0
            video_paths.append(video_path)
            # Fancy text: white with black outline, no box, fixed near the
            # top of the frame, horizontally centered.
            if "fancy_text" in scene:
                ft = scene["fancy_text"]
                if isinstance(ft, dict):
                    text = ft.get("text", "")
                    if text:
                        # Fixed style: white fill, black stroke, no box.
                        fixed_style = {
                            "font_size": 72,
                            "font_color": "#FFFFFF",
                            "stroke": {"color": "#000000", "width": 5}
                            # no "background" key -> no box behind the text
                        }
                        fancy_texts.append({
                            "text": text,
                            "style": fixed_style,
                            "x": "(W-w)/2",  # centered
                            "y": "180",  # upper area
                            "start": total_duration + float(ft.get("start_time", 0)),
                            "duration": float(ft.get("duration", duration))
                        })
            total_duration += duration
        # 2. Concatenate the scene videos.
        timestamp = int(time.time())
        output_name = output_name or f"composed_{timestamp}"
        merged_path = str(config.TEMP_DIR / f"{output_name}_merged.mp4")
        ffmpeg_utils.concat_videos(video_paths, merged_path, self.target_size)
        self._add_temp(merged_path)
        current_video = merged_path
        # 3. Build the global voiceover timeline (newer logic): start from a
        #    silent base track spanning total_duration, then mix each TTS
        #    segment in at its absolute offset.
        voiceover_timeline = script.get("voiceover_timeline", [])
        mixed_audio_path = str(config.TEMP_DIR / f"{output_name}_mixed_vo.mp3")
        # Silent stereo base track, total_duration seconds long.
        ffmpeg_utils._run_ffmpeg([
            ffmpeg_utils.FFMPEG_PATH, "-y",
            "-f", "lavfi", "-i", "anullsrc=r=44100:cl=stereo",
            "-t", str(total_duration),
            "-c:a", "mp3",
            mixed_audio_path
        ])
        self._add_temp(mixed_audio_path)
        subtitles = []
        if voiceover_timeline:
            for i, item in enumerate(voiceover_timeline):
                text = item.get("text", "")
                sub_text = item.get("subtitle", text)
                # Two timeline item formats are supported:
                #   new: start_time (s) + duration (s) — absolute times
                #   old: start_ratio / duration_ratio (0-1) — relative times
                if "start_time" in item:
                    # New format: absolute seconds.
                    target_start = float(item.get("start_time", 0))
                    target_duration = float(item.get("duration", 3))
                else:
                    # Old format: ratios of total_duration (kept for
                    # backward compatibility).
                    start_ratio = float(item.get("start_ratio", 0))
                    duration_ratio = float(item.get("duration_ratio", 0))
                    target_start = start_ratio * total_duration
                    target_duration = duration_ratio * total_duration
                if not text: continue
                # Generate TTS audio for this timeline item.
                tts_path = factory.generate_voiceover_volcengine(
                    text=text,
                    voice_type=self.voice_type,
                    output_path=str(config.TEMP_DIR / f"{output_name}_vo_{i}.mp3")
                )
                self._add_temp(tts_path)
                # Stretch/trim the TTS audio to the target duration.
                adjusted_path = str(config.TEMP_DIR / f"{output_name}_vo_adj_{i}.mp3")
                ffmpeg_utils.adjust_audio_duration(tts_path, target_duration, adjusted_path)
                self._add_temp(adjusted_path)
                # Mix into the accumulated narration track at target_start.
                new_mixed = str(config.TEMP_DIR / f"{output_name}_mixed_{i}.mp3")
                ffmpeg_utils.mix_audio_at_offset(mixed_audio_path, adjusted_path, target_start, new_mixed)
                mixed_audio_path = new_mixed  # chain: newest mix becomes the base
                self._add_temp(new_mixed)
                # Subtitle entry, fully synchronized with the narration.
                subtitles.append({
                    "text": ffmpeg_utils.wrap_text_smart(sub_text),
                    "start": target_start,
                    "duration": target_duration,
                    "style": {}  # default subtitle style
                })
        # 4. Mix the assembled narration track into the video.
        voiced_path = str(config.TEMP_DIR / f"{output_name}_voiced.mp4")
        ffmpeg_utils.mix_audio(
            current_video, mixed_audio_path, voiced_path,
            audio_volume=1.5,
            video_volume=0.2  # attenuate the original audio
        )
        self._add_temp(voiced_path)
        current_video = voiced_path
        # 5. Burn in subtitles (same style as compose()).
        if subtitles:
            subtitled_path = str(config.TEMP_DIR / f"{output_name}_subtitled.mp4")
            subtitle_style = {
                "font": ffmpeg_utils._get_font_path(),
                "fontsize": 60,
                "fontcolor": "white",
                "borderw": 5,
                "bordercolor": "black",
                "box": 0,  # no background box
                "y": "h-200",  # lower area, horizontally centered
            }
            ffmpeg_utils.add_multiple_subtitles(
                current_video, subtitles, subtitled_path, default_style=subtitle_style
            )
            self._add_temp(subtitled_path)
            current_video = subtitled_path
        # 6. Overlay fancy-text images.
        if fancy_texts:
            fancy_path = str(config.TEMP_DIR / f"{output_name}_fancy.mp4")
            overlay_configs = []
            for ft in fancy_texts:
                # Render each fancy text to an image via the atomic renderer.
                img_path = renderer.render(ft["text"], ft["style"], cache=False)
                overlay_configs.append({
                    "path": img_path,
                    "x": ft["x"],
                    "y": ft["y"],
                    "start": ft["start"],
                    "duration": ft["duration"]
                })
            ffmpeg_utils.overlay_multiple_images(
                current_video, overlay_configs, fancy_path
            )
            self._add_temp(fancy_path)
            current_video = fancy_path
        # 7. Add BGM.
        if bgm_path:
            bgm_output = str(config.TEMP_DIR / f"{output_name}_bgm.mp4")
            ffmpeg_utils.add_bgm(
                current_video, bgm_path, bgm_output,
                bgm_volume=0.15
            )
            self._add_temp(bgm_output)
            current_video = bgm_output
        # 8. Publish the final file and clean up intermediates.
        final_path = str(self.output_dir / f"{output_name}.mp4")
        import shutil
        shutil.copy(current_video, final_path)
        logger.info(f"Composition complete: {final_path}")
        self.cleanup()
        return final_path

    def compose_standard_task(self, task_config: Dict[str, Any]) -> str:
        """
        Run a standard composition task described by a config dict (legacy).

        Reads "segments", "timeline" and "settings" from task_config,
        classifies untyped timeline items heuristically by their keys, and
        delegates to compose().

        Returns:
            compose()'s result: final video path or R2 URL.
        """
        settings = task_config.get("settings", {})
        self.voice_type = settings.get("voice_type", self.voice_type)
        # 1. Collect segment video paths ("path" preferred over "video_path").
        video_paths = []
        for seg in task_config.get("segments", []):
            path = seg.get("path") or seg.get("video_path")
            if not path: continue
            video_paths.append(path)
        # 2. Classify timeline items into subtitles / fancy texts / voiceovers.
        subtitles = []
        fancy_texts = []
        voiceover_segments = []
        for item in task_config.get("timeline", []):
            itype = item.get("type")
            if not itype:
                # No explicit type: infer from which keys are present.
                if "text" in item and ("style" in item or "x" in item or "y" in item):
                    itype = "fancy_text"
                elif "text" in item and "duration" in item and "start" in item:
                    itype = "subtitle"
                elif "text" in item and "start" in item:
                    itype = "voiceover"
                else:
                    continue
            if itype == "subtitle":
                subtitles.append(item)
            elif itype == "fancy_text":
                # Flatten a nested {"position": {x, y}} into top-level keys.
                if "x" not in item and "position" in item:
                    item["x"] = item["position"].get("x")
                    item["y"] = item["position"].get("y")
                fancy_texts.append(item)
            elif itype == "voiceover":
                voiceover_segments.append(item)
        return self.compose(
            video_paths=video_paths,
            subtitles=subtitles,
            fancy_texts=fancy_texts,
            voiceover_segments=voiceover_segments,
            bgm_path=settings.get("bgm_path"),
            bgm_volume=settings.get("bgm_volume", 0.06),
            output_name=settings.get("output_name"),
            upload_to_r2=settings.get("upload_to_r2", False)
        )
def compose_product_video(
    video_paths: List[str],
    subtitle_configs: List[Dict[str, Any]] = None,
    fancy_text_configs: List[Dict[str, Any]] = None,
    voiceover_text: str = None,
    bgm_path: str = None,
    output_path: str = None,
    voice_type: str = "sweet_female"
) -> str:
    """Convenience wrapper: compose a product short video in one call.

    Args:
        video_paths: Ordered scene video paths.
        subtitle_configs: Subtitle configs [{text, start, duration, style}].
        fancy_text_configs: Fancy-text configs [{text, style, x, y, start, duration}].
        voiceover_text: Full narration text (TTS generated and mixed in).
        bgm_path: Background music path.
        output_path: Desired output file path; its stem becomes the output
            name and its parent the output directory.
        voice_type: Narration voice.

    Returns:
        Path of the composed video (compose()'s return value).
    """
    output_name = None
    output_dir = None
    if output_path:
        target = Path(output_path)
        output_name = target.stem
        output_dir = str(target.parent)
    # Pass output_dir through the constructor so VideoComposer.__init__
    # creates the directory (the old code assigned composer.output_dir after
    # construction, skipping mkdir and failing on a missing directory).
    composer = VideoComposer(output_dir=output_dir, voice_type=voice_type)
    return composer.compose(
        video_paths=video_paths,
        subtitles=subtitle_configs,
        fancy_texts=fancy_text_configs,
        voiceover_text=voiceover_text,
        bgm_path=bgm_path,
        output_name=output_name
    )
def quick_compose(
    video_folder: str,
    script: List[Dict[str, Any]],
    output_path: str = None,
    voice_type: str = "sweet_female",
    bgm_path: str = None
) -> str:
    """Quick composition: read clips from a folder and pair them with a script.

    Each script item may carry "video" (filename inside video_folder),
    "subtitle", "fancy_text" (str or dict), "voiceover" and "duration" keys.
    Items without an explicit "video" are matched positionally against the
    folder's sorted video files.

    Args:
        video_folder: Folder containing the scene clips.
        script: Per-scene configuration list (see above).
        output_path: Desired output file path.
        voice_type: Narration voice.
        bgm_path: Background music path.

    Returns:
        Path of the composed video.
    """
    folder = Path(video_folder)
    video_files = sorted([
        f for f in folder.iterdir()
        if f.suffix.lower() in ['.mp4', '.mov', '.avi', '.mkv']
    ])
    video_paths = []
    subtitles = []
    fancy_texts = []
    voiceovers = []
    current_time = 0
    for i, item in enumerate(script):
        if "video" in item:
            vp = folder / item["video"]
        elif i < len(video_files):
            vp = video_files[i]
        else:
            logger.warning(f"No video for script item {i}")
            continue
        video_paths.append(str(vp))
        # Probe the real clip duration; ffprobe may report it as a string,
        # so coerce to float before doing timeline arithmetic (the old code
        # skipped the conversion and could crash on `current_time += duration`).
        try:
            info = ffmpeg_utils.get_video_info(str(vp))
            duration = float(info.get("duration", 5))
        except Exception:  # narrowed from a bare except: don't swallow KeyboardInterrupt
            duration = float(item.get("duration", 5))
        if "subtitle" in item:
            subtitles.append({
                "text": item["subtitle"],
                "start": current_time,
                "duration": duration,
                "style": item.get("subtitle_style", {})
            })
        if "fancy_text" in item:
            ft = item["fancy_text"]
            if isinstance(ft, str):
                ft = {"text": ft}
            fancy_texts.append({
                "text": ft.get("text", ""),
                "style": ft.get("style", "highlight"),
                "custom_style": ft.get("custom_style"),
                "x": ft.get("x", "(W-w)/2"),
                "y": ft.get("y", 200),
                "start": current_time,
                "duration": duration
            })
        if "voiceover" in item:
            voiceovers.append(item["voiceover"])
        current_time += duration
    voiceover_text = "".join(voiceovers) if voiceovers else None
    return compose_product_video(
        video_paths=video_paths,
        subtitle_configs=subtitles if subtitles else None,
        fancy_text_configs=fancy_texts if fancy_texts else None,
        voiceover_text=voiceover_text,
        bgm_path=bgm_path,
        output_path=output_path,
        voice_type=voice_type
    )
# ============================================================
# 示例用法
# ============================================================
def example_hairclip_video():
    """Demo: compose the hair-clip product video from local scene clips."""
    assets_dir = Path("/Volumes/Tony/video-flow/素材/发夹/合成图拆分镜")
    # Five numbered scene clips inside the assets folder.
    video_paths = [str(assets_dir / f"视频-分镜{n}.mp4") for n in range(1, 6)]
    script = [
        {
            "subtitle": "塌马尾 vs 高颅顶",
            "fancy_text": {
                "text": "塌马尾 vs 高颅顶",
                "style": "comparison",
                "y": 150,
            },
            "voiceover": "普通马尾和高颅顶马尾的区别,你看出来了吗",
        },
        {
            "subtitle": "3秒出门无需皮筋",
            "fancy_text": {"text": "发量+50%", "style": "bubble", "y": 300},
            "voiceover": "只需要三秒钟,不需要皮筋,发量瞬间增加百分之五十",
        },
        {
            "subtitle": "发量+50%",
            "voiceover": "蓬松的高颅顶效果,让你瞬间变美",
        },
        {
            "subtitle": "狂甩不掉!",
            "fancy_text": {"text": "狂甩不掉!", "style": "warning", "y": 400},
            "voiceover": "而且超级牢固,怎么甩都不会掉",
        },
        {
            "subtitle": "¥3.99 立即抢购",
            "fancy_text": {"text": "3.99", "style": "price", "y": 500},
            "voiceover": "只要三块九毛九,点击下方链接立即购买",
        },
    ]
    output = quick_compose(
        video_folder=str(assets_dir),
        script=script,
        output_path="/Volumes/Tony/video-flow/output/发夹_合成视频.mp4",
        voice_type="sweet_female"
    )
    print(f"视频合成完成: {output}")
    return output
if __name__ == "__main__":
    # Script entry point: enable INFO logging and run the demo composition.
    logging.basicConfig(level=logging.INFO)
    example_hairclip_video()