feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
This commit is contained in:
Tony Zhang
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions

801
modules/factory.py Normal file
View File

@@ -0,0 +1,801 @@
"""
MatchMe Studio - Factory Module (Concurrent Scene Generation)
Using Volcengine (Doubao) API for Image and Video
"""
import os
import time
import logging
import requests
import json
import re
import base64
import subprocess
from pathlib import Path
from typing import Dict, Any, List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from elevenlabs import ElevenLabs, VoiceSettings
from openai import OpenAI
import config
from modules import storage
logger = logging.getLogger(__name__)
# Initialize OpenAI Client for Volcengine Image Generation
# NOTE(review): this client is created at import time, so importing this
# module requires config.VOLC_API_KEY / VOLC_BASE_URL to be set — verify
# config is loaded before import.
client = OpenAI(
    api_key=config.VOLC_API_KEY,
    base_url=config.VOLC_BASE_URL
)
# ============================================================
# Helper Functions
# ============================================================
def _download_as_base64(url: str) -> str:
    """Download an image from *url* and return it Base64-encoded.

    Best-effort helper: returns an empty string on any network/HTTP failure
    instead of raising.
    """
    try:
        # FIX: added timeout — the original call had none, so a stalled
        # connection could hang a worker thread indefinitely.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return base64.b64encode(response.content).decode('utf-8')
    except Exception as e:
        logger.error(f"Failed to download/encode image: {e}")
        return ""
# ============================================================
# Image Generation (Doubao / Volcengine)
# ============================================================
def generate_scene_image(
    scene: Dict[str, Any],
    brief: Optional[Dict[str, Any]] = None,
    reference_images: Optional[List[str]] = None
) -> str:
    """
    Generate an image for one scene via the Volcengine (Doubao) image API.

    Uses raw ``requests`` (not the OpenAI client) to match the documented
    curl example exactly.

    Args:
        scene: Scene dict; reads "image_prompt" (preferred), "keyframe", "id".
        brief: Optional creative brief; "product_visual_description" is
            injected into the prompt to keep the product visually consistent.
        reference_images: Accepted for interface compatibility; not used here.

    Returns:
        Public R2 URL of the uploaded image.

    Raises:
        ValueError: if the API returns a non-200 status or no image data.
    """
    # Prefer the scene's explicit prompt; otherwise compose one from the
    # keyframe description.
    image_prompt = scene.get("image_prompt", "")
    if not image_prompt:
        keyframe = scene.get("keyframe", {})
        # Stronger style-consistency intro
        parts = ["Cinematic shot, 8k, photorealistic"]
        if brief and brief.get("product_visual_description"):
            parts.append(f"Product: {brief['product_visual_description']}")
        parts.extend([
            f"Subject: {keyframe.get('subject', 'product')}",
            f"Environment: {keyframe.get('environment', 'studio')}",
            f"Action: {keyframe.get('focus', '')}"
        ])
        image_prompt = ", ".join(parts)
    # Prepend the product description when missing from the prompt, to
    # enforce consistency across all generated scenes.
    if brief and brief.get("product_visual_description"):
        if brief['product_visual_description'] not in image_prompt:
            image_prompt = f"{brief['product_visual_description']}, {image_prompt}"
    logger.info(f"Generating image (Volcengine): {image_prompt[:50]}...")
    url = f"{config.VOLC_BASE_URL}/images/generations"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }
    # Payload matching the documented curl example
    payload = {
        "model": config.IMAGE_MODEL_ID,
        "prompt": image_prompt,
        "sequential_image_generation": "disabled",
        "response_format": "b64_json",  # base64 avoids temp-URL expiration issues
        "size": "2K",
        "stream": False,
        "watermark": True
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code != 200:
            logger.error(f"Image API Error: {response.text}")
            raise ValueError(f"Image API failed: {response.status_code} - {response.text}")
        data = response.json()
        # Extract image bytes: prefer inline base64, fall back to downloading
        # the returned URL (temp URLs expire, so grab the bytes immediately).
        image_data = None
        if "data" in data and len(data["data"]) > 0:
            image_data = data["data"][0].get("b64_json")
            if not image_data:
                img_url = data["data"][0].get("url")
                if img_url:
                    image_data = _download_as_base64(img_url)
        if not image_data:
            raise ValueError("No image data returned")
        # Decode, save locally, then upload to R2
        filename = f"scene_{scene.get('id', 0)}_{int(time.time())}.jpg"
        local_path = config.TEMP_DIR / filename
        with open(local_path, "wb") as f:
            f.write(base64.b64decode(image_data))
        r2_url = storage.upload_file(str(local_path))
        logger.info(f"Scene {scene.get('id', '?')} image uploaded: {r2_url}")
        return r2_url
    except Exception as e:
        logger.error(f"Image Generation Failed: {e}")
        raise
def generate_all_scene_images_concurrent(
    scenes: List[Dict[str, Any]],
    brief: Optional[Dict[str, Any]] = None,
    reference_images: Optional[List[str]] = None,
    max_workers: int = 3
) -> List[str]:
    """Generate images for all scenes concurrently.

    Args:
        scenes: Scene dicts, passed through to generate_scene_image.
        brief: Optional creative brief shared by all scenes.
        reference_images: Passed through; unused downstream.
        max_workers: Thread pool size (bounds concurrent API calls).

    Returns:
        URLs aligned index-for-index with *scenes*; a failed scene leaves
        None at its slot, so callers must handle gaps.
    """
    logger.info(f"Generating {len(scenes)} images concurrently...")
    image_urls: List[Optional[str]] = [None] * len(scenes)

    def generate_single(index: int, scene: Dict[str, Any]) -> tuple:
        # Carry the index alongside the result so completion order is irrelevant
        url = generate_scene_image(scene, brief, reference_images)
        return index, url

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(generate_single, i, scene): i
            for i, scene in enumerate(scenes)
        }
        for future in as_completed(futures):
            index = futures[future]
            try:
                _, url = future.result()
                image_urls[index] = url
            except Exception as e:
                # Log and continue; other scenes should still complete.
                logger.error(f"Scene {index+1} failed: {e}")
    return image_urls
# ============================================================
# Video Generation (Doubao Video / PixelDance)
# ============================================================
def generate_scene_video(
    start_frame_url: str,
    motion_prompt: str,
    duration: int = 5
) -> str:
    """
    Generate a video clip via the Volcengine async task API.

    Creates a generation task, polls it until it succeeds or fails, then
    downloads the result and uploads it to R2.

    Args:
        start_frame_url: Optional image URL used as the clip's first frame.
        motion_prompt: Text prompt describing motion / camera work.
        duration: Clip length in seconds (passed as an inline --duration flag).

    Returns:
        Public R2 URL of the uploaded video.

    Raises:
        ValueError: on task-creation failure, generation failure, missing
            task ID, missing result URL, or a failed download.
        TimeoutError: if polling exhausts its retries.
    """
    logger.info(f"Generating video (Volcengine): {motion_prompt[:50]}...")
    # 1. Create the generation task
    create_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }
    # Content list: text prompt (with inline generation flags) + optional first frame
    content_list = [
        {
            "type": "text",
            "text": f"{motion_prompt} --resolution 1080p --duration {duration} --camerafixed false --watermark true"
        }
    ]
    if start_frame_url:
        content_list.append({
            "type": "image_url",
            "image_url": {"url": start_frame_url}
        })
    payload = {
        "model": config.VIDEO_MODEL_ID,
        "content": content_list
    }
    try:
        response = requests.post(create_url, headers=headers, json=payload, timeout=30)
        if response.status_code != 200:
            # 202 Accepted is also a valid response for async task creation
            if response.status_code != 202:
                logger.error(f"Video Task Creation Error: {response.text}")
                raise ValueError(f"Video Task failed: {response.status_code} - {response.text}")
        data = response.json()
        # Task ID may be top-level or nested under "data"
        task_id = data.get("id")
        if not task_id:
            task_id = data.get("data", {}).get("id")
        if not task_id:
            raise ValueError(f"No Task ID returned: {data}")
        logger.info(f"Video Task Created: {task_id}. Polling for result...")
        # 2. Poll GET /contents/generations/tasks/{id} until done
        max_retries = 60  # 5 mins max (5s interval)
        video_url = None
        for _ in range(max_retries):
            time.sleep(5)
            status_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks/{task_id}"
            resp = requests.get(status_url, headers=headers, timeout=30)
            if resp.status_code == 200:
                res_data = resp.json()
                # Status may be top-level or under "data"
                status = res_data.get("status")
                if not status and "data" in res_data:
                    status = res_data["data"].get("status")
                if status == "succeeded" or status == "SUCCEEDED":
                    # Extract the video URL from the content list
                    content = res_data.get("data", {}).get("content", [])
                    if not content and "content" in res_data:
                        content = res_data["content"]
                    # Items may carry "video_url" or just "url"
                    for item in content:
                        if item.get("video_url"):
                            video_url = item["video_url"]
                            break
                        if item.get("url"):
                            video_url = item["url"]
                            break
                    if video_url:
                        break
                    # FIX: a succeeded task with no URL will never improve —
                    # fail fast instead of polling until the timeout fires.
                    raise ValueError(f"Task succeeded but no video URL in response: {res_data}")
                elif status == "failed" or status == "FAILED":
                    reason = res_data.get("data", {}).get("error", "Unknown error")
                    raise ValueError(f"Video Generation Failed: {reason}")
                # still running/queued — keep waiting
        if not video_url:
            raise TimeoutError("Video generation timed out or failed to return URL.")
        # 3. Download the result and upload it to R2
        logger.info(f"Video Generated. Downloading: {video_url}")
        filename = f"vid_doubao_{int(time.time())}.mp4"
        local_path = config.TEMP_DIR / filename
        # FIX: added timeout so a stalled download cannot hang the worker forever
        resp = requests.get(video_url, stream=True, timeout=300)
        if resp.status_code != 200:
            raise ValueError(f"Failed to download generated video: {resp.status_code}")
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
        r2_url = storage.upload_file(str(local_path))
        return r2_url
    except Exception as e:
        logger.error(f"Video Generation Error: {e}")
        raise
def generate_all_scene_videos_concurrent(
    scenes: List[Dict[str, Any]],
    image_urls: List[str],
    max_workers: int = 2
) -> List[str]:
    """Generate videos concurrently.

    Each scene's start frame is taken from image_urls at the same index;
    the returned list is aligned with *scenes* (None where a scene failed).
    """
    logger.info(f"Generating {len(scenes)} videos concurrently...")
    results = [None] * len(scenes)

    def _render_one(idx, scene_data, frame_url):
        # Motion prompt = optional image prompt + camera movement
        movement = scene_data.get("camera_movement", "slow zoom")
        prompt = scene_data.get("image_prompt")
        if prompt:
            movement = f"{prompt}. {movement}"
        clip_len = scene_data.get("duration", 5)
        return idx, generate_scene_video(frame_url, movement, clip_len)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for idx, scene_data in enumerate(scenes):
            fut = pool.submit(_render_one, idx, scene_data, image_urls[idx])
            pending[fut] = idx
        for fut in as_completed(pending):
            idx = pending[fut]
            try:
                done_idx, clip_url = fut.result()
                results[done_idx] = clip_url
            except Exception as e:
                logger.error(f"Scene {idx+1} video failed: {e}")
    return results
# ============================================================
# Audio Generation (ElevenLabs)
# ============================================================
def generate_voiceover(text: str, style: str = "") -> str:
    """Generate voiceover audio via ElevenLabs. Returns R2 URL ("" on failure)."""
    if not text or not text.strip():
        return ""
    # ASMR styles want a more expressive, less stable delivery
    asmr_mode = "ASMR" in style
    stability = 0.3 if asmr_mode else 0.5
    similarity = 0.9 if asmr_mode else 0.8
    logger.info(f"Generating voiceover ({len(text)} chars, style={style})...")
    try:
        tts_client = ElevenLabs(api_key=config.XI_KEY)
        stream = tts_client.text_to_speech.convert(
            voice_id=config.ELEVENLABS_VOICE_ID,
            text=text,
            model_id=config.ELEVENLABS_MODEL,
            voice_settings=VoiceSettings(stability=stability, similarity_boost=similarity)
        )
        out_file = config.TEMP_DIR / f"vo_{int(time.time())}.mp3"
        with open(out_file, "wb") as fh:
            for piece in stream:
                fh.write(piece)
        return storage.upload_file(str(out_file))
    except Exception as e:
        logger.error(f"Voiceover failed: {e}")
        return ""
def generate_full_voiceover(scenes: List[Dict[str, Any]], style: str = "") -> str:
    """Generate one combined voiceover covering every scene's narration.

    Lines that are empty, whitespace-only, or parenthesized stage
    directions (starting with "(") are skipped. Returns "" when nothing
    remains to speak.
    """
    spoken = []
    for scene in scenes:
        line = scene.get("voiceover", "")
        if not line or not line.strip():
            continue
        if line.startswith("("):
            continue
        spoken.append(line.strip())
    if not spoken:
        return ""
    return generate_voiceover(" ".join(spoken), style)
# ============================================================
# Audio Generation (Edge TTS - free Chinese speech synthesis)
# ============================================================
# Edge TTS Chinese voice presets (free, good quality)
EDGE_TTS_VOICES = {
    # Female voices
    "sweet_female": "zh-CN-XiaoxiaoNeural",  # Xiaoxiao - sweet & lively (recommended)
    "gentle_female": "zh-CN-XiaoyiNeural",  # Xiaoyi - gentle & refined
    "lively_female": "zh-CN-XiaochenNeural",  # Xiaochen - lively & cute
    "broadcast_female": "zh-CN-XiaoqiuNeural",  # Xiaoqiu - news broadcast
    # Male voices
    "general_male": "zh-CN-YunxiNeural",  # Yunxi - warm male voice
    "broadcast_male": "zh-CN-YunjianNeural",  # Yunjian - professional broadcast
}
# Volcengine TTS voice presets (requires the service to be enabled) -
# voices chosen to suit Douyin live-commerce content
VOLC_TTS_VOICES = {
    # Douyin-commerce-friendly female voices
    "sweet_female": "zh_female_vv_uranus_bigtts",  # viv 2.0 general female (sweet)
    "lively_female": "zh_female_jitangnv_saturn_bigtts",  # "Jitangnv" (energetic)
    "broadcast_female": "zh_male_ruyaichen_saturn_bigtts",  # "Ruyaichen" (news broadcast) - NOTE(review): this ID looks male; swap to zh_female_meilinyou_saturn_bigtts for a female broadcaster
    "meilinvyou": "zh_female_meilinvyou_saturn_bigtts",
    # Male voices
    "general_male": "zh_male_dayi_saturn_bigtts",  # "Dayi" (steady male voice)
}
def generate_voiceover_edge(
    text: str,
    voice_type: str = "sweet_female",
    rate: str = "+0%",
    volume: str = "+0%",
    output_path: Optional[str] = None
) -> str:
    """
    Generate Chinese narration with Edge TTS (free, good quality).

    Args:
        text: Narration text.
        voice_type: Key into EDGE_TTS_VOICES, or a raw Edge voice name.
        rate: Speed adjustment, e.g. "+10%", "-20%".
        volume: Volume adjustment, e.g. "+10%", "-20%".
        output_path: Output file path; auto-generated under TEMP_DIR if omitted.

    Returns:
        Path to the generated audio file, or "" on failure.
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""
    # FIX: lazy imports moved below the empty-text guard so trivially-empty
    # calls never pay for (or fail on) the optional edge_tts dependency.
    import asyncio
    import edge_tts
    # Resolve preset to an Edge voice; unknown names pass through as-is.
    voice = EDGE_TTS_VOICES.get(voice_type, voice_type)
    logger.info(f"Generating voiceover (Edge TTS): {len(text)} chars, voice={voice}")
    if not output_path:
        filename = f"vo_edge_{int(time.time())}.mp3"
        output_path = str(config.TEMP_DIR / filename)

    async def _generate():
        communicate = edge_tts.Communicate(text, voice, rate=rate, volume=volume)
        await communicate.save(output_path)

    # Retry: Edge TTS occasionally drops the connection or writes empty files.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            asyncio.run(_generate())
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                logger.info(f"Edge TTS voiceover generated: {output_path}")
                return output_path
            # FIX: the empty-file case previously retried instantly and
            # silently; log it and back off like the exception path.
            logger.warning(f"Edge TTS attempt {attempt+1} produced an empty file")
        except Exception as e:
            logger.warning(f"Edge TTS attempt {attempt+1} failed: {e}")
        time.sleep(1.0)  # brief backoff before retrying
    logger.error("Edge TTS failed after retries.")
    return ""
def generate_voiceover_volcengine_ws(
    text: str,
    voice_type: str = "sweet_female",
    output_path: Optional[str] = None,
    timeout: int = 120
) -> str:
    """
    Generate TTS audio via the Volcengine WebSocket binary demo.

    Shells out to the official demo script under a dedicated virtualenv:
    /Volumes/Tony/video-flow/volcengine_binary_demo/.venv/bin/python
    NOTE(review): these are machine-specific absolute paths — consider
    moving them into config so the module works on other hosts.

    Args:
        text: Narration text.
        voice_type: Key into VOLC_TTS_VOICES, or a raw voice ID.
        output_path: Destination file path (auto-generated if omitted).
        timeout: Subprocess timeout in seconds.

    Returns:
        Path to the generated MP3, or "" on any failure (best-effort;
        never raises).
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS (ws)")
        return ""
    # Resolve preset to a concrete voice ID; unknown names pass through as-is.
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)
    venv_python = Path("/Volumes/Tony/video-flow/volcengine_binary_demo/.venv/bin/python")
    demo_script = Path("/Volumes/Tony/video-flow/volcengine_binary_demo/examples/volcengine/binary.py")
    if not venv_python.exists() or not demo_script.exists():
        logger.error("Volcengine WS demo or venv not found. Please install under volcengine_binary_demo/.venv")
        return ""
    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_ws_{int(time.time())}.mp3")
    cmd = [
        str(venv_python),
        str(demo_script),
        "--appid", config.VOLC_TTS_APPID,
        "--access_token", config.VOLC_TTS_ACCESS_TOKEN,
        "--voice_type", voice_id,
        "--text", text,
        "--encoding", "mp3",
    ]
    logger.info(f"Calling Volcengine WS TTS: voice={voice_id}, len={len(text)}")
    try:
        result = subprocess.run(
            cmd,
            cwd="/Volumes/Tony/video-flow/volcengine_binary_demo",
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if result.returncode != 0:
            logger.error(f"Volc WS TTS failed: {result.stderr}")
            return ""
        # The demo writes its output as <voice_type>.mp3 in its own cwd.
        demo_out = Path("/Volumes/Tony/video-flow/volcengine_binary_demo") / f"{voice_id}.mp3"
        if not demo_out.exists():
            logger.error("Volc WS TTS output not found")
            return ""
        # Copy the demo's output to the requested destination.
        Path(output_path).write_bytes(demo_out.read_bytes())
        logger.info(f"Volc WS TTS saved to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volc WS TTS error: {e}")
        return ""
def generate_voiceover_volcengine(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    volume_ratio: float = 1.0,
    pitch_ratio: float = 1.0,
    output_path: Optional[str] = None
) -> str:
    """
    Generate Chinese narration with Volcengine TTS.

    Tries the WebSocket binary demo first, then the HTTP endpoint, and
    finally falls back to Edge TTS.

    Args:
        text: Narration text.
        voice_type: Key into VOLC_TTS_VOICES, or a raw voice ID.
        speed_ratio: Speed, 0.5-2.0 (default 1.0).
        volume_ratio: Volume, 0.5-2.0 (default 1.0).
        pitch_ratio: Pitch, 0.5-2.0 (default 1.0).
        output_path: Output file path (auto-generated if omitted).

    Returns:
        Path to the generated audio file, or "" on total failure.
    """
    import uuid
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""

    def _edge_fallback() -> str:
        # FIX: this fallback was duplicated verbatim in three error paths.
        # Map Volc-only presets to a safe Edge default.
        fallback_voice = "sweet_female" if voice_type not in EDGE_TTS_VOICES else voice_type
        return generate_voiceover_edge(text, fallback_voice, output_path=output_path)

    # Resolve preset to a voice ID (Volc preset table + raw-ID passthrough)
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)
    logger.info(f"Generating voiceover (Volcengine TTS): {len(text)} chars, voice={voice_id}")
    # Try the WebSocket binary path first (the verified-working official demo)
    ws_path = generate_voiceover_volcengine_ws(text, voice_type, output_path)
    if ws_path:
        return ws_path
    # WS path failed — fall back to the HTTP endpoint
    url = "https://openspeech.bytedance.com/api/v1/tts"
    headers = {
        "Content-Type": "application/json",
        # Volc TTS uses the non-standard "Bearer;<token>" auth format
        "Authorization": f"Bearer;{config.VOLC_TTS_ACCESS_TOKEN}"
    }
    payload = {
        "app": {
            "appid": config.VOLC_TTS_APPID,
            "token": config.VOLC_TTS_ACCESS_TOKEN,
            "cluster": "volcano_tts"
        },
        "user": {
            "uid": "video_flow_user"
        },
        "audio": {
            "voice_type": voice_id,
            "encoding": "mp3",
            "speed_ratio": speed_ratio,
            "volume_ratio": volume_ratio,
            "pitch_ratio": pitch_ratio
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "text_type": "plain",
            "operation": "query",
            "with_timestamp": "1",
            "extra_param": json.dumps({
                "disable_markdown_filter": False
            })
        }
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code != 200:
            logger.error(f"Volcengine TTS Error: {response.status_code} - {response.text}")
            return _edge_fallback()
        data = response.json()
        ret_code = data.get("code")
        # 3000 / 20000000 are success codes from different API generations
        if ret_code not in (0, 3000, 20000000):
            error_msg = data.get("message", "Unknown error")
            logger.error(f"Volcengine TTS Error: {error_msg}")
            return _edge_fallback()
        # Audio arrives base64-encoded in "data"
        audio_data = data.get("data", "")
        if not audio_data:
            raise ValueError("No audio data returned")
        if not output_path:
            filename = f"vo_volc_{int(time.time())}.mp3"
            output_path = str(config.TEMP_DIR / filename)
        with open(output_path, "wb") as f:
            f.write(base64.b64decode(audio_data))
        logger.info(f"Voiceover generated (HTTP): {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Volcengine TTS HTTP error: {e}")
        return _edge_fallback()
def _split_text_into_chunks(text: str, max_chunk_length: int) -> List[str]:
    """Split *text* at sentence boundaries into chunks of at most
    max_chunk_length characters (a single over-long sentence may exceed it).
    """
    # Split on CJK/ASCII sentence terminators, keeping the delimiters
    sentences = re.split(r'([。!?;.!?;])', text)
    chunks: List[str] = []
    current_chunk = ""
    # Walk (sentence, delimiter) pairs, packing them greedily into chunks
    for i in range(0, len(sentences) - 1, 2):
        sentence = sentences[i] + sentences[i + 1]
        if len(current_chunk) + len(sentence) <= max_chunk_length:
            current_chunk += sentence
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    # re.split with a captured delimiter leaves a trailing un-terminated
    # fragment when the element count is odd; glue it onto the last chunk.
    if len(sentences) % 2 == 1 and sentences[-1]:
        if chunks:
            chunks[-1] += sentences[-1]
        else:
            chunks.append(sentences[-1])
    return chunks
def generate_voiceover_volcengine_long(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    output_path: Optional[str] = None,
    max_chunk_length: int = 300
) -> str:
    """
    Volcengine TTS for long text (automatic chunked synthesis).

    Text longer than max_chunk_length is split at sentence boundaries,
    synthesized chunk by chunk, and concatenated with FFmpeg.

    Args:
        text: Narration text.
        voice_type: Voice preset name or raw voice ID.
        speed_ratio: Speed, 0.5-2.0.
        output_path: Output file path (auto-generated if omitted).
        max_chunk_length: Maximum characters per TTS request.

    Returns:
        Path to the (merged) audio file.

    Raises:
        ValueError: if every chunk fails to synthesize.
    """
    if len(text) <= max_chunk_length:
        # Short enough for a single request
        return generate_voiceover_volcengine(
            text=text,
            voice_type=voice_type,
            speed_ratio=speed_ratio,
            output_path=output_path
        )
    logger.info(f"Long text ({len(text)} chars), splitting into chunks...")
    chunks = _split_text_into_chunks(text, max_chunk_length)
    logger.info(f"Split into {len(chunks)} chunks")
    # Synthesize each chunk; keep going when an individual chunk fails
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_path = str(config.TEMP_DIR / f"vo_chunk_{i}_{int(time.time())}.mp3")
        try:
            path = generate_voiceover_volcengine(
                text=chunk,
                voice_type=voice_type,
                speed_ratio=speed_ratio,
                output_path=chunk_path
            )
            # FIX: generate_voiceover_volcengine returns "" on total failure;
            # the original appended it anyway, corrupting the FFmpeg concat
            # list with a `file ''` entry. Keep only real paths.
            if path:
                chunk_files.append(path)
            else:
                logger.error(f"Chunk {i} produced no audio")
        except Exception as e:
            logger.error(f"Chunk {i} failed: {e}")
            # continue with the remaining chunks
    if not chunk_files:
        raise ValueError("All TTS chunks failed")
    # Single surviving chunk: nothing to merge
    if len(chunk_files) == 1:
        if output_path:
            import shutil
            shutil.move(chunk_files[0], output_path)
            return output_path
        return chunk_files[0]
    # Build the FFmpeg concat list file
    concat_list = config.TEMP_DIR / f"concat_audio_{os.getpid()}.txt"
    with open(concat_list, "w") as f:
        for cf in chunk_files:
            f.write(f"file '{cf}'\n")
    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_merged_{int(time.time())}.mp3")
    # Stream-copy concat (no re-encode); subprocess/re are module-level imports
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat",
        "-safe", "0",
        "-i", str(concat_list),
        "-c", "copy",
        output_path
    ]
    subprocess.run(cmd, capture_output=True, check=True)
    # Best-effort cleanup of intermediate files
    for cf in chunk_files:
        try:
            os.remove(cf)
        except OSError:  # FIX: was a bare except, which also swallows SystemExit etc.
            pass
    concat_list.unlink(missing_ok=True)
    logger.info(f"Merged voiceover: {output_path}")
    return output_path
def generate_scene_voiceovers_volcengine(
    scenes: List[Dict[str, Any]],
    voice_type: str = "sweet_female",
    output_dir: Optional[str] = None
) -> List[str]:
    """
    Generate a separate narration audio file for each scene.

    Args:
        scenes: Scene list; each scene may carry a "voiceover" text field.
        voice_type: Voice preset name.
        output_dir: Output directory (defaults to config.TEMP_DIR).

    Returns:
        Audio file paths aligned with *scenes*; "" for scenes with no
        narration or whose synthesis failed.
    """
    if output_dir:
        output_dir = Path(output_dir)
        # FIX: parents=True so nested paths (e.g. "out/run1/audio") work too
        output_dir.mkdir(parents=True, exist_ok=True)
    else:
        output_dir = config.TEMP_DIR
    audio_paths: List[str] = []
    for i, scene in enumerate(scenes):
        vo_text = scene.get("voiceover", "")
        # Skip empty narration and parenthesized stage directions like "(bgm)"
        if not vo_text or not vo_text.strip() or vo_text.startswith("("):
            audio_paths.append("")
            continue
        try:
            output_path = str(output_dir / f"scene_{i+1}_vo.mp3")
            path = generate_voiceover_volcengine(
                text=vo_text.strip(),
                voice_type=voice_type,
                output_path=output_path
            )
            audio_paths.append(path)
        except Exception as e:
            logger.error(f"Scene {i+1} voiceover failed: {e}")
            audio_paths.append("")
    return audio_paths