feat: video-flow initial commit
- app.py: Streamlit UI for video generation workflow - main_flow.py: CLI tool with argparse support - modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.) - config.py: Configuration with API keys and paths - requirements.txt: Python dependencies - docs/: System prompt documentation
This commit is contained in:
801
modules/factory.py
Normal file
801
modules/factory.py
Normal file
@@ -0,0 +1,801 @@
|
||||
"""
|
||||
MatchMe Studio - Factory Module (Concurrent Scene Generation)
|
||||
Using Volcengine (Doubao) API for Image and Video
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import requests
|
||||
import json
|
||||
import re
|
||||
import base64
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List, Optional
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from elevenlabs import ElevenLabs, VoiceSettings
|
||||
from openai import OpenAI
|
||||
|
||||
import config
|
||||
from modules import storage
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Initialize OpenAI Client for Volcengine Image Generation
# NOTE(review): nothing in the visible part of this module reads `client`
# (generate_scene_image posts with `requests` directly) - confirm whether
# another module imports it before removing.
client = OpenAI(
    api_key=config.VOLC_API_KEY,
    base_url=config.VOLC_BASE_URL
)
|
||||
|
||||
# ============================================================
|
||||
# Helper Functions
|
||||
# ============================================================
|
||||
|
||||
def _download_as_base64(url: str, timeout: float = 30) -> str:
    """Download the resource at ``url`` and return its bytes Base64-encoded.

    Args:
        url: HTTP(S) URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would pin a worker
            thread forever on a stalled connection.

    Returns:
        The Base64 text of the response body, or ``""`` if the download or
        encoding fails (the error is logged, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return base64.b64encode(response.content).decode('utf-8')
    except Exception as e:
        # Best-effort helper: callers treat "" as "no image available".
        logger.error(f"Failed to download/encode image: {e}")
        return ""
|
||||
|
||||
# ============================================================
|
||||
# Image Generation (Doubao / Volcengine)
|
||||
# ============================================================
|
||||
|
||||
def _build_image_prompt(scene: Dict[str, Any], brief: Optional[Dict[str, Any]]) -> str:
    """Return the text prompt for ``scene``, prefixed with the product description when missing."""
    product_desc = (brief or {}).get("product_visual_description")

    prompt = scene.get("image_prompt", "")
    if not prompt:
        # Fallback: assemble a prompt from the scene's keyframe fields.
        keyframe = scene.get("keyframe") or {}
        parts = ["Cinematic shot, 8k, photorealistic"]
        if product_desc:
            parts.append(f"Product: {product_desc}")
        parts.extend([
            f"Subject: {keyframe.get('subject', 'product')}",
            f"Environment: {keyframe.get('environment', 'studio')}",
            f"Action: {keyframe.get('focus', '')}"
        ])
        prompt = ", ".join(parts)

    # Enforce product consistency: make sure the description is in the prompt.
    if product_desc and product_desc not in prompt:
        prompt = f"{product_desc}, {prompt}"
    return prompt


def generate_scene_image(
    scene: Dict[str, Any],
    brief: Dict[str, Any] = None,
    reference_images: List[str] = None
) -> str:
    """
    Generate one scene image through the Volcengine image endpoint and upload it.

    The request is sent with raw ``requests`` (not the OpenAI client) and asks
    for a Base64 payload; if the response only carries a URL, the image is
    downloaded instead.

    Args:
        scene: Scene dict. Reads ``image_prompt`` (or ``keyframe`` as a
            fallback) and ``id`` (used in the file name and log line).
        brief: Optional brief; ``product_visual_description`` is prepended to
            the prompt when it is not already part of it.
        reference_images: Accepted for interface compatibility; not sent to
            the API by this function.

    Returns:
        The URL returned by ``storage.upload_file`` for the saved image.

    Raises:
        ValueError: The API answered with a non-200 status or no image data.
        Exception: Any network, decoding or upload error is logged and re-raised.
    """
    import uuid

    image_prompt = _build_image_prompt(scene, brief)

    logger.info(f"Generating image (Volcengine): {image_prompt[:50]}...")

    url = f"{config.VOLC_BASE_URL}/images/generations"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }

    payload = {
        "model": config.IMAGE_MODEL_ID,
        "prompt": image_prompt,
        "sequential_image_generation": "disabled",
        "response_format": "b64_json",  # Base64 avoids temp-URL expiration issues
        "size": "2K",
        "stream": False,
        "watermark": True
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)

        if response.status_code != 200:
            logger.error(f"Image API Error: {response.text}")
            raise ValueError(f"Image API failed: {response.status_code} - {response.text}")

        data = response.json()

        # Extract image data; tolerate a missing or null "data" field.
        image_data = None
        items = data.get("data") or []
        if items:
            first = items[0]
            image_data = first.get("b64_json")
            if not image_data:
                # Fallback to URL download if b64 is not present.
                img_url = first.get("url")
                if img_url:
                    image_data = _download_as_base64(img_url)

        if not image_data:
            raise ValueError("No image data returned")

        # The random suffix keeps concurrent workers from overwriting each
        # other when scenes share (or lack) an id within the same second.
        filename = f"scene_{scene.get('id', 0)}_{int(time.time())}_{uuid.uuid4().hex[:8]}.jpg"
        local_path = config.TEMP_DIR / filename

        with open(local_path, "wb") as f:
            f.write(base64.b64decode(image_data))

        r2_url = storage.upload_file(str(local_path))
        logger.info(f"Scene {scene.get('id', '?')} image uploaded: {r2_url}")
        return r2_url

    except Exception as e:
        logger.error(f"Image Generation Failed: {e}")
        raise
|
||||
|
||||
|
||||
def generate_all_scene_images_concurrent(
    scenes: List[Dict[str, Any]],
    brief: Dict[str, Any] = None,
    reference_images: List[str] = None,
    max_workers: int = 3
) -> List[str]:
    """Render every scene image on a thread pool.

    Returns a list aligned with ``scenes``: each slot holds the URL returned
    by ``generate_scene_image`` or stays ``None`` when that scene failed
    (the failure is logged, not raised).
    """
    total = len(scenes)
    logger.info(f"Generating {total} images concurrently...")
    results = [None] * total

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to the slot it must fill.
        slot_of = {}
        for slot, scene in enumerate(scenes):
            task = pool.submit(generate_scene_image, scene, brief, reference_images)
            slot_of[task] = slot

        for done in as_completed(slot_of):
            slot = slot_of[done]
            try:
                results[slot] = done.result()
            except Exception as exc:
                logger.error(f"Scene {slot+1} failed: {exc}")

    return results
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Video Generation (Doubao Video / PixelDance)
|
||||
# ============================================================
|
||||
|
||||
def _extract_video_url(res_data: Dict[str, Any]) -> Optional[str]:
    """Return the video URL found in a task-status response, or None."""
    content = (res_data.get("data") or {}).get("content") or res_data.get("content")
    if not content:
        return None

    # `content` may be a single object ({"video_url": ...}) or a list of such
    # objects; normalise to a list so both shapes are handled.
    # NOTE(review): the object shape is what the Ark task-query docs appear to
    # show - confirm against the API reference.
    items = [content] if isinstance(content, dict) else content
    for item in items:
        if not isinstance(item, dict):
            continue
        found = item.get("video_url") or item.get("url")
        if isinstance(found, dict):
            found = found.get("url")
        if found:
            return found
    return None


def generate_scene_video(
    start_frame_url: str,
    motion_prompt: str,
    duration: int = 5
) -> str:
    """
    Generate a video through the Volcengine async task API and upload it.

    Flow: create a generation task, poll its status every 5 seconds (up to
    60 polls), download the finished video to ``config.TEMP_DIR`` and upload
    it with ``storage.upload_file``.

    Args:
        start_frame_url: Optional first-frame image URL; when falsy the task
            is created from the text prompt alone.
        motion_prompt: Text prompt; generation flags are appended to it.
        duration: Value passed as the ``--duration`` flag.

    Returns:
        The URL returned by ``storage.upload_file``.

    Raises:
        ValueError: Task creation failed, no task id was returned, the task
            reported failure, or the video download failed.
        TimeoutError: No video URL was obtained within the polling budget.
    """
    import uuid

    logger.info(f"Generating video (Volcengine): {motion_prompt[:50]}...")

    # 1. Create Task
    create_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {config.VOLC_API_KEY}"
    }

    # Content list: text prompt plus optional start-frame image.
    content_list = [
        {
            "type": "text",
            "text": f"{motion_prompt} --resolution 1080p --duration {duration} --camerafixed false --watermark true"
        }
    ]
    if start_frame_url:
        content_list.append({
            "type": "image_url",
            "image_url": {"url": start_frame_url}
        })

    payload = {
        "model": config.VIDEO_MODEL_ID,
        "content": content_list
    }

    max_retries = 60    # 5 mins max (5s interval)
    poll_interval = 5

    try:
        response = requests.post(create_url, headers=headers, json=payload, timeout=30)
        # 202 Accepted is also possible for async tasks.
        if response.status_code not in (200, 202):
            logger.error(f"Video Task Creation Error: {response.text}")
            raise ValueError(f"Video Task failed: {response.status_code} - {response.text}")

        data = response.json()
        # The id is either top-level or nested under "data".
        task_id = data.get("id") or (data.get("data") or {}).get("id")
        if not task_id:
            raise ValueError(f"No Task ID returned: {data}")

        logger.info(f"Video Task Created: {task_id}. Polling for result...")

        # 2. Poll for Result: GET /contents/generations/tasks/{id}
        status_url = f"{config.VOLC_BASE_URL}/contents/generations/tasks/{task_id}"
        video_url = None

        for _ in range(max_retries):
            time.sleep(poll_interval)
            try:
                resp = requests.get(status_url, headers=headers, timeout=30)
            except requests.RequestException as poll_err:
                # A single dropped poll must not discard a running task.
                logger.warning(f"Video task {task_id} poll error, retrying: {poll_err}")
                continue

            if resp.status_code != 200:
                continue

            res_data = resp.json()
            status = res_data.get("status") or (res_data.get("data") or {}).get("status")
            status = (status or "").lower()

            if status == "succeeded":
                video_url = _extract_video_url(res_data)
                if not video_url:
                    logger.error(f"Video task {task_id} succeeded without a video URL: {res_data}")
                # Terminal state: further polling cannot change the outcome.
                break

            if status in ("failed", "cancelled", "expired"):
                reason = (
                    res_data.get("error")
                    or (res_data.get("data") or {}).get("error")
                    or "Unknown error"
                )
                raise ValueError(f"Video Generation Failed: {reason}")

            # Otherwise (running/queued): keep waiting.

        if not video_url:
            raise TimeoutError("Video generation timed out or failed to return URL.")

        # 3. Download and Upload to R2
        logger.info(f"Video Generated. Downloading: {video_url}")
        # The random suffix prevents concurrent workers that finish in the
        # same second from writing to the same file.
        filename = f"vid_doubao_{int(time.time())}_{uuid.uuid4().hex[:8]}.mp4"
        local_path = config.TEMP_DIR / filename

        with requests.get(video_url, stream=True, timeout=60) as download:
            if download.status_code != 200:
                raise ValueError(f"Failed to download generated video: {download.status_code}")

            with open(local_path, "wb") as f:
                for chunk in download.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        r2_url = storage.upload_file(str(local_path))
        return r2_url

    except Exception as e:
        logger.error(f"Video Generation Error: {e}")
        raise
|
||||
|
||||
|
||||
def generate_all_scene_videos_concurrent(
    scenes: List[Dict[str, Any]],
    image_urls: List[str],
    max_workers: int = 2
) -> List[str]:
    """Render one video per scene on a thread pool.

    ``image_urls[i]`` is used as the start frame of ``scenes[i]``. The result
    list is aligned with ``scenes``; a slot stays ``None`` when that scene's
    video failed (the failure is logged, not raised).
    """
    count = len(scenes)
    logger.info(f"Generating {count} videos concurrently...")
    results = [None] * count

    def render(scene: Dict[str, Any], frame_url: str) -> str:
        # Prompt = image prompt (when present) followed by the camera movement.
        movement = scene.get("camera_movement", "slow zoom")
        base_prompt = scene.get("image_prompt")
        prompt = f"{base_prompt}. {movement}" if base_prompt else movement
        return generate_scene_video(frame_url, prompt, scene.get("duration", 5))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        slot_of = {}
        for slot, scene in enumerate(scenes):
            slot_of[pool.submit(render, scene, image_urls[slot])] = slot

        for done in as_completed(slot_of):
            slot = slot_of[done]
            try:
                results[slot] = done.result()
            except Exception as exc:
                logger.error(f"Scene {slot+1} video failed: {exc}")

    return results
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Audio Generation (ElevenLabs)
|
||||
# ============================================================
|
||||
|
||||
def generate_voiceover(text: str, style: str = "") -> str:
    """Generate voiceover audio with ElevenLabs and upload it.

    Args:
        text: Narration text. Empty or whitespace-only text yields ``""``.
        style: Free-form style label; when it contains ``"ASMR"`` lower
            stability and higher similarity settings are used. ``None`` is
            treated like an empty string.

    Returns:
        The URL returned by ``storage.upload_file``, or ``""`` when there is
        nothing to speak or generation/upload fails (the error is logged).
    """
    if not text or not text.strip():
        return ""

    import uuid

    # `"ASMR" in None` would raise TypeError before the try block below.
    style = style or ""
    is_asmr = "ASMR" in style
    stability = 0.3 if is_asmr else 0.5
    similarity = 0.9 if is_asmr else 0.8

    logger.info(f"Generating voiceover ({len(text)} chars, style={style})...")

    try:
        el_client = ElevenLabs(api_key=config.XI_KEY)

        audio_stream = el_client.text_to_speech.convert(
            voice_id=config.ELEVENLABS_VOICE_ID,
            text=text,
            model_id=config.ELEVENLABS_MODEL,
            voice_settings=VoiceSettings(stability=stability, similarity_boost=similarity)
        )

        # The random suffix avoids overwriting another voiceover created
        # within the same second.
        filename = f"vo_{int(time.time())}_{uuid.uuid4().hex[:8]}.mp3"
        local_path = config.TEMP_DIR / filename

        with open(local_path, "wb") as f:
            for chunk in audio_stream:
                f.write(chunk)

        r2_url = storage.upload_file(str(local_path))
        return r2_url
    except Exception as e:
        logger.error(f"Voiceover failed: {e}")
        return ""
|
||||
|
||||
|
||||
def generate_full_voiceover(scenes: List[Dict[str, Any]], style: str = "") -> str:
    """Generate one combined voiceover for all scenes.

    Each scene's ``voiceover`` text is trimmed; empty entries and entries that
    start with an opening parenthesis (ASCII or full-width, i.e. annotations
    rather than narration) are skipped. The remaining lines are joined with
    spaces and passed to ``generate_voiceover``.

    Returns:
        The result of ``generate_voiceover``, or ``""`` when no scene has
        speakable text.
    """
    lines = []
    for scene in scenes or []:
        raw = scene.get("voiceover")
        if not isinstance(raw, str):
            continue
        # Strip first so that " (pause)" is recognised as an annotation too.
        line = raw.strip()
        if line and not line.startswith(("(", "(")):
            lines.append(line)

    if not lines:
        return ""

    return generate_voiceover(" ".join(lines), style)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Audio Generation (Edge TTS - free Chinese speech synthesis)
|
||||
# ============================================================
|
||||
|
||||
# Edge TTS Chinese voice presets (free, good quality).
# Maps a preset name to the Edge TTS voice name.
EDGE_TTS_VOICES = {
    # Female voices
    "sweet_female": "zh-CN-XiaoxiaoNeural",  # Xiaoxiao - sweet and lively (recommended)
    "gentle_female": "zh-CN-XiaoyiNeural",  # Xiaoyi - gentle and intellectual
    "lively_female": "zh-CN-XiaochenNeural",  # Xiaochen - lively and cute
    "broadcast_female": "zh-CN-XiaoqiuNeural",  # Xiaoqiu - news broadcast
    # Male voices
    "general_male": "zh-CN-YunxiNeural",  # Yunxi - warm male voice
    "broadcast_male": "zh-CN-YunjianNeural",  # Yunjian - professional broadcast
}
|
||||
|
||||
# Volcengine TTS voice presets (the service must be activated) - voices
# chosen to suit Douyin product-selling videos.
# Maps a preset name to the Volcengine voice ID.
VOLC_TTS_VOICES = {
    # Female voices suited to Douyin selling
    "sweet_female": "zh_female_vv_uranus_bigtts",  # viv 2.0 general female voice (sweet)
    "lively_female": "zh_female_jitangnv_saturn_bigtts",  # "Jitang" girl (energetic)
    # Ruyaichen (news broadcast). For a female broadcaster this can be swapped
    # for zh_female_meilinyou_saturn_bigtts.
    # NOTE(review): this "female" key maps to a zh_male_* voice ID, and the ID
    # suggested above is spelled "meilinyou" while the entry below uses
    # "meilinvyou" - confirm which spelling is valid.
    "broadcast_female": "zh_male_ruyaichen_saturn_bigtts",
    "meilinvyou": "zh_female_meilinvyou_saturn_bigtts",
    # Male voices
    "general_male": "zh_male_dayi_saturn_bigtts",  # Dayi (steady male voice)
}
|
||||
|
||||
|
||||
def generate_voiceover_edge(
    text: str,
    voice_type: str = "sweet_female",
    rate: str = "+0%",
    volume: str = "+0%",
    output_path: str = None
) -> str:
    """
    Generate a Chinese voiceover with Edge TTS (free).

    Args:
        text: Narration text.
        voice_type: Preset name (see EDGE_TTS_VOICES) or an Edge TTS voice
            name used as-is.
        rate: Speaking-rate adjustment, e.g. "+10%", "-20%".
        volume: Volume adjustment, e.g. "+10%", "-20%".
        output_path: Destination file; defaults to a timestamped file under
            ``config.TEMP_DIR``.

    Returns:
        Path of the generated audio file, or ``""`` when the text is empty or
        all attempts fail.
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS")
        return ""

    # Imported only once there is something to synthesise, so empty input
    # does not require edge_tts to be installed.
    import asyncio
    import edge_tts

    # Resolve the preset name; unknown names are passed through unchanged.
    voice = EDGE_TTS_VOICES.get(voice_type, voice_type)

    logger.info(f"Generating voiceover (Edge TTS): {len(text)} chars, voice={voice}")

    if not output_path:
        filename = f"vo_edge_{int(time.time())}.mp3"
        output_path = str(config.TEMP_DIR / filename)

    async def _generate():
        communicate = edge_tts.Communicate(text, voice, rate=rate, volume=volume)
        await communicate.save(output_path)

    # Simple retry loop: an attempt counts as failed when it raises or leaves
    # an empty or missing file.
    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            asyncio.run(_generate())
        except Exception as e:
            logger.warning(f"Edge TTS attempt {attempt} failed: {e}")
        else:
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                logger.info(f"Edge TTS voiceover generated: {output_path}")
                return output_path
            logger.warning(f"Edge TTS attempt {attempt} produced no audio")

        # Back off before the next attempt, but not after the last one.
        if attempt < max_retries:
            time.sleep(1.0)

    logger.error("Edge TTS failed after retries.")
    return ""
|
||||
|
||||
|
||||
def generate_voiceover_volcengine_ws(
    text: str,
    voice_type: str = "sweet_female",
    output_path: str = None,
    timeout: int = 120
) -> str:
    """
    Generate TTS audio by running the Volcengine WebSocket binary demo script.

    Depends on the demo checkout and its virtualenv under
    /Volumes/Tony/video-flow/volcengine_binary_demo. The demo is run as a
    subprocess and writes ``<voice_id>.mp3`` into its working directory; that
    file is then copied to ``output_path``.

    Args:
        text: Narration text.
        voice_type: Preset name (see VOLC_TTS_VOICES) or a voice ID used as-is.
        output_path: Destination file; defaults to a timestamped file under
            ``config.TEMP_DIR``.
        timeout: Seconds to wait for the subprocess.

    Returns:
        ``output_path`` on success, ``""`` on any failure (logged).
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for TTS (ws)")
        return ""

    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)

    demo_dir = Path("/Volumes/Tony/video-flow/volcengine_binary_demo")
    venv_python = demo_dir / ".venv" / "bin" / "python"
    demo_script = demo_dir / "examples" / "volcengine" / "binary.py"

    if not venv_python.exists() or not demo_script.exists():
        logger.error("Volcengine WS demo or venv not found. Please install under volcengine_binary_demo/.venv")
        return ""

    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_ws_{int(time.time())}.mp3")

    # NOTE(review): the access token is passed on the command line and is
    # therefore visible in the process list while the demo runs.
    cmd = [
        str(venv_python),
        str(demo_script),
        "--appid", str(config.VOLC_TTS_APPID),
        "--access_token", str(config.VOLC_TTS_ACCESS_TOKEN),
        "--voice_type", voice_id,
        "--text", text,
        "--encoding", "mp3",
    ]

    # The demo saves its output in the working directory as <voice_type>.mp3.
    # NOTE(review): the name is fixed per voice, so concurrent calls using the
    # same voice would write the same file.
    demo_out = demo_dir / f"{voice_id}.mp3"

    logger.info(f"Calling Volcengine WS TTS: voice={voice_id}, len={len(text)}")
    try:
        # Remove output left by an earlier run so a stale file can never be
        # mistaken for the result of this call.
        demo_out.unlink(missing_ok=True)

        result = subprocess.run(
            cmd,
            cwd=str(demo_dir),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if result.returncode != 0:
            logger.error(f"Volc WS TTS failed: {result.stderr}")
            return ""

        if not demo_out.exists():
            logger.error("Volc WS TTS output not found")
            return ""

        Path(output_path).write_bytes(demo_out.read_bytes())
        logger.info(f"Volc WS TTS saved to {output_path}")
        return output_path
    except Exception as e:
        # Includes subprocess.TimeoutExpired and filesystem errors.
        logger.error(f"Volc WS TTS error: {e}")
        return ""
|
||||
|
||||
|
||||
def generate_voiceover_volcengine(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    volume_ratio: float = 1.0,
    pitch_ratio: float = 1.0,
    output_path: str = None
) -> str:
    """
    Generate a Chinese voiceover with Volcengine TTS.

    Tries the WebSocket demo path first, then the HTTP API, and finally falls
    back to Edge TTS when the HTTP call fails in any way.

    Args:
        text: Narration text.
        voice_type: Preset name (see VOLC_TTS_VOICES) or a voice ID used as-is.
        speed_ratio: Speaking speed (0.5-2.0, default 1.0); HTTP path only.
        volume_ratio: Volume (0.5-2.0, default 1.0); HTTP path only.
        pitch_ratio: Pitch (0.5-2.0, default 1.0); HTTP path only.
        output_path: Destination file (optional, auto-generated by default).

    Returns:
        Path of the audio file, or "" for empty text / when every path fails.
    """
    import uuid

    if not (text and text.strip()):
        logger.warning("Empty text provided for TTS")
        return ""

    # Resolve the voice ID (preset table, otherwise use the given value).
    voice_id = VOLC_TTS_VOICES.get(voice_type, voice_type)

    logger.info(f"Generating voiceover (Volcengine TTS): {len(text)} chars, voice={voice_id}")

    # First choice: WebSocket binary demo.
    ws_result = generate_voiceover_volcengine_ws(text, voice_type, output_path)
    if ws_result:
        return ws_result

    def edge_fallback() -> str:
        # Edge TTS only knows its own presets; use a safe default otherwise.
        edge_voice = voice_type if voice_type in EDGE_TTS_VOICES else "sweet_female"
        return generate_voiceover_edge(text, edge_voice, output_path=output_path)

    # Second choice: HTTP API.
    endpoint = "https://openspeech.bytedance.com/api/v1/tts"

    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{config.VOLC_TTS_ACCESS_TOKEN}"
    }

    app_section = {
        "appid": config.VOLC_TTS_APPID,
        "token": config.VOLC_TTS_ACCESS_TOKEN,
        "cluster": "volcano_tts"
    }
    audio_section = {
        "voice_type": voice_id,
        "encoding": "mp3",
        "speed_ratio": speed_ratio,
        "volume_ratio": volume_ratio,
        "pitch_ratio": pitch_ratio
    }
    request_section = {
        "reqid": str(uuid.uuid4()),
        "text": text,
        "text_type": "plain",
        "operation": "query",
        "with_timestamp": "1",
        "extra_param": json.dumps({
            "disable_markdown_filter": False
        })
    }
    body = {
        "app": app_section,
        "user": {"uid": "video_flow_user"},
        "audio": audio_section,
        "request": request_section
    }

    try:
        reply = requests.post(endpoint, headers=request_headers, json=body, timeout=60)

        if reply.status_code != 200:
            logger.error(f"Volcengine TTS Error: {reply.status_code} - {reply.text}")
            return edge_fallback()

        parsed = reply.json()

        # Only these response codes are treated as success.
        if parsed.get("code") not in (0, 3000, 20000000):
            error_msg = parsed.get("message", "Unknown error")
            logger.error(f"Volcengine TTS Error: {error_msg}")
            return edge_fallback()

        encoded_audio = parsed.get("data", "")
        if not encoded_audio:
            raise ValueError("No audio data returned")

        if not output_path:
            output_path = str(config.TEMP_DIR / f"vo_volc_{int(time.time())}.mp3")

        with open(output_path, "wb") as out:
            out.write(base64.b64decode(encoded_audio))

        logger.info(f"Voiceover generated (HTTP): {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Volcengine TTS HTTP error: {e}")
        return edge_fallback()
|
||||
|
||||
|
||||
def _split_tts_text(text: str, max_chunk_length: int) -> List[str]:
    """Split ``text`` at sentence punctuation into chunks of at most ``max_chunk_length`` chars."""
    # A sentence is a run of text plus its trailing terminators (Chinese
    # full-width and ASCII); text after the last terminator is its own piece.
    terminators = "。!?;.!?;"
    pattern = f"[^{terminators}]*[{terminators}]+|[^{terminators}]+"
    sentences = re.findall(pattern, text)

    chunks = []
    current = ""
    for sentence in sentences:
        # A single sentence longer than the limit is cut hard so that no
        # chunk exceeds it.
        while 0 < max_chunk_length < len(sentence):
            if current:
                chunks.append(current)
                current = ""
            chunks.append(sentence[:max_chunk_length])
            sentence = sentence[max_chunk_length:]

        if len(current) + len(sentence) <= max_chunk_length:
            current += sentence
        else:
            if current:
                chunks.append(current)
            current = sentence

    if current:
        chunks.append(current)

    # Whitespace-only chunks contain nothing to speak.
    return [chunk for chunk in chunks if chunk.strip()]


def generate_voiceover_volcengine_long(
    text: str,
    voice_type: str = "sweet_female",
    speed_ratio: float = 1.0,
    output_path: str = None,
    max_chunk_length: int = 300
) -> str:
    """
    Volcengine TTS for long text (automatic chunked synthesis).

    Text longer than ``max_chunk_length`` is split on sentence boundaries,
    each chunk is synthesised with ``generate_voiceover_volcengine`` and the
    resulting files are concatenated with FFmpeg.

    Args:
        text: Narration text.
        voice_type: Preset name or voice ID, passed through to the synthesiser.
        speed_ratio: Speaking speed, passed through to the synthesiser.
        output_path: Destination file; defaults to a timestamped file under
            ``config.TEMP_DIR``.
        max_chunk_length: Maximum characters per synthesis request.

    Returns:
        Path of the final audio file, or ``""`` for empty text.

    Raises:
        ValueError: Every chunk failed to synthesise.
        subprocess.CalledProcessError: FFmpeg failed to merge the chunks.
    """
    if not text or not text.strip():
        return ""

    if len(text) <= max_chunk_length:
        return generate_voiceover_volcengine(
            text=text,
            voice_type=voice_type,
            speed_ratio=speed_ratio,
            output_path=output_path
        )

    import shutil
    import uuid

    logger.info(f"Long text ({len(text)} chars), splitting into chunks...")

    chunks = _split_tts_text(text, max_chunk_length)
    logger.info(f"Split into {len(chunks)} chunks")

    # Per-call token so parallel calls never share temp file names.
    run_id = uuid.uuid4().hex[:8]

    # Synthesise each chunk; a failed chunk is skipped so the rest still merge.
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_path = str(config.TEMP_DIR / f"vo_chunk_{i}_{int(time.time())}_{run_id}.mp3")
        try:
            path = generate_voiceover_volcengine(
                text=chunk,
                voice_type=voice_type,
                speed_ratio=speed_ratio,
                output_path=chunk_path
            )
        except Exception as e:
            logger.error(f"Chunk {i} failed: {e}")
            continue

        # The synthesiser signals failure with "" rather than an exception;
        # an empty path must not reach the FFmpeg concat list.
        if path:
            chunk_files.append(path)
        else:
            logger.error(f"Chunk {i} failed: no audio returned")

    if not chunk_files:
        raise ValueError("All TTS chunks failed")

    # A single chunk needs no merge.
    if len(chunk_files) == 1:
        if output_path:
            shutil.move(chunk_files[0], output_path)
            return output_path
        return chunk_files[0]

    concat_list = config.TEMP_DIR / f"concat_audio_{os.getpid()}_{run_id}.txt"

    if not output_path:
        output_path = str(config.TEMP_DIR / f"vo_volc_merged_{int(time.time())}.mp3")

    try:
        with open(concat_list, "w", encoding="utf-8") as f:
            for cf in chunk_files:
                # The concat demuxer resolves relative paths against the list
                # file's directory, so write absolute paths; a single quote
                # inside a quoted path is escaped as '\''.
                escaped = str(Path(cf).resolve()).replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")

        # Merge with FFmpeg (stream copy, no re-encode).
        cmd = [
            "ffmpeg", "-y",
            "-f", "concat",
            "-safe", "0",
            "-i", str(concat_list),
            "-c", "copy",
            output_path
        ]
        subprocess.run(cmd, capture_output=True, check=True)
    finally:
        # Clean up temp files whether or not the merge succeeded.
        for cf in chunk_files:
            try:
                os.remove(cf)
            except OSError:
                pass
        concat_list.unlink(missing_ok=True)

    logger.info(f"Merged voiceover: {output_path}")
    return output_path
|
||||
|
||||
|
||||
def generate_scene_voiceovers_volcengine(
    scenes: List[Dict[str, Any]],
    voice_type: str = "sweet_female",
    output_dir: str = None
) -> List[str]:
    """
    Generate a separate voiceover audio file for each scene.

    Args:
        scenes: Scene list; each scene may carry a ``voiceover`` field.
        voice_type: Preset name or voice ID, passed through to the synthesiser.
        output_dir: Output directory (created, including parents, if missing);
            defaults to ``config.TEMP_DIR``.

    Returns:
        A list aligned with ``scenes``: the audio file path for each scene, or
        ``""`` when the scene has no narration, its text is an annotation
        (starts with an opening parenthesis), or synthesis failed.
    """
    if output_dir:
        output_dir = Path(output_dir)
        # parents=True so that a nested, not-yet-existing directory works.
        output_dir.mkdir(parents=True, exist_ok=True)
    else:
        output_dir = config.TEMP_DIR

    audio_paths = []

    for i, scene in enumerate(scenes):
        raw = scene.get("voiceover")
        # Strip before testing so " (pause)" is recognised as an annotation.
        vo_text = raw.strip() if isinstance(raw, str) else ""

        if not vo_text or vo_text.startswith(("(", "(")):
            # No narration, or an annotation rather than spoken text.
            audio_paths.append("")
            continue

        try:
            output_path = str(output_dir / f"scene_{i+1}_vo.mp3")
            path = generate_voiceover_volcengine(
                text=vo_text,
                voice_type=voice_type,
                output_path=output_path
            )
            audio_paths.append(path)
        except Exception as e:
            logger.error(f"Scene {i+1} voiceover failed: {e}")
            audio_paths.append("")

    return audio_paths
|
||||
Reference in New Issue
Block a user