feat: video-flow initial commit

- app.py: Streamlit UI for video generation workflow
- main_flow.py: CLI tool with argparse support
- modules/: Business logic modules (script_gen, image_gen, video_gen, composer, etc.)
- config.py: Configuration with API keys and paths
- requirements.txt: Python dependencies
- docs/: System prompt documentation
This commit is contained in:
Tony Zhang
2025-12-12 19:18:27 +08:00
commit 33a165a615
34 changed files with 12012 additions and 0 deletions

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
import argparse
import json
import logging
import uuid
import websockets
from protocols import MsgType, full_client_request, receive_message
# Configure root logging once at import time; module-level logger per stdlib convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_cluster(voice: str) -> str:
    """Pick the TTS cluster for *voice*.

    Voice types prefixed with ``S_`` are ICL (cloned) voices and route to
    the ``volcano_icl`` cluster; everything else uses ``volcano_tts``.
    """
    return "volcano_icl" if voice.startswith("S_") else "volcano_tts"
async def main():
    """CLI entry point: synthesize speech over the ByteDance TTS WebSocket API.

    Parses command-line arguments, connects to the TTS endpoint, submits one
    synthesis request, collects the streamed audio frames, and writes the
    result to ``<voice_type>.<encoding>`` in the current directory.

    Raises:
        RuntimeError: if the server reports a failure or no audio is received.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--appid", required=True, help="APP ID")
    parser.add_argument("--access_token", required=True, help="Access Token")
    parser.add_argument("--voice_type", required=True, help="Voice type")
    parser.add_argument("--cluster", default="", help="Cluster name")
    parser.add_argument("--text", required=True, help="Text to convert")
    parser.add_argument("--encoding", default="wav", help="Output file encoding")
    parser.add_argument(
        "--endpoint",
        default="wss://openspeech.bytedance.com/api/v1/tts/ws_binary",
        help="WebSocket endpoint URL",
    )
    args = parser.parse_args()

    # Explicit --cluster wins; otherwise infer it from the voice-type prefix.
    cluster = args.cluster if args.cluster else get_cluster(args.voice_type)

    # The API expects "Bearer;<token>" (semicolon, not space) in Authorization.
    headers = {
        "Authorization": f"Bearer;{args.access_token}",
    }
    logger.info(f"Connecting to {args.endpoint} with headers: {headers}")

    # Use the library's async context manager so the connection is closed
    # even if an exception escapes (replaces manual try/finally + close()).
    async with websockets.connect(
        args.endpoint, additional_headers=headers, max_size=10 * 1024 * 1024
    ) as websocket:
        # .get() avoids a KeyError if the server omits the trace header.
        logger.info(
            f"Connected to WebSocket server, Logid: "
            f"{websocket.response.headers.get('x-tt-logid')}",
        )

        # Prepare request payload per the ws_binary protocol schema.
        request = {
            "app": {
                "appid": args.appid,
                "token": args.access_token,
                "cluster": cluster,
            },
            "user": {
                "uid": str(uuid.uuid4()),
            },
            "audio": {
                "voice_type": args.voice_type,
                "encoding": args.encoding,
            },
            "request": {
                "reqid": str(uuid.uuid4()),
                "text": args.text,
                "operation": "submit",
                "with_timestamp": "1",
                "extra_param": json.dumps(
                    {
                        "disable_markdown_filter": False,
                    }
                ),
            },
        }

        # Send request
        await full_client_request(websocket, json.dumps(request).encode())

        # Receive streamed audio frames until the server marks the last one
        # (negative sequence number).
        audio_data = bytearray()
        while True:
            msg = await receive_message(websocket)
            if msg.type == MsgType.FrontEndResultServer:
                # Timestamp/front-end metadata frames carry no audio; skip.
                continue
            elif msg.type == MsgType.AudioOnlyServer:
                audio_data.extend(msg.payload)
                if msg.sequence < 0:  # Last message
                    break
            else:
                raise RuntimeError(f"TTS conversion failed: {msg}")

        if not audio_data:
            raise RuntimeError("No audio data received")

        # Save audio next to the CWD, named after the voice and encoding.
        filename = f"{args.voice_type}.{args.encoding}"
        with open(filename, "wb") as f:
            f.write(audio_data)
        # Fix: original log message said "saved to (unknown)" — report the
        # actual output path and byte count.
        logger.info(f"Audio received: {len(audio_data)} bytes, saved to {filename}")

    logger.info("Connection closed")
if __name__ == "__main__":
import asyncio
asyncio.run(main())