# Mirror of https://github.com/osmarks/random-stuff
# Synced 2024-11-09 22:09:55 +00:00
# 93 lines, 2.7 KiB, Python
import wave, sys
|
|
import nltk
|
|
from balacoon_tts import TTS
|
|
from collections import namedtuple
|
|
import struct
|
|
from PIL import Image, ImageDraw
|
|
import cv2
|
|
import numpy
|
|
import subprocess
|
|
|
|
# Side length in pixels; used for both dimensions of the rendered text image
# and of the intermediate video frames.
WIDTH = 400

# Intermediate artefacts and final result.
AUDIO = "/tmp/x.wav"       # WAV file the narration is written to
VIDEO = "/tmp/x.avi"       # video of the rendered text frames
BACKDROP = "/tmp/x.mkv"    # first ffmpeg input, which the text video is overlaid on
OUTPUT = "/tmp/x.mp4"      # file ffmpeg writes the combined result to
|
|
|
|
def render_text(text: str):
    """Render *text* as word-wrapped black text on a white band.

    The text is split on whitespace and greedily packed into lines whose
    right edge (as measured by ``ImageDraw.textbbox``) stays below ``WIDTH``.
    A white rectangle covering the height of the laid-out lines is drawn
    first, then the lines on top of it; the rest of the image keeps the
    default fill of a new RGBA image.

    A single word that is wider than ``WIDTH`` is placed on a line of its
    own (and drawn past the right edge) instead of being retried forever.

    Args:
        text: The text to render. An empty string produces no text lines.

    Returns:
        A ``WIDTH`` x ``WIDTH`` RGBA ``PIL.Image.Image``.
    """
    render_params = {"font_size": 24}
    im = Image.new("RGBA", (WIDTH, WIDTH))
    draw = ImageDraw.Draw(im)

    # Reversed so that pop()/[-1] yield the words in reading order.
    pending = text.split()
    pending.reverse()

    y = 0
    text_commands = []
    while pending:
        # Always take one word, even if it alone is too wide: otherwise the
        # line would stay empty, y would not advance and the loop would
        # never terminate.
        words = [pending.pop()]
        while pending:
            candidate = " ".join(words + [pending[-1]])
            if draw.textbbox((0, y), candidate, **render_params)[2] >= WIDTH:
                break
            words.append(pending.pop())
        line = " ".join(words)
        bbox = draw.textbbox((0, y), line, **render_params)
        text_commands.append(((0, y), line))
        # The next line starts at the bottom edge of this one.
        y = bbox[3]

    # The background band is drawn first so the text ends up on top of it.
    draw.rectangle([0, 0, WIDTH, y], fill="white")
    for position, line in text_commands:
        draw.text(position, line, fill="black", **render_params)
    return im
|
|
|
|
# Marker for a stretch of silence; `length` is the value later handed to wavblank().
Pause = namedtuple("Pause", "length")
|
|
|
|
# Text to narrate. Read inside a context manager so the file handle is closed
# immediately instead of being left to the garbage collector.
with open("/home/osmarks/Downloads/seq1.txt") as source_file:
    text = source_file.read()

# Speech synthesiser loaded from a local balacoon addon file.
tts = TTS("/home/osmarks/Downloads/en_us_hifi_jets_cpu.addon")
# Uncomment to query the synthesiser for its speakers:
#supported_speakers = tts.get_speakers()
speaker = "6670"
|
|
|
|
def chunks(text: str) -> list[str | Pause]:
    """Split *text* into a flat sequence of sentences and pauses.

    Every non-empty line is split into sentences with ``nltk.sent_tokenize``.
    Each sentence is followed by ``Pause(0.5)``, and each non-empty line is
    closed with an additional ``Pause(1)``. Empty lines contribute nothing.
    """
    sequence = []
    for paragraph in text.splitlines():
        if not paragraph:
            continue
        for sentence in nltk.sent_tokenize(paragraph):
            sequence += [sentence, Pause(0.5)]
        sequence.append(Pause(1))
    return sequence
|
|
|
|
# Audio sampling rate as reported by the synthesiser; also used as the WAV frame rate.
RATE = tts.get_sampling_rate()
# Frame rate of the generated text video.
FPS = 30
|
|
|
|
def wavblank(seconds):
    """Return a ``bytes`` buffer of silence for a pause of *seconds*.

    The buffer is ``round(seconds * RATE / 2)`` zero-valued 16-bit samples,
    i.e. about ``seconds * RATE`` bytes in total.

    NOTE(review): at 2 bytes per frame this is roughly ``seconds / 2`` of
    audio, while the main loop divides ``len()`` of the buffer by ``RATE``;
    confirm whether the halving is intentional.
    """
    silent_sample = b"\x00\x00"  # one 16-bit zero sample
    sample_count = round(seconds * RATE / 2)
    return silent_sample * sample_count
|
|
|
|
# Slide shown during pauses; rendered once and reused.
blank_frame = render_text("")

fourcc = cv2.VideoWriter_fourcc(*"MJPG")
video_writer = cv2.VideoWriter(VIDEO, fourcc, FPS, (WIDTH, WIDTH))
total_dur = 0
frames_written = 0
try:
    with wave.open(AUDIO, "w") as fp:
        # 1 channel, 2 bytes per sample, RATE frames per second, no compression.
        fp.setparams((1, 2, RATE, 0, "NONE", "NONE"))
        for chunk in chunks(text):
            if isinstance(chunk, str):
                # presumably an array of 16-bit samples - confirm against balacoon_tts
                samples = tts.synthesize(chunk, speaker)
                image = render_text(chunk)
            elif isinstance(chunk, Pause):
                samples = wavblank(chunk.length)
                image = blank_frame
            else:
                # Fail loudly rather than silently reusing the previous samples/image.
                raise TypeError(f"unexpected chunk type: {type(chunk).__name__}")

            # Measure the duration from the audio frames actually written.
            # len(samples) counts bytes for the pause buffers but samples for
            # the synthesised speech, which made pauses take twice as long in
            # the video as in the audio track.
            audio_start = fp.tell()
            fp.writeframes(samples)
            duration = (fp.tell() - audio_start) / RATE
            total_dur += duration

            frame = cv2.cvtColor(numpy.array(image), cv2.COLOR_RGBA2BGR)
            # Write up to the cumulative frame target so that per-chunk
            # rounding errors cannot accumulate into audio/video drift.
            target_frames = round(total_dur * FPS)
            for _ in range(target_frames - frames_written):
                video_writer.write(frame)
            frames_written = target_frames
            print(chunk, duration)
finally:
    # Release the writer even if synthesis or rendering fails part-way.
    video_writer.release()

# Overlay the text video on the backdrop, cut to the narration length and
# encode with VAAPI; check=True raises CalledProcessError if ffmpeg fails.
subprocess.run([
    "ffmpeg",
    "-i", BACKDROP, "-i", AUDIO, "-i", VIDEO,
    "-filter_complex", "overlay=x=200:y=200,format=nv12,hwupload",
    "-to", str(total_dur),
    "-y",
    "-vaapi_device", "/dev/dri/renderD128", "-c:v", "h264_vaapi",
    OUTPUT
], check=True)