2022-09-09 00:13:02 +03:00

68 lines
1.7 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
import subprocess
import json
import textwrap
from webvtt import WebVTT, Caption
from vosk import Model, KaldiRecognizer, SetLogLevel
# Audio sample rate: passed to ffmpeg's "-ar" option in transcribe() and to
# the recognizer below, so both sides agree on the PCM stream's rate.
SAMPLE_RATE = 16000
# NOTE(review): not referenced anywhere in the visible code -- confirm whether
# it is still needed.
WORDS_PER_LINE = 7
# NOTE(review): presumably silences Vosk's own log output -- confirm against
# the Vosk documentation.
SetLogLevel(-1)
# Import-time side effect: the model and the shared recognizer are created as
# soon as this module is loaded, and transcribe() uses `rec` directly.
model = Model(lang="en-us")
rec = KaldiRecognizer(model, SAMPLE_RATE)
# transcribe() reads a "result" list of words with "start"/"end"/"word" keys
# from each recognizer result; presumably this call is what enables that
# per-word output -- TODO confirm.
rec.SetWords(True)
def timestring(seconds):
    """Format a non-negative duration in seconds as ``H:MM:SS.mmm``.

    The value is rounded to whole milliseconds *before* it is split into
    hours, minutes and seconds. Rounding only the seconds field afterwards
    would let a value such as 59.9996 render as ``0:00:60.000`` instead of
    carrying into the minutes field.

    Args:
        seconds: Offset from the start of the audio, in seconds (int or
            float).

    Returns:
        The formatted timestamp, e.g. ``"1:01:01.250"``. Hours are not
        zero-padded; minutes and seconds always use two digits and the
        fraction always uses three.
    """
    total_millis = int(round(seconds * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%i:%02i:%02i.%03i" % (hours, minutes, secs, millis)
def transcribe():
    """Transcribe the audio file named in ``sys.argv[1]`` into WebVTT captions.

    The input is piped through ``ffmpeg``, which is asked for raw
    single-channel ``s16le`` audio at ``SAMPLE_RATE``. That stream is fed to
    the module-level recognizer ``rec`` in 4000-byte chunks. Every recognizer
    result that carries a non-empty ``"result"`` word list becomes one
    caption, running from the first word's start to the last word's end.

    The captions are saved to ``sys.argv[2]`` when an output path was given;
    otherwise the WebVTT text is printed to standard output. If ffmpeg exits
    with a non-zero status, a warning is written to standard error and
    whatever was recognized up to that point is still emitted.
    """
    command = [
        "ffmpeg", "-nostdin", "-loglevel", "quiet", "-i", sys.argv[1],
        "-ar", str(SAMPLE_RATE), "-ac", "1", "-f", "s16le", "-",
    ]

    results = []
    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
        while True:
            data = process.stdout.read(4000)
            if not data:
                # An empty read means ffmpeg closed its end of the pipe.
                break
            if rec.AcceptWaveform(data):
                results.append(rec.Result())
    # Collect whatever the recognizer still holds after the last chunk.
    results.append(rec.FinalResult())

    # ffmpeg runs with "-loglevel quiet", so its exit status is the only sign
    # that the input was missing or could not be decoded. Warn instead of
    # aborting so a partial transcript is not thrown away.
    if process.returncode:
        print(
            "warning: ffmpeg exited with status {}".format(process.returncode),
            file=sys.stderr,
        )

    vtt = WebVTT()
    for res in results:
        words = json.loads(res).get("result")
        if not words:
            # Results without a word list contribute no caption.
            continue
        start = timestring(words[0]["start"])
        end = timestring(words[-1]["end"])
        content = " ".join(w["word"] for w in words)
        vtt.captions.append(Caption(start, end, textwrap.fill(content)))

    # Save to the requested file, or print the WebVTT text when none was given.
    if len(sys.argv) > 2:
        vtt.save(sys.argv[2])
    else:
        print(vtt.content)
if __name__ == "__main__":
    # Exactly one or two arguments are accepted: the audio file and an
    # optional output file.
    arg_count = len(sys.argv) - 1
    if arg_count not in (1, 2):
        usage = 'Usage: {} audiofile [output file]'.format(sys.argv[0])
        print(usage)
        sys.exit(1)
    transcribe()