Mirror of https://github.com/alphacep/vosk-api.git (synced 2026-01-09 07:32:34 +08:00)

Compare commits: go/v0.3.46 ... master (27 commits)
Commits in this range: 625e44c626, 488fc44d71, a428d65966, 056a2ad548, 07a0ddb467, 24f0e0cff8, 0f364e3a44, eabd80a848, 780ba2f0b7, 47509f7f9c, 4bf3370826, cf67ed6cd9, 0979c46766, eeab22ed98, a9f27eb11d, 1b308a3017, f5540085b5, c64c3daa3b, cc48ff9567, 7358c799b1, a7bf6a51e2, 72797111db, 40937b6bcb, 7da70c6107, 2426225d74, c4d32a2293, 6f7fe0e417.
@@ -10,6 +10,7 @@ add_library(vosk
     src/recognizer.cc
     src/spk_model.cc
     src/vosk_api.cc
+    src/postprocessor.cc
 )

 find_package(kaldi REQUIRED)
@@ -4,13 +4,13 @@ buildscript {
         mavenCentral()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:7.4.0'
-        classpath 'com.vanniktech:gradle-maven-publish-plugin:0.24.0'
+        classpath 'com.android.tools.build:gradle:8.13.0'
+        classpath 'com.vanniktech:gradle-maven-publish-plugin:0.34.0'
     }
 }

 allprojects {
-    version = '0.3.47'
+    version = '0.3.75'
 }

 subprojects {
@@ -24,7 +24,7 @@ subprojects {
 }

 mavenPublishing {
-    publishToMavenCentral(com.vanniktech.maven.publish.SonatypeHost.S01, false)
+    publishToMavenCentral()
     signAllPublications()
 }
@@ -29,7 +29,7 @@ set -x
 OS_NAME=`echo $(uname -s) | tr '[:upper:]' '[:lower:]'`
+ANDROID_TOOLCHAIN_PATH=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/${OS_NAME}-x86_64
 WORKDIR_BASE=`pwd`/build
-PATH=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/${OS_NAME}-x86_64/bin:$PATH
+PATH=$ANDROID_TOOLCHAIN_PATH/bin:$PATH
 OPENFST_VERSION=1.8.0

 for arch in armeabi-v7a arm64-v8a x86_64 x86; do
@@ -45,6 +45,7 @@ case $arch in
         CC=armv7a-linux-androideabi21-clang
         CXX=armv7a-linux-androideabi21-clang++
         ARCHFLAGS="-mfloat-abi=softfp -mfpu=neon"
+        PAGESIZE_LDFLAGS=""
         ;;
     arm64-v8a)
         BLAS_ARCH=ARMV8
@@ -54,6 +55,8 @@ case $arch in
         CC=aarch64-linux-android21-clang
         CXX=aarch64-linux-android21-clang++
         ARCHFLAGS=""
+        # Ensure compatibility with 16KiB page size devices
+        PAGESIZE_LDFLAGS="-Wl,-z,common-page-size=4096 -Wl,-z,max-page-size=16384"
         ;;
     x86_64)
         BLAS_ARCH=ATOM
@@ -63,6 +66,7 @@ case $arch in
         CC=x86_64-linux-android21-clang
         CXX=x86_64-linux-android21-clang++
         ARCHFLAGS=""
+        PAGESIZE_LDFLAGS=""
         ;;
     x86)
         BLAS_ARCH=ATOM
@@ -72,6 +76,7 @@ case $arch in
         CC=i686-linux-android21-clang
         CXX=i686-linux-android21-clang++
         ARCHFLAGS=""
+        PAGESIZE_LDFLAGS=""
         ;;
 esac

@@ -79,16 +84,16 @@ mkdir -p $WORKDIR/local/lib

 # openblas first
 cd $WORKDIR
-git clone -b v0.3.13 --single-branch https://github.com/xianyi/OpenBLAS
-make -C OpenBLAS TARGET=$BLAS_ARCH ONLY_CBLAS=1 AR=$AR CC=$CC HOSTCC=gcc ARM_SOFTFP_ABI=1 USE_THREAD=0 NUM_THREADS=1 -j4
+git clone -b v0.3.20 --single-branch https://github.com/xianyi/OpenBLAS
+make -C OpenBLAS TARGET=$BLAS_ARCH ONLY_CBLAS=1 AR=$AR CC=$CC HOSTCC=gcc ARM_SOFTFP_ABI=1 USE_THREAD=0 NUM_THREADS=1 -j 8
 make -C OpenBLAS install PREFIX=$WORKDIR/local

 # CLAPACK
 cd $WORKDIR
 git clone -b v3.2.1 --single-branch https://github.com/alphacep/clapack
 mkdir -p clapack/BUILD && cd clapack/BUILD
-cmake -DCMAKE_C_FLAGS=$ARCHFLAGS -DCMAKE_C_COMPILER_TARGET=$HOST \
-    -DCMAKE_C_COMPILER=$CC -DCMAKE_SYSTEM_NAME=Generic -DCMAKE_AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/${OS_NAME}-x86_64/bin/$AR \
+cmake -DCMAKE_C_FLAGS="$ARCHFLAGS" -DCMAKE_C_COMPILER_TARGET=$HOST \
+    -DCMAKE_C_COMPILER=$CC -DCMAKE_SYSTEM_NAME=Generic -DCMAKE_AR=$ANDROID_TOOLCHAIN_PATH/bin/$AR \
     -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \
     -DCMAKE_CROSSCOMPILING=True ..
 make -j 8 -C F2CLIBS/libf2c
@@ -118,7 +123,7 @@ CXX=$CXX AR=$AR RANLIB=$RANLIB CXXFLAGS="$ARCHFLAGS -O3 -DFST_NO_DYNAMIC_LINKING
     --fst-root=${WORKDIR}/local --fst-version=${OPENFST_VERSION}
 make -j 8 depend
 cd $WORKDIR/kaldi/src
-make -j 8 online2 lm rnnlm
+make -j 8 online2 rnnlm

 # Vosk-api
 cd $WORKDIR
@@ -129,7 +134,7 @@ make -j 8 -C ${WORKDIR_BASE}/../../../src \
     OPENFST_ROOT=${WORKDIR}/local \
     OPENBLAS_ROOT=${WORKDIR}/local \
     CXX=$CXX \
-    EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so"
+    EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so ${PAGESIZE_LDFLAGS}"
 cp $WORKDIR/vosk/libvosk.so $WORKDIR/../../src/main/jniLibs/$arch/libvosk.so

 done
@@ -4,14 +4,14 @@ def pomDescription = "Vosk speech recognition library for Android"

 android {
     namespace 'org.vosk'
-    compileSdkVersion 33
+    compileSdkVersion 36
     defaultConfig {
         minSdkVersion 21
-        targetSdkVersion 33
+        targetSdkVersion 36
         versionCode 10
         versionName = version
         archivesBaseName = archiveName
-        ndkVersion = "25.2.9519653"
+        ndkVersion = "28.2.13676358"
     }
     compileOptions {
         sourceCompatibility JavaVersion.VERSION_1_8
@@ -25,7 +25,7 @@ task buildVosk(type: Exec) {
 }

 dependencies {
-    api 'net.java.dev.jna:jna:5.13.0@aar'
+    api 'net.java.dev.jna:jna:5.18.1@aar'
 }

 //preBuild.dependsOn buildVosk
@@ -56,8 +56,18 @@ public class LibVosk {

     public static native void vosk_recognizer_reset(Pointer recognizer);

+    public static native void vosk_recognizer_set_endpointer_mode(Pointer recognizer, int mode);
+
+    public static native void vosk_recognizer_set_endpointer_delays(Pointer recognizer, float t_start_max, float t_end, float t_max);
+
     public static native void vosk_recognizer_free(Pointer recognizer);

+    public static native Pointer vosk_text_processor_new(String verbalizer, String tagger);
+
+    public static native void vosk_text_processor_free(Pointer processor);
+
+    public static native String vosk_text_processor_itn(Pointer processor, String input);
+
     /**
      * Set log level for Kaldi messages.
      *
@@ -236,6 +236,34 @@ public class Recognizer extends PointerType implements AutoCloseable {
         LibVosk.vosk_recognizer_reset(this.getPointer());
     }

+    /**
+     * Endpointer delay mode
+     */
+    public class EndpointerMode {
+        public static final int DEFAULT = 0;
+        public static final int SHORT = 1;
+        public static final int LONG = 2;
+        public static final int VERY_LONG = 3;
+    }
+
+    /**
+     * Configures endpointer mode for recognizer
+     */
+    public void setEndpointerMode(int mode) {
+        LibVosk.vosk_recognizer_set_endpointer_mode(this.getPointer(), mode);
+    }
+
+    /**
+     * Set endpointer delays
+     *
+     * @param t_start_max timeout for stopping recognition in case of initial silence (usually around 5.0)
+     * @param t_end timeout for stopping recognition in milliseconds after we recognized something (usually around 0.5 - 1.0)
+     * @param t_max timeout for forcing utterance end in milliseconds (usually around 20-30)
+     **/
+    public void setEndpointerDelays(float t_start_max, float t_end, float t_max) {
+        LibVosk.vosk_recognizer_set_endpointer_delays(this.getPointer(), t_start_max, t_end, t_max);
+    }
+
     /**
      * Releases recognizer object.
      * Underlying model is also unreferenced and if needed, released.
android/lib/src/main/java/org/vosk/TextProcessor.java (new file, 21 lines)
@@ -0,0 +1,21 @@
package org.vosk;

import com.sun.jna.PointerType;

public class TextProcessor extends PointerType implements AutoCloseable {
    public TextProcessor() {
    }

    public TextProcessor(String verbalizer, String tagger) {
        super(LibVosk.vosk_text_processor_new(verbalizer, tagger));
    }

    @Override
    public void close() {
        LibVosk.vosk_text_processor_free(this.getPointer());
    }

    public String itn(String input) {
        return LibVosk.vosk_text_processor_itn(this.getPointer(), input);
    }
}
@@ -4,10 +4,10 @@ def pomDescription = "Small English model for Android"

 android {
     namespace "org.vosk"
-    compileSdkVersion 33
+    compileSdkVersion 36
     defaultConfig {
         minSdkVersion 21
-        targetSdkVersion 33
+        targetSdkVersion 36
         versionCode 10
         versionName = version
         archivesBaseName = archiveName
@@ -28,6 +28,9 @@ public class VoskDemo
 {
     // Demo float array
     VoskRecognizer rec = new VoskRecognizer(model, 16000.0f);
+
+    rec.SetEndpointerMode(EndpointerMode.LONG);
+
     using(Stream source = File.OpenRead("test.wav")) {
         byte[] buffer = new byte[4096];
         int bytesRead;
@@ -2,7 +2,7 @@

 <PropertyGroup>
   <OutputType>Exe</OutputType>
-  <TargetFramework>net5.0</TargetFramework>
+  <TargetFramework>net8.0</TargetFramework>
   <RootNamespace>VoskDemo</RootNamespace>
 </PropertyGroup>

@@ -11,7 +11,7 @@
 </PropertyGroup>

 <ItemGroup>
-  <PackageReference Include="Vosk" Version="0.3.45" />
+  <PackageReference Include="Vosk" Version="0.3.75" />
 </ItemGroup>

 </Project>
csharp/nuget/Vosk.csproj (new file, 17 lines)
@@ -0,0 +1,17 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <PackageId>Vosk</PackageId>
    <Version>0.3.75</Version>
    <authors>Alpha Cephei Inc</authors>
    <owners>Alpha Cephei Inc</owners>
  </PropertyGroup>

  <Target Name="CopyFiles" AfterTargets="Build">
    <Copy SourceFiles="bin/Release/net8.0/Vosk.dll" DestinationFolder="lib/net8.0" />
  </Target>

</Project>
@@ -2,7 +2,7 @@
 <package>
   <metadata>
     <id>Vosk</id>
-    <version>0.3.45</version>
+    <version>0.3.75</version>
     <authors>Alpha Cephei Inc</authors>
     <owners>Alpha Cephei Inc</owners>
     <license type="expression">Apache-2.0</license>
@@ -23,10 +23,10 @@ Vosk scales from small devices like Raspberry Pi or Android smartphone to big cl
     <copyright>Copyright 2020-2050 Alpha Cephei Inc</copyright>
     <tags>speech recognition voice stt asr speech-to-text ai offline privacy</tags>
     <dependencies>
-      <group targetFramework=".NETStandard2.0"/>
+      <group targetFramework="net8.0"/>
     </dependencies>
   </metadata>
   <files>
-    <file src="**" exclude="src/*.cs;build.sh;**/.keep-me;*.nupkg" />
+    <file src="**" exclude="bin/**;obj/**;build.sh;src/*.cs;*.nupkg;**/.keep-me" />
   </files>
 </package>
@@ -1,2 +1,2 @@
-mcs -out:lib/netstandard2.0/Vosk.dll -target:library src/*.cs
-nuget pack
+rm -rf bin lib obj
+/home/shmyrev/local/dotnet/dotnet pack Vosk.csproj -p:NuspecFile=Vosk.nuspec -o .
@@ -65,6 +65,12 @@ class VoskPINVOKE {
  [global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint="vosk_recognizer_reset")]
  public static extern void VoskRecognizer_Reset(global::System.Runtime.InteropServices.HandleRef jarg1);

+  [global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint="vosk_recognizer_set_endpointer_mode")]
+  public static extern void VoskRecognizer_SetEndpointerMode(global::System.Runtime.InteropServices.HandleRef jarg1, int jarg2);
+
+  [global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint="vosk_recognizer_set_endpointer_delays")]
+  public static extern void VoskRecognizer_SetEndpointerDelays(global::System.Runtime.InteropServices.HandleRef jarg1, float jarg2, float jarg3, float jarg4);
+
  [global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint="vosk_set_log_level")]
  public static extern void SetLogLevel(int jarg1);

@@ -107,7 +113,6 @@ class VoskPINVOKE {
  [global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint = "vosk_batch_recognizer_get_pending_chunks")]
  public static extern int VoskBatchRecognizer_GetPendingChunks(global::System.Runtime.InteropServices.HandleRef jarg1);

-
 }
 }

 }
@@ -1,5 +1,12 @@
 namespace Vosk {

+public enum EndpointerMode {
+    DEFAULT = 0,
+    SHORT = 1,
+    LONG = 2,
+    VERY_LONG = 3
+}
+
 public class VoskRecognizer : System.IDisposable {
     private System.Runtime.InteropServices.HandleRef handle;

@@ -91,6 +98,14 @@ public class VoskRecognizer : System.IDisposable {
         VoskPINVOKE.VoskRecognizer_Reset(handle);
     }

+    public void SetEndpointerMode(EndpointerMode mode) {
+        VoskPINVOKE.VoskRecognizer_SetEndpointerMode(handle, (int) mode);
+    }
+
+    public void SetEndpointerDelays(float t_start_max, float t_end, float t_max) {
+        VoskPINVOKE.VoskRecognizer_SetEndpointerDelays(handle, t_start_max, t_end, t_max);
+    }
+
 }

 }
go/batch.go (new file, 99 lines)
@@ -0,0 +1,99 @@
package vosk

// #cgo CPPFLAGS: -I ${SRCDIR}/../src
// #cgo !windows LDFLAGS: -L ${SRCDIR}/../src -lvosk -ldl -lpthread
// #cgo windows LDFLAGS: -L ${SRCDIR}/../src -lvosk -lpthread
// #include <stdlib.h>
// #include <vosk_api.h>
import "C"
import "unsafe"

// VoskBatchModel contains a reference to the C VoskBatchModel
type VoskBatchModel struct {
    model *C.struct_VoskBatchModel
}

// NewBatchModel creates a new VoskBatchModel instance
func NewBatchModel(modelPath string) (*VoskBatchModel, error) {
    cmodelPath := C.CString(modelPath)
    defer C.free(unsafe.Pointer(cmodelPath))
    internal := C.vosk_batch_model_new(cmodelPath)
    model := &VoskBatchModel{model: internal}
    return model, nil
}

func (m *VoskBatchModel) Free() {
    C.vosk_batch_model_free(m.model)
}

func (m *VoskBatchModel) Wait() {
    C.vosk_batch_model_wait(m.model);
}

func freeBatchModel(model *VoskBatchModel) {
    C.vosk_batch_model_free(model.model)
}

// VoskBatchRecognizer contains a reference to the C VoskBatchRecognizer
type VoskBatchRecognizer struct {
    rec *C.struct_VoskBatchRecognizer
}

func freeBatchRecognizer(recognizer *VoskBatchRecognizer) {
    C.vosk_batch_recognizer_free(recognizer.rec)
}

func (r *VoskBatchRecognizer) Free() {
    C.vosk_batch_recognizer_free(r.rec)
}

// NewBatchRecognizer creates a new VoskBatchRecognizer instance
func NewBatchRecognizer(model *VoskBatchModel, sampleRate float64) (*VoskBatchRecognizer, error) {
    internal := C.vosk_batch_recognizer_new(model.model, C.float(sampleRate))
    rec := &VoskBatchRecognizer{rec: internal}
    return rec, nil
}

// AcceptWaveform accepts and processes a new chunk of the voice data.
func (r *VoskBatchRecognizer) AcceptWaveform(buffer []byte) {
    cbuf := C.CBytes(buffer)
    defer C.free(cbuf)
    C.vosk_batch_recognizer_accept_waveform(r.rec, (*C.char)(cbuf), C.int(len(buffer)))
}

/** Set NLSML output
 * @param nlsml - boolean value
 */
//void vosk_batch_recognizer_set_nlsml(VoskBatchRecognizer *recognizer, int nlsml);

func (r *VoskBatchRecognizer) SetNlsml(nlsml int) {
    C.vosk_batch_recognizer_set_nlsml(r.rec, C.int(nlsml))
}

/** Closes the stream */
//void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer);

func (r *VoskBatchRecognizer) FinishStream() {
    C.vosk_batch_recognizer_finish_stream(r.rec)
}

/** Return results */
//const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer);

func (r *VoskBatchRecognizer) FrontResult() string {
    return C.GoString(C.vosk_batch_recognizer_front_result(r.rec))
}

/** Release and free first retrieved result */
//void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer);

func (r *VoskBatchRecognizer) Pop() {
    C.vosk_batch_recognizer_pop(r.rec)
}

/** Get amount of pending chunks for more intelligent waiting */
//int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer);
func (r *VoskBatchRecognizer) GetPendingChunks() int {
    i := C.vosk_batch_recognizer_get_pending_chunks(r.rec)
    return int(i)
}
go/batch_example/README.md (new file, 5 lines)
@@ -0,0 +1,5 @@
This example expects a `s16le` converted audio file and converts it to text in a
manner that imitates the Python example of [test_gpu_batch.py](../python/example/test_gpu_batch.py).

Note that the `libvosk.so` must be in the library path. This was successfully tested on
Ubuntu 24.04 with Go 1.18, gcc-11, NVIDIA driver 570.172.08.
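As an illustration of the workflow this README describes, here is a minimal sketch; the ffmpeg invocation, file names, and library location are assumptions rather than part of the commit (only the 16 kHz rate comes from the test_batch.go example below):

```bash
# Convert a WAV file to raw signed 16-bit little-endian PCM, 16 kHz mono,
# which matches the 16000.0 sample rate the example passes to NewBatchRecognizer.
ffmpeg -i input.wav -f s16le -acodec pcm_s16le -ar 16000 -ac 1 test.s16le

# Make libvosk.so visible to the dynamic linker, then transcribe the file.
export LD_LIBRARY_PATH=/path/to/vosk-api/src:$LD_LIBRARY_PATH
go run test_batch.go -f test.s16le
```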
go/batch_example/test_batch.go (new file, 54 lines)
@@ -0,0 +1,54 @@
package main

import (
    "flag"
    "fmt"
    "io"
    "log"
    "os"

    vosk "github.com/alphacep/vosk-api/go"
)

func main() {
    var filename string
    flag.StringVar(&filename, "f", "", "file to transcribe")
    flag.Parse()

    vosk.GPUInit()

    model, err := vosk.NewBatchModel("model")
    if err != nil {
        log.Fatal(err)
    }

    rec, err := vosk.NewBatchRecognizer(model, 16000.0)
    if err != nil {
        log.Fatal(err)
    }

    file, err := os.Open(filename)
    if err != nil {
        panic(err)
    }
    defer file.Close()

    buf := make([]byte, 8000)

    for {
        if _, err := file.Read(buf); err != nil {
            if err != io.EOF {
                log.Fatal(err)
            }

            break
        }
        rec.AcceptWaveform(buf)
        model.Wait()
        if rec.FrontResult() != "" {
            fmt.Println(rec.FrontResult())
            rec.Pop()
        }
    }
    // Is this needed? rec.FinishStream()
}
@@ -16,6 +16,8 @@ func main() {
     flag.StringVar(&filename, "f", "", "file to transcribe")
     flag.Parse()

+    vosk.GPUInit()
+
     model, err := vosk.NewModel("model")
     if err != nil {
         log.Fatal(err)
go/vosk.go
@@ -125,6 +125,16 @@ func (r *VoskRecognizer) SetPartialWords(words int) {
     C.vosk_recognizer_set_partial_words(r.rec, C.int(words))
 }

+// SetEndpointerDelays sets the recognition timeouts, where startMax
+// is the timeout for stopping recognition in case of initial silence
+// (usually around 5), end is the timeout for stopping recognition
+// in milliseconds after we recognized something (usually around 0.5-1.0),
+// and max is the timeout for forcing utterance end in milliseconds
+// (usually around 20-30).
+func (r *VoskRecognizer) SetEndpointerDelays(startMax, end, max float64) {
+    C.vosk_recognizer_set_endpointer_delays(r.rec, C.float(startMax), C.float(end), C.float(max))
+}
+
 // AcceptWaveform accepts and processes a new chunk of the voice data.
 func (r *VoskRecognizer) AcceptWaveform(buffer []byte) int {
     cbuf := C.CBytes(buffer)
@@ -11,5 +11,5 @@ repositories {
 }

 dependencies {
-    implementation group: 'com.alphacephei', name: 'vosk', version: '0.3.45'
+    implementation group: 'com.alphacephei', name: 'vosk', version: '0.3.75'
 }
@@ -16,7 +16,7 @@ repositories {

 archivesBaseName = 'vosk'
 group = 'com.alphacephei'
-version = '0.3.45'
+version = '0.3.75'

 mavenPublish {
     group = 'com.alphacephei'
@@ -25,7 +25,7 @@ mavenPublish {
 }

 dependencies {
-    api group: 'net.java.dev.jna', name: 'jna', version: '5.13.0'
+    api group: 'net.java.dev.jna', name: 'jna', version: '5.18.1'
     testImplementation 'junit:junit:4.13'
 }
@@ -82,8 +82,18 @@ public class LibVosk {

     public static native void vosk_recognizer_reset(Pointer recognizer);

+    public static native void vosk_recognizer_set_endpointer_mode(Pointer recognizer, int mode);
+
+    public static native void vosk_recognizer_set_endpointer_delays(Pointer recognizer, float t_start_max, float t_end, float t_max);
+
     public static native void vosk_recognizer_free(Pointer recognizer);

+    public static native Pointer vosk_text_processor_new(String verbalizer, String tagger);
+
+    public static native void vosk_text_processor_free(Pointer processor);
+
+    public static native String vosk_text_processor_itn(Pointer processor, String input);
+
     /**
      * Set log level for Kaldi messages.
      *
@@ -236,6 +236,34 @@ public class Recognizer extends PointerType implements AutoCloseable {
         LibVosk.vosk_recognizer_reset(this.getPointer());
     }

+    /**
+     * Endpointer delay mode
+     */
+    public class EndpointerMode {
+        public static final int DEFAULT = 0;
+        public static final int SHORT = 1;
+        public static final int LONG = 2;
+        public static final int VERY_LONG = 3;
+    }
+
+    /**
+     * Configures endpointer mode for recognizer
+     */
+    public void setEndpointerMode(int mode) {
+        LibVosk.vosk_recognizer_set_endpointer_mode(this.getPointer(), mode);
+    }
+
+    /**
+     * Set endpointer delays
+     *
+     * @param t_start_max timeout for stopping recognition in case of initial silence (usually around 5.0)
+     * @param t_end timeout for stopping recognition in milliseconds after we recognized something (usually around 0.5 - 1.0)
+     * @param t_max timeout for forcing utterance end in milliseconds (usually around 20-30)
+     **/
+    public void setEndpointerDelays(float t_start_max, float t_end, float t_max) {
+        LibVosk.vosk_recognizer_set_endpointer_delays(this.getPointer(), t_start_max, t_end, t_max);
+    }
+
     /**
      * Releases recognizer object.
      * Underlying model is also unreferenced and if needed, released.
@@ -15,8 +15,10 @@ import javax.sound.sampled.UnsupportedAudioFileException;

 import org.vosk.LogLevel;
 import org.vosk.Recognizer;
+import org.vosk.Recognizer.EndpointerMode;
 import org.vosk.LibVosk;
 import org.vosk.Model;
+import org.vosk.TextProcessor;

 public class DecoderTest {

@@ -95,9 +97,24 @@ public class DecoderTest {
         Assert.assertTrue(true);
     }

+    @Test
+    public void decoderEndpointerDelays() throws IOException, UnsupportedAudioFileException {
+        try (Model model = new Model("model");
+                Recognizer recognizer = new Recognizer(model, 16000)) {
+            recognizer.setEndpointerMode(EndpointerMode.VERY_LONG);
+            recognizer.setEndpointerDelays(5.0f, 3.0f, 50.0f);
+        }
+        Assert.assertTrue(true);
+    }
+
     @Test(expected = IOException.class)
     public void decoderTestException() throws IOException {
         Model model = new Model("model_missing");
     }
+
+    @Test
+    public void testItn() throws IOException {
+        TextProcessor p = new TextProcessor("model/itn/en_itn_tagger.fst", "model/itn/en_itn_verbalizer.fst");
+        System.out.println(p.itn("as easy as one two three"));
+    }
 }
@@ -1,5 +1,6 @@
 import org.jetbrains.dokka.gradle.DokkaTask
-import org.jetbrains.kotlin.config.JvmTarget
+import org.jetbrains.kotlin.gradle.ExperimentalKotlinGradlePluginApi
+import org.jetbrains.kotlin.gradle.dsl.JvmTarget

 /*
  * Copyright 2020 Alpha Cephei Inc. & Doomsdayrs
@@ -18,15 +19,15 @@ import org.jetbrains.kotlin.config.JvmTarget
  */

 plugins {
-    kotlin("multiplatform") version "1.8.10"
+    kotlin("multiplatform") version "2.0.0"
     id("com.android.library")
     `maven-publish`
-    id("org.jetbrains.dokka") version "1.7.20"
-    kotlin("plugin.serialization") version "1.8.10"
+    id("org.jetbrains.dokka") version "1.9.20"
+    kotlin("plugin.serialization") version "2.0.0"
 }

 group = "com.alphacephei"
-version = "0.4.0-alpha0"
+version = "0.3.75"

 repositories {
     google()
@@ -67,9 +68,11 @@ fun org.jetbrains.kotlin.gradle.dsl.KotlinMultiplatformExtension.native(

 kotlin {
     jvm {
-        compilations.all {
-            kotlinOptions.jvmTarget = JvmTarget.JVM_11.description
+        @OptIn(ExperimentalKotlinGradlePluginApi::class)
+        compilerOptions {
+            jvmTarget.set(JvmTarget.JVM_17)
         }

         testRuns["test"].executionTask.configure {
             useJUnitPlatform()
             environment("MODEL", "VOSK_MODEL")
@@ -80,7 +83,7 @@ kotlin {
         }
     }

-    android {
+    androidTarget {
         publishAllLibraryVariants()
     }

@@ -102,6 +105,16 @@ kotlin {
         }
     }

+
+    @OptIn(ExperimentalKotlinGradlePluginApi::class)
+    applyDefaultHierarchyTemplate {
+        withJvm()
+        withAndroidTarget()
+
+        if (enableNative)
+            withNative()
+    }
+
     publishing {
         publications {
             withType<MavenPublication> {
@@ -130,13 +143,13 @@ kotlin {
         }
     }

-    val jna_version = "5.13.0"
-    val coroutines_version = "1.6.4"
+    val jna_version = "5.14.0"
+    val coroutines_version = "1.7.3"

     sourceSets {
         val commonMain by getting {
             dependencies {
-                api("org.jetbrains.kotlinx:kotlinx-serialization-json:1.4.1")
+                api("org.jetbrains.kotlinx:kotlinx-serialization-json:1.7.0")
                 api("org.jetbrains.kotlinx:kotlinx-coroutines-core:$coroutines_version")
             }
         }
@@ -161,7 +174,7 @@ kotlin {
                 api("net.java.dev.jna:jna:$jna_version@aar")
             }
         }
-        val androidTest by getting {
+        val androidUnitTest by getting {
             dependencies {
                 implementation("junit:junit:4.13.2")
             }
@@ -170,15 +183,16 @@ kotlin {
 }

 android {
-    compileSdk = 33
+    namespace = "com.alphacephei.library"
+    compileSdk = 34
     sourceSets["main"].manifest.srcFile("src/androidMain/AndroidManifest.xml")
     defaultConfig {
         minSdk = 24
-        targetSdk = 33
+        targetSdk = 34
     }
     compileOptions {
-        sourceCompatibility = JavaVersion.VERSION_11
-        targetCompatibility = JavaVersion.VERSION_11
+        sourceCompatibility = JavaVersion.VERSION_17
+        targetCompatibility = JavaVersion.VERSION_17
     }
     publishing {
         multipleVariants {

@@ -23,7 +23,7 @@ pluginManagement {
     resolutionStrategy {
         eachPlugin {
             if (requested.id.namespace == "com.android") {
-                useModule("com.android.tools.build:gradle:7.3.0")
+                useModule("com.android.tools.build:gradle:8.3.0")
             }
         }
     }
kotlin/src/commonMain/kotlin/org/vosk/EndPointerMode.kt (new file, 27 lines)
@@ -0,0 +1,27 @@
/*
 * Copyright 2024 Alpha Cephei Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.vosk

/**
 * VoskEpMode
 */
enum class EndPointerMode {
    ANSWER_DEFAULT,
    ANSWER_SHORT,
    ANSWER_LONG,
    ANSWER_VERY_LONG
}
@@ -151,6 +151,7 @@ expect class Recognizer : Freeable {
      * }],
      * </pre>
      *
+     * C equivalent = vosk_recognizer_set_words
      * @param words - boolean value
      */
     fun setOutputWordTimes(words: Boolean)
@@ -168,6 +169,23 @@ expect class Recognizer : Freeable {
      */
     fun setNLSML(nlsml: Boolean)

+
+    /**
+     * Set endpointer scaling factor
+     *
+     * @param mode Endpointer mode
+     **/
+    fun setEndPointerMode(mode: EndPointerMode)
+
+    /**
+     * Set endpointer delays
+     *
+     * @param tStartMax timeout for stopping recognition in case of initial silence (usually around 5.0)
+     * @param tEnd timeout for stopping recognition in milliseconds after we recognized something (usually around 0.5 - 1.0)
+     * @param tMax timeout for forcing utterance end in milliseconds (usually around 20-30)
+     **/
+    fun setEndPointerDelays(tStartMax: Float, tEnd: Float, tMax: Float)
+
     /**
      * Accept voice data
      *
kotlin/src/commonMain/kotlin/org/vosk/TextProcessor.kt (new file, 32 lines)
@@ -0,0 +1,32 @@
/*
 * Copyright 2024 Alpha Cephei Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.vosk

/**
 * Inverse text normalization
 *
 * @since 2024/06/19
 * @constructor Create text processor
 */
expect class TextProcessor constructor(tagger: Char, verbalizer: Char) : Freeable {

    /** Release text processor */
    override fun free()

    /** Convert string */
    fun itn(input: Char): Char
}
@@ -19,4 +19,4 @@ package org.vosk.exception
 /**
  * Internal common IO exception. On JVM this is just a type alias.
  */
-expect open class IOException(message: String? = null) : Exception
+expect open class IOException(message: String?) : Exception
@@ -42,14 +42,13 @@ internal object LibVosk {

     @Throws(IOException::class)
     private fun unpackDll(targetDir: File, lib: String) {
-        val source: InputStream =
-            Vosk::class.java.getResourceAsStream("/win32-x86-64/$lib.dll")!!
-
-        Files.copy(
-            source,
-            File(targetDir, "$lib.dll").toPath(),
-            StandardCopyOption.REPLACE_EXISTING
-        )
+        Vosk::class.java.getResourceAsStream("/win32-x86-64/$lib.dll")!!.use {
+            Files.copy(
+                it,
+                File(targetDir, "$lib.dll").toPath(),
+                StandardCopyOption.REPLACE_EXISTING
+            )
+        }
     }

     init {
@@ -57,6 +56,7 @@ internal object LibVosk {
         Platform.isAndroid() -> {
             Native.register(LibVosk::class.java, "vosk")
         }
+
         Platform.isWindows() -> {
             // We have to unpack dependencies
             try {
@@ -79,6 +79,7 @@ internal object LibVosk {
                 Native.register(LibVosk::class.java, "libvosk");
             }
         }
+
         else -> {
             Native.register(LibVosk::class.java, "vosk");
         }
@@ -194,4 +195,19 @@ internal object LibVosk {
     external fun vosk_batch_recognizer_pop(recognizer: BatchRecognizer)

     external fun vosk_batch_recognizer_get_pending_chunks(recognizer: BatchRecognizer): Int
+
+    external fun vosk_text_processor_new(tagger: Char, verbalizer: Char): Pointer
+
+    external fun vosk_text_processor_free(processor: TextProcessor)
+
+    external fun vosk_text_processor_itn(processor: TextProcessor, input: Char): Char
+
+    external fun vosk_recognizer_set_endpointer_mode(recognizer: Recognizer, ordinal: Int)
+
+    external fun vosk_recognizer_set_endpointer_delays(
+        recognizer: Recognizer,
+        tStartMax: Float,
+        tEnd: Float,
+        tMax: Float
+    )
 }
@@ -327,4 +327,27 @@ actual class Recognizer : Freeable, PointerType, AutoCloseable {
         free()
     }

+    /**
+     * Set endpointer scaling factor
+     *
+     * @param mode Endpointer mode
+     **/
+    actual fun setEndPointerMode(mode: EndPointerMode) {
+        LibVosk.vosk_recognizer_set_endpointer_mode(this, mode.ordinal)
+    }
+
+    /**
+     * Set endpointer delays
+     *
+     * @param tStartMax timeout for stopping recognition in case of initial silence (usually around 5.0)
+     * @param tEnd timeout for stopping recognition in milliseconds after we recognized something (usually around 0.5 - 1.0)
+     * @param tMax timeout for forcing utterance end in milliseconds (usually around 20-30)
+     **/
+    actual fun setEndPointerDelays(
+        tStartMax: Float,
+        tEnd: Float,
+        tMax: Float
+    ) {
+        LibVosk.vosk_recognizer_set_endpointer_delays(this, tStartMax, tEnd, tMax)
+    }
 }
kotlin/src/jvmMain/kotlin/org/vosk/TextProcessor.kt (new file, 51 lines)
@@ -0,0 +1,51 @@
/*
 * Copyright 2024 Alpha Cephei Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.vosk

import com.sun.jna.PointerType

/**
 * Inverse text normalization
 *
 * @since 2024/06/19
 */
actual class TextProcessor :
    Freeable, PointerType, AutoCloseable {

    /**
     * Create text processor
     */
    actual constructor(tagger: Char, verbalizer: Char) :
        super(LibVosk.vosk_text_processor_new(tagger, verbalizer))

    /** Release text processor */
    actual override fun free() {
        LibVosk.vosk_text_processor_free(this)
    }

    /** Convert string */
    actual fun itn(input: Char): Char =
        LibVosk.vosk_text_processor_itn(this, input)

    /**
     * @see free
     */
    override fun close() {
        free()
    }
}
@@ -22,4 +22,4 @@ linkerOpts.linux = \

 linkerOpts.linux_x64 = \
     -L/usr/lib64/ \
-    -L/usr/local/lib64/
+    -L/usr/local/lib64/
@@ -69,16 +69,18 @@ const vosk_recognizer_ptr = ref.refType(vosk_recognizer);

 let soname;
 if (os.platform() == 'win32') {
-    // Update path to load dependent dlls
-    let currentPath = process.env.Path;
-    let dllDirectory = path.resolve(path.join(__dirname, "lib", "win-x86_64"));
-    process.env.Path = dllDirectory + path.delimiter + currentPath;
+    // Update path to load dependent dlls
+    let currentPath = process.env.Path;
+    let dllDirectory = path.resolve(path.join(__dirname, 'lib', 'win-x86_64'));
+    process.env.Path = dllDirectory + path.delimiter + currentPath;

-    soname = path.join(__dirname, "lib", "win-x86_64", "libvosk.dll")
+    soname = path.join(__dirname, 'lib', 'win-x86_64', 'libvosk.dll');
 } else if (os.platform() == 'darwin') {
-    soname = path.join(__dirname, "lib", "osx-universal", "libvosk.dylib")
+    soname = path.join(__dirname, 'lib', 'osx-universal', 'libvosk.dylib');
+} else if (os.platform() == 'linux' && os.arch() == 'arm64') {
+    soname = path.join(__dirname, 'lib', 'linux-arm64', 'libvosk.so');
 } else {
-    soname = path.join(__dirname, "lib", "linux-x86_64", "libvosk.so")
+    soname = path.join(__dirname, 'lib', 'linux-x86_64', 'libvosk.so');
 }

 const libvosk = ffi.Library(soname, {
@@ -128,6 +130,9 @@ class Model {
      * @type {unknown}
      */
     this.handle = libvosk.vosk_model_new(modelPath);
+    if (!this.handle) {
+        throw new Error('Failed to create a model.');
+    }
 }

 /**
@@ -161,6 +166,9 @@ class SpeakerModel {
      * @type {unknown}
      */
     this.handle = libvosk.vosk_spk_model_new(modelPath);
+    if (!this.handle) {
+        throw new Error('Failed to create a speaker model.');
+    }
 }

 /**
@@ -235,6 +243,10 @@ class Recognizer {
         : hasOwnProperty(param, 'grammar')
             ? libvosk.vosk_recognizer_new_grm(model.handle, sampleRate, JSON.stringify(param.grammar))
             : libvosk.vosk_recognizer_new(model.handle, sampleRate);
+
+    if (!this.handle) {
+        throw new Error('Failed to create a recognizer.');
+    }
 }

 /**
@@ -1,6 +1,6 @@
 {
     "name": "vosk",
-    "version": "0.3.45",
+    "version": "0.3.75",
     "description": "Node binding for continuous offline voice recoginition with Vosk library.",
     "repository": {
         "type": "git",
python/example/test_itn.py (new executable file, 11 lines)
@@ -0,0 +1,11 @@
#!/usr/bin/env python3

import wave
import sys

from vosk import Processor

proc = Processor("ru_itn_tagger.fst", "ru_itn_verbalizer.fst")
print (proc.process("у нас десять яблок"))
print (proc.process("у нас десять яблок и десять миллилитров воды точка"))
print (proc.process("мы пришли в восемь часов пять минут"))
@@ -45,7 +45,7 @@ with open("README.md", "rb") as fh:

 setuptools.setup(
     name="vosk",
-    version="0.3.46",
+    version="0.3.75",
     author="Alpha Cephei Inc",
     author_email="contact@alphacephei.com",
     description="Offline open source speech recognition API based on Kaldi and Vosk",
@@ -28,7 +28,7 @@ def recognize(line):

 def main():
     p = Pool(8)
-    texts = p.map(recognize, open(sys.argv[1], encoding="uft-8").readlines())
+    texts = p.map(recognize, open(sys.argv[1], encoding="utf-8").readlines())
     print ("\n".join(texts))

 main()
@@ -287,3 +287,17 @@ class BatchRecognizer:

     def GetPendingChunks(self):
         return _c.vosk_batch_recognizer_get_pending_chunks(self._handle)
+
+class Processor:
+
+    def __init__(self, *args):
+        self._handle = _c.vosk_text_processor_new(args[0].encode('utf-8'), args[1].encode('utf-8'))
+
+        if self._handle == _ffi.NULL:
+            raise Exception("Failed to create processor")
+
+    def __del__(self):
+        _c.vosk_text_processor_free(self._handle)
+
+    def process(self, text):
+        return _ffi.string(_c.vosk_text_processor_itn(self._handle, text.encode('utf-8'))).decode('utf-8')
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name = "vosk"
-  s.version = "0.3.45"
+  s.version = "0.3.75"
   s.summary = "Offline speech recognition API"
   s.description = "Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto, Hindi, Czech, Polish. More to come."
   s.authors = ["Alpha Cephei Inc"]
@@ -23,17 +23,18 @@ VOSK_SOURCES= \
     language_model.cc \
     model.cc \
     spk_model.cc \
-    vosk_api.cc
+    vosk_api.cc \
+    postprocessor.cc

 VOSK_HEADERS= \
     recognizer.h \
     language_model.h \
     model.h \
     spk_model.h \
-    vosk_api.h
+    vosk_api.h \
+    postprocessor.h

-CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LINKING \
-    -I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS)
+CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LINKING -I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS)

 LDFLAGS=
@@ -46,9 +46,6 @@ void LanguageModelEstimator::AddCounts(const std::vector<int32> &sentence) {
 void LanguageModelEstimator::IncrementCount(const std::vector<int32> &history,
                                             int32 next_phone) {
   int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history);
-  if (lm_states_[lm_state_index].tot_count == 0) {
-    num_active_lm_states_++;
-  }
   lm_states_[lm_state_index].AddCount(next_phone, 1);
 }

@@ -106,6 +103,7 @@ int32 LanguageModelEstimator::FindOrCreateLmStateIndexForHistory(
     int32 backoff_lm_state = FindOrCreateLmStateIndexForHistory(backoff_hist);
     lm_states_[ans].backoff_lmstate_index = backoff_lm_state;
   }
+  num_active_lm_states_++;
   return ans;
 }

@@ -156,12 +154,13 @@ int32 LanguageModelEstimator::FindInitialFstState() const {

 void LanguageModelEstimator::OutputToFst(
     int32 num_states,
-    fst::StdVectorFst *fst) const {
+    fst::StdVectorFst *out_fst) const {
   KALDI_ASSERT(num_states == num_active_lm_states_);
-  fst->DeleteStates();
+  fst::StdVectorFst fst;

   for (int32 i = 0; i < num_states; i++)
-    fst->AddState();
-  fst->SetStart(FindInitialFstState());
+    fst.AddState();
+  fst.SetStart(FindInitialFstState());

   int64 tot_count = 0;
   double tot_logprob = 0.0;
@@ -184,28 +183,29 @@ void LanguageModelEstimator::OutputToFst(
       tot_count += count;
       tot_logprob += logprob * count;
       if (phone == 0) { // Go to final state
-        fst->SetFinal(lm_state.fst_state, fst::TropicalWeight(-logprob));
+        fst.SetFinal(lm_state.fst_state, fst::TropicalWeight(-logprob));
       } else { // It becomes a transition.
         std::vector<int32> next_history(lm_state.history);
         next_history.push_back(phone);
         int32 dest_lm_state = FindNonzeroLmStateIndexForHistory(next_history),
             dest_fst_state = lm_states_[dest_lm_state].fst_state;
         KALDI_ASSERT(dest_fst_state != -1);
-        fst->AddArc(lm_state.fst_state,
+        fst.AddArc(lm_state.fst_state,
                     fst::StdArc(phone, phone, fst::TropicalWeight(-logprob),
                                 dest_fst_state));
       }
     }
     if (lm_state.backoff_lmstate_index >= 0) {
-      fst->AddArc(lm_state.fst_state, fst::StdArc(0, 0, fst::TropicalWeight(-log(1 - opts_.discount)), lm_states_[lm_state.backoff_lmstate_index].fst_state));
+      fst.AddArc(lm_state.fst_state, fst::StdArc(0, 0, fst::TropicalWeight(-log(1 - opts_.discount)), lm_states_[lm_state.backoff_lmstate_index].fst_state));
     }
   }
-  fst::Connect(fst);
-  // Make sure that Connect does not delete any states.
-  int32 num_states_connected = fst->NumStates();
-  KALDI_ASSERT(num_states_connected == num_states);
+  fst::DeterminizeOptions<fst::StdArc> opts;
+  fst::Determinize(fst, out_fst, opts);
+  fst::Connect(out_fst);
   // arc-sort. ilabel or olabel doesn't matter, it's an acceptor.
-  fst::ArcSort(fst, fst::ILabelCompare<fst::StdArc>());
-  KALDI_LOG << "Created language model with " << num_states
-            << " states and " << fst::NumArcs(*fst) << " arcs.";
+  fst::ArcSort(out_fst, fst::ILabelCompare<fst::StdArc>());
+  KALDI_LOG << "Created language model with " << out_fst->NumStates()
+            << " states and " << fst::NumArcs(*out_fst) << " arcs.";
+  KALDI_LOG << "Originally language model with " << fst.NumStates()
+            << " states and " << fst::NumArcs(fst) << " arcs.";
 }
src/postprocessor.cc (new file, 66 lines)
@@ -0,0 +1,66 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "postprocessor.h"

using fst::TokenType;

Processor::Processor(const std::string& tagger_path,
                     const std::string& verbalizer_path) {
  tagger_.reset(StdVectorFst::Read(tagger_path));
  verbalizer_.reset(StdVectorFst::Read(verbalizer_path));
  compiler_ = std::make_shared<StringCompiler<StdArc>>(TokenType::BYTE);
  printer_ = std::make_shared<StringPrinter<StdArc>>(TokenType::BYTE);
}

std::string Processor::ShortestPath(const StdVectorFst& lattice) {
  StdVectorFst shortest_path;
  fst::ShortestPath(lattice, &shortest_path, 1, true);

  std::string output;
  printer_->operator()(shortest_path, &output);
  return output;
}

std::string Processor::Compose(const std::string& input,
                               const StdVectorFst* fst) {
  StdVectorFst input_fst;
  compiler_->operator()(input, &input_fst);

  StdVectorFst lattice;
  fst::Compose(input_fst, *fst, &lattice);
  return ShortestPath(lattice);
}

std::string Processor::Tag(const std::string& input) {
  if (input.empty()) {
    return "";
  }
  return Compose(input, tagger_.get());
}

std::string Processor::Verbalize(const std::string& input) {
  if (input.empty()) {
    return "";
  }
  std::string output = Compose(input, verbalizer_.get());
  output.erase(std::remove(output.begin(), output.end(), '\0'), output.end());
  return output;
}

std::string Processor::Normalize(const std::string& input) {
  return Verbalize(Tag(input));
}
src/postprocessor.h (new file, 45 lines)
@@ -0,0 +1,45 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PROCESSOR_WETEXT_PROCESSOR_H_
#define PROCESSOR_WETEXT_PROCESSOR_H_

#include <memory>
#include <string>

#include "fst/fstlib.h"

using fst::StdArc;
using fst::StdVectorFst;
using fst::StringCompiler;
using fst::StringPrinter;

class Processor {
 public:
  Processor(const std::string& tagger_path, const std::string& verbalizer_path);
  std::string Tag(const std::string& input);
  std::string Verbalize(const std::string& input);
  std::string Normalize(const std::string& input);

 private:
  std::string ShortestPath(const StdVectorFst& lattice);
  std::string Compose(const std::string& input, const StdVectorFst* fst);

  std::shared_ptr<StdVectorFst> tagger_ = nullptr;
  std::shared_ptr<StdVectorFst> verbalizer_ = nullptr;
  std::shared_ptr<StringCompiler<StdArc>> compiler_ = nullptr;
  std::shared_ptr<StringPrinter<StdArc>> printer_ = nullptr;
};

#endif  // PROCESSOR_WETEXT_PROCESSOR_H_
@@ -247,8 +247,8 @@ void Recognizer::SetEndpointerDelays(float t_start_max, float t_end, float t_max

     rule1 = t_start_max;
     rule2 = t_end;
-    rule3 = t_end * 1.5;
-    rule4 = t_end * 2;
+    rule3 = t_end + 0.5;
+    rule4 = t_end + 1.0;
     rule5 = t_max;

     KALDI_LOG << "Updating endpointer delays " << rule1 << "," << rule2 << "," << rule3 << "," << rule4 << "," << rule5;
@@ -275,7 +275,7 @@ void Recognizer::SetSpkModel(SpkModel *spk_model)
 void Recognizer::SetGrm(char const *grammar)
 {
     if (state_ == RECOGNIZER_RUNNING) {
-        KALDI_ERR << "Can't add speaker model to already running recognizer";
+        KALDI_ERR << "Can't add grammar to already running recognizer";
         return;
     }

@@ -355,6 +355,7 @@ void Recognizer::UpdateGrammarFst(char const *grammar)
         }
         estimator.AddCounts(sentence);
     }
+    delete g_fst_;
     g_fst_ = new StdVectorFst();
     estimator.Estimate(g_fst_);

@@ -698,6 +699,17 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
         obj["alternatives"].append(entry);
     }

+    if (spk_model_) {
+        Vector<BaseFloat> xvector;
+        int num_spk_frames;
+        if (GetSpkVector(xvector, &num_spk_frames)) {
+            for (int i = 0; i < xvector.Dim(); i++) {
+                obj["spk"].append(xvector(i));
+            }
+            obj["spk_frames"] = num_spk_frames;
+        }
+    }
+
     return StoreReturn(obj.dump());
 }
@@ -17,6 +17,7 @@
 #include "recognizer.h"
 #include "model.h"
 #include "spk_model.h"
+#include "postprocessor.h"

 #if HAVE_CUDA
 #include "cudamatrix/cu-device.h"
@@ -304,3 +305,28 @@ int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer)
     return 0;
 #endif
 }
+
+VoskTextProcessor *vosk_text_processor_new(const char *tagger, const char *verbalizer)
+{
+    try {
+        return (VoskTextProcessor *)new Processor(tagger, verbalizer);
+    } catch (...) {
+        return nullptr;
+    }
+}
+
+void vosk_text_processor_free(VoskTextProcessor *processor)
+{
+    delete ((Processor *)processor);
+}
+
+char *vosk_text_processor_itn(VoskTextProcessor *processor, const char *input)
+{
+    Processor *wprocessor = (Processor *)processor;
+    std::string sinput(input);
+
+    std::string tagged_text = wprocessor->Tag(sinput);
+    std::string normalized_text = wprocessor->Verbalize(tagged_text);
+
+    return strdup(normalized_text.c_str());
+}
@@ -39,6 +39,8 @@ typedef struct VoskSpkModel VoskSpkModel;
  *  speaker information and so on */
 typedef struct VoskRecognizer VoskRecognizer;

+/** Inverse text normalization */
+typedef struct VoskTextProcessor VoskTextProcessor;

 /**
  * Batch model object
@@ -376,6 +378,15 @@ void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer);
 /** Get amount of pending chunks for more intelligent waiting */
 int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer);

+/** Create text processor */
+VoskTextProcessor *vosk_text_processor_new(const char *tagger, const char *verbalizer);
+
+/** Release text processor */
+void vosk_text_processor_free(VoskTextProcessor *processor);
+
+/** Convert string */
+char *vosk_text_processor_itn(VoskTextProcessor *processor, const char *input);
+
 #ifdef __cplusplus
 }
 #endif
@ -1,3 +1,120 @@
|
||||
A proper simple setup to train a Vosk model
|
||||
# Vosk API Training
|
||||
|
||||
More documentation later
|
||||
This directory contains scripts and tools for training speech recognition models using the Kaldi toolkit.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Overview](#overview)
|
||||
2. [Directory Structure](#directory-structure)
|
||||
3. [Installation](#installation)
|
||||
4. [Training Process](#training-process)
|
||||
- [Data Preparation](#data-preparation)
|
||||
- [Dictionary Preparation](#dictionary-preparation)
|
||||
- [MFCC Feature Extraction](#mfcc-feature-extraction)
|
||||
- [Acoustic Model Training](#acoustic-model-training)
|
||||
- [TDNN Chain Model Training](#tdnn-chain-model-training)
|
||||
- [Decoding](#decoding)
|
||||
5. [Results](#results)
|
||||
6. [Contributing](#contributing)
|
||||
|
||||
## Overview
|
||||
|
||||
This repository provides tools for training custom speech recognition models using Kaldi. It supports acoustic model training, language model creation, and decoding pipelines.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```plaintext
|
||||
.
|
||||
├── cmd.sh # Command configuration for training and decoding
|
||||
├── conf/
|
||||
│ ├── mfcc.conf # Configuration for MFCC feature extraction
|
||||
│ └── online_cmvn.conf # Online Cepstral Mean Variance Normalization (currently empty)
|
||||
├── local/
|
||||
│ ├── chain/
|
||||
│ │ ├── run_ivector_common.sh # Script for i-vector extraction during chain model training
|
||||
│ │ └── run_tdnn.sh # Script for training a TDNN model
|
||||
│ ├── data_prep.sh # Data preparation script for creating Kaldi data directories
|
||||
│ ├── download_and_untar.sh # Script for downloading and extracting datasets
|
||||
│ ├── download_lm.sh # Downloads language models
|
||||
│ ├── prepare_dict.sh # Prepares the pronunciation dictionary
|
||||
│ └── score.sh # Scoring script for evaluation
|
||||
├── path.sh # Script for setting Kaldi paths
|
||||
├── RESULTS # Script for printing the best WER results
|
||||
├── RESULTS.txt # Contains WER results from decoding
|
||||
├── run.sh # Main script for the entire training pipeline
|
||||
├── steps -> ../../wsj/s5/steps/ # Link to Kaldi’s WSJ steps for acoustic model training
|
||||
└── utils -> ../../wsj/s5/utils/ # Link to Kaldi’s utility scripts
|
||||
```
|
||||
|
||||
### Key Files:

- **cmd.sh**: Defines commands for running training and decoding tasks.
- **path.sh**: Sets up paths for Kaldi binaries and scripts.
- **run.sh**: Main entry point for the training pipeline, running tasks in stages (see the stage-gating sketch below).
- **RESULTS**: Displays Word Error Rate (WER) for the trained models.
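
For orientation, Kaldi recipes normally gate each stage of `run.sh` behind a `--stage`/`--stop_stage` pair parsed by `utils/parse_options.sh`. The sketch below shows that common pattern; the stage body and script arguments are illustrative, not a copy of this recipe's `run.sh`.

```bash
#!/bin/bash
# Common Kaldi stage-gating pattern (illustrative sketch, not the actual run.sh)
stage=0
stop_stage=100

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh   # enables --stage / --stop_stage on the command line

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  echo "Stage 0: data preparation"
  # local/data_prep.sh <corpus-dir> data/train   # arguments depend on the corpus layout
fi
```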

## Installation

### Prerequisites

- [Kaldi](https://github.com/kaldi-asr/kaldi): the Kaldi toolkit must be installed and configured.
- Required tools: `ffmpeg`, `sox`, `sctk` for data preparation and scoring (see the install sketch below).
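
On Debian/Ubuntu the audio utilities are usually available as distribution packages, while `sctk` (the `sclite` scorer) is typically built from Kaldi's `tools/` directory. Package names below are illustrative:

```bash
# Illustrative: install the audio tools needed for data preparation
sudo apt-get update
sudo apt-get install -y ffmpeg sox
# sctk/sclite is normally built inside the Kaldi checkout, see $KALDI_ROOT/tools/
```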

### Steps

1. Clone the Vosk API repository.
2. Install Kaldi and ensure `KALDI_ROOT` is set correctly in `path.sh`.
3. Set up the environment using `cmd.sh` and `path.sh` (sketched below).
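
A minimal `path.sh`/`cmd.sh` pair for a single machine might look like the sketch below; the `KALDI_ROOT` value is an assumption and must point to your own Kaldi checkout:

```bash
# path.sh (sketch)
export KALDI_ROOT=/opt/kaldi                 # assumption: adjust to your Kaldi checkout
export PATH=$PWD/utils:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C

# cmd.sh (sketch): run everything locally; use queue.pl/slurm.pl instead on a cluster
export train_cmd="run.pl"
export decode_cmd="run.pl --mem 4G"
```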

## Training Process

### Data Preparation

Run the data preparation stage in `run.sh`:

```bash
bash run.sh --stage 0 --stop_stage 0
```

This stage downloads and prepares the LibriSpeech dataset.
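
After this stage, `local/data_prep.sh` should leave standard Kaldi data directories behind. The listing below shows the usual files; the `data/train` name is illustrative:

```bash
# Typical contents of a Kaldi data directory (illustrative)
ls data/train
# text      one line per utterance: "<utt-id> <transcript>"
# wav.scp   one line per utterance: "<utt-id> <wav path or pipe command>"
# utt2spk   maps each utterance to its speaker
# spk2utt   inverse map, usually built with utils/utt2spk_to_spk2utt.pl

# Sanity-check the directory before extracting features
utils/validate_data_dir.sh --no-feats data/train
```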

### Dictionary Preparation

Prepare the pronunciation dictionary with:

```bash
bash run.sh --stage 1 --stop_stage 1
```

This step generates the necessary files for Kaldi's `prepare_lang.sh` script.
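
`local/prepare_dict.sh` produces the dictionary directory that `utils/prepare_lang.sh` consumes. A sketch of the follow-up call; the `<UNK>` OOV symbol and directory names are assumptions:

```bash
# Files expected in the dictionary directory (produced by local/prepare_dict.sh)
ls data/local/dict
# lexicon.txt  nonsilence_phones.txt  silence_phones.txt  optional_silence.txt

# Build the lang directory from it (OOV symbol is an assumption)
utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang_tmp data/lang
```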

### MFCC Feature Extraction

Run the MFCC extraction process:

```bash
bash run.sh --stage 2 --stop_stage 2
```

This step extracts Mel-frequency cepstral coefficient (MFCC) features and computes cepstral mean and variance normalization (CMVN) statistics.
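
Feature extraction is controlled by `conf/mfcc.conf`. A minimal configuration for 16 kHz recordings could look like this; the values are assumptions, not the shipped file:

```bash
# Illustrative conf/mfcc.conf for 16 kHz audio
cat > conf/mfcc.conf <<'EOF'
--use-energy=false
--sample-frequency=16000
EOF
```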

### Acoustic Model Training

Train monophone, LDA+MLLT, and SAT models:

```bash
bash run.sh --stage 3 --stop_stage 3
```

This stage trains GMM-based models and aligns the data for TDNN training.
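
Internally this stage chains the standard Kaldi GMM scripts. The sequence below is a sketch; the numbers of leaves/Gaussians, job counts, and directory names are placeholders:

```bash
# Illustrative GMM pipeline (arguments are placeholders)
steps/train_mono.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/mono
steps/align_si.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/mono exp/mono_ali
steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 15000 data/train data/lang exp/mono_ali exp/tri1
steps/align_si.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali
steps/train_sat.sh --cmd "$train_cmd" 2500 15000 data/train data/lang exp/tri1_ali exp/tri2
steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/tri2 exp/tri2_ali
```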

### TDNN Chain Model Training

Train a Time-Delay Neural Network (TDNN) chain model:

```bash
bash run.sh --stage 4 --stop_stage 4
```

The chain model uses i-vectors for speaker adaptation.
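
This stage wraps the two chain scripts listed in the directory structure, roughly as follows; the option values are illustrative:

```bash
# Illustrative: i-vector extractor setup followed by TDNN chain training
local/chain/run_ivector_common.sh --stage 0 --train-set train
local/chain/run_tdnn.sh --stage 0
```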

### Decoding

After training, decode the test data:

```bash
bash run.sh --stage 5 --stop_stage 5
```

This step decodes using the trained model and evaluates the Word Error Rate (WER).
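
Under the hood, decoding a chain model typically builds a decoding graph and runs the nnet3 decoder. A sketch; the graph, lang, and i-vector directory names are assumptions:

```bash
# Illustrative chain-model decoding
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test exp/chain/tdnn exp/chain/tdnn/graph
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj 20 --cmd "$decode_cmd" \
  --online-ivector-dir exp/nnet3/ivectors_test \
  exp/chain/tdnn/graph data/test exp/chain/tdnn/decode_test
```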

## Results

WER can be evaluated by running:

```bash
bash RESULTS
```

Example of `RESULTS.txt`:

```plaintext
%WER 14.10 [ 2839 / 20138, 214 ins, 487 del, 2138 sub ] exp/chain/tdnn/decode_test/wer_11_0.0
%WER 12.67 [ 2552 / 20138, 215 ins, 406 del, 1931 sub ] exp/chain/tdnn/decode_test_rescore/wer_11_0.0
```
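
The `RESULTS` helper typically just picks the best WER from each decoding directory, along these lines (a sketch, not the shipped script):

```bash
# Illustrative: report the best WER per decode directory
for d in exp/chain/tdnn/decode_*; do
  grep WER $d/wer_* | utils/best_wer.sh
done
```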

@ -44,6 +44,7 @@ RUN cd /opt \
&& cd /opt/kaldi/src \
&& sed -i "s:TARGET_ARCH=\"\`uname -m\`\":TARGET_ARCH=$(echo $CROSS_TRIPLE|cut -d - -f 1):g" configure \
&& sed -i "s: -O1 : -O3 :g" makefiles/linux_openblas_arm.mk \
&& sed -i "s: -O1 : -O3 :g" makefiles/linux_openblas_arm.mk \
&& ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
&& make -j 10 online2 lm rnnlm \
&& make -j 10 online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;
@ -35,5 +35,5 @@ RUN cd /opt \
&& sed -i "s:TARGET_ARCH=\"\`uname -m\`\":TARGET_ARCH=$(echo $CROSS_TRIPLE|cut -d - -f 1):g" configure \
&& sed -i "s: -O1 : -O3 :g" makefiles/linux_openblas_arm.mk \
&& ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
&& make -j 10 online2 lm rnnlm \
&& make -j 10 online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;
travis/Dockerfile.dockcross-musl (Normal file, 50 lines)
@ -0,0 +1,50 @@
ARG DOCKCROSS_IMAGE=alphacep/dockcross-linux-armv7
FROM ${DOCKCROSS_IMAGE}

LABEL description="A docker image for building portable Python linux binary wheels and Kaldi on other architectures"
LABEL maintainer="contact@alphacephei.com"

RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
libffi-dev \
libpcre3-dev \
zlib1g-dev \
automake \
autoconf \
libtool \
cmake \
python3 \
python3-pip \
python3-wheel \
python3-setuptools \
python3-cffi \
&& rm -rf /var/lib/apt/lists/*

ARG OPENBLAS_ARGS=
RUN cd /opt \
&& git clone -b vosk --single-branch https://github.com/alphacep/kaldi \
&& cd kaldi/tools \
&& git clone -b v0.3.20 --single-branch https://github.com/xianyi/OpenBLAS \
&& git clone -b v3.2.1 --single-branch https://github.com/alphacep/clapack \
&& echo ${OPENBLAS_ARGS} \
&& make -C OpenBLAS ONLY_CBLAS=1 ${OPENBLAS_ARGS} HOSTCC=gcc USE_LOCKING=1 USE_THREAD=0 all \
&& make -C OpenBLAS ${OPENBLAS_ARGS} HOSTCC=gcc USE_LOCKING=1 USE_THREAD=0 PREFIX=$(pwd)/OpenBLAS/install install \
&& mkdir -p clapack/BUILD && cd clapack/BUILD && cmake .. \
&& make -j 10 -C F2CLIBS \
&& make -j 10 -C BLAS \
&& make -j 10 -C SRC \
&& find . -name "*.a" | xargs cp -t ../../OpenBLAS/install/lib \
&& cd /opt/kaldi/tools \
&& git clone --single-branch https://github.com/alphacep/openfst openfst \
&& cd openfst \
&& autoreconf -i \
&& CFLAGS="-g -O3" ./configure --prefix=/opt/kaldi/tools/openfst --enable-static --enable-shared --enable-far --enable-ngram-fsts --enable-lookahead-fsts --with-pic --disable-bin --host=${CROSS_TRIPLE} --build=x86-linux-gnu \
&& make -j 10 && make install \
&& cd /opt/kaldi/src \
&& sed -i "s:TARGET_ARCH=\"\`uname -m\`\":TARGET_ARCH=$(echo $CROSS_TRIPLE|cut -d - -f 1):g" configure \
&& sed -i "s: -O1 : -O3 :g" makefiles/linux_openblas_arm.mk \
&& sed -i "s:-DHAVE_EXECINFO_H=1::g" makefiles/linux_openblas_arm.mk \
&& ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
&& make -j 10 online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;
@ -30,5 +30,5 @@ RUN cd /opt \
&& ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
&& sed -i 's:-msse -msse2:-msse -msse2:g' kaldi.mk \
&& sed -i 's: -O1 : -O3 :g' kaldi.mk \
&& make -j $(nproc) online2 lm rnnlm \
&& make -j $(nproc) online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;
@ -27,5 +27,5 @@ RUN cd /opt \
&& ./configure --mathlib=MKL --shared --use-cuda=no \
&& sed -i 's:-msse -msse2:-msse -msse2 -mavx -mavx2:g' kaldi.mk \
&& sed -i 's: -O1 : -O3 :g' kaldi.mk \
&& make -j $(nproc) online2 lm rnnlm \
&& make -j $(nproc) online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;
@ -62,4 +62,4 @@ RUN cd /opt/kaldi \
--host=x86_64-w64-mingw32 --openblas-clapack-root=/opt/kaldi/local \
--fst-root=/opt/kaldi/local --fst-version=1.8.0 \
&& make depend -j \
&& make -j $(nproc) online2 lm rnnlm
&& make -j $(nproc) online2 rnnlm
@ -61,4 +61,4 @@ RUN cd /opt/kaldi \
--host=i686-w64-mingw32 --openblas-clapack-root=/opt/kaldi/local \
--fst-root=/opt/kaldi/local --fst-version=1.8.0 \
&& make depend -j \
&& make -j $(nproc) online2 lm rnnlm
&& make -j $(nproc) online2 rnnlm
travis/Dockerfile.winaarch64 (Normal file, 70 lines)
@ -0,0 +1,70 @@
FROM ubuntu:20.04

RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y --no-install-recommends \
ca-certificates \
g++ \
bzip2 \
unzip \
make \
wget \
git \
python3 \
python3-pip \
python3-wheel \
python3-setuptools \
python3-cffi \
zlib1g-dev \
patch \
cmake \
xz-utils \
automake \
autoconf \
libtool \
pkg-config \
sudo \
&& rm -rf /var/lib/apt/lists/*

RUN cd /opt && \
wget https://github.com/mstorsjo/llvm-mingw/releases/download/20240820/llvm-mingw-20240820-msvcrt-ubuntu-20.04-x86_64.tar.xz \
&& tar xf llvm-mingw-20240820-msvcrt-ubuntu-20.04-x86_64.tar.xz \
&& mv llvm-mingw-20240820-msvcrt-ubuntu-20.04-x86_64 llvm-mingw

ENV PATH="$PATH:/opt/llvm-mingw/bin"

RUN mkdir /opt/kaldi \
&& git clone https://github.com/alphacep/openfst \
&& cd openfst \
&& autoreconf -i \
&& CXX=aarch64-w64-mingw32-g++ CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" \
./configure --prefix=/opt/kaldi/local \
--enable-shared --enable-static --with-pic --disable-bin \
--enable-lookahead-fsts --enable-ngram-fsts --host=aarch64-w64-mingw32 \
&& make -j $(nproc) \
&& make install

RUN cd /opt/kaldi \
&& git clone -b v0.3.20 --single-branch https://github.com/xianyi/OpenBLAS \
&& cd OpenBLAS \
&& make HOSTCC=gcc CC=aarch64-w64-mingw32-gcc ONLY_CBLAS=1 USE_LOCKING=1 USE_THREAD=0 USE_OPENMP=0 DYNAMIC_ARCH=0 TARGET=ARMV8 ARCH=arm64 BINARY=64 -j $(nproc) \
&& make PREFIX=/opt/kaldi/local install

RUN cd /opt/kaldi \
&& git clone -b v3.2.1 --single-branch https://github.com/alphacep/clapack \
&& mkdir clapack/BUILD \
&& cd clapack/BUILD \
&& cmake -DCMAKE_C_COMPILER_TARGET=aarch64-w64-mingw32 -DCMAKE_C_COMPILER=aarch64-w64-mingw32-gcc -DCMAKE_SYSTEM_NAME=Windows -DCMAKE_CROSSCOMPILING=True .. \
&& make -C F2CLIBS/libf2c \
&& make -C BLAS \
&& make -C SRC \
&& find . -name *.a -exec cp {} /opt/kaldi/local/lib \;

RUN cd /opt/kaldi \
&& git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \
&& cd kaldi/src \
&& CXX=aarch64-w64-mingw32-g++ CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \
--mathlib=OPENBLAS_CLAPACK \
--host=aarch64-w64-mingw32 --openblas-clapack-root=/opt/kaldi/local \
--fst-root=/opt/kaldi/local --fst-version=1.8.0 \
&& make depend -j \
&& make LLVM_BUILD=1 -j $(nproc) online2 rnnlm
travis/build-dockcross-musl.sh (Executable file, 7 lines)
@ -0,0 +1,7 @@
#!/bin/bash

set -e
set -x

docker build --build-arg="DOCKCROSS_IMAGE=dockcross/linux-armv7l-musl" --build-arg="OPENBLAS_ARGS=TARGET=ARMV7" --file Dockerfile.dockcross-musl --tag alphacep/kaldi-dockcross-armv7-musl:latest .
docker run --rm -v /home/shmyrev/travis/vosk-api/:/io alphacep/kaldi-dockcross-armv7-musl /io/travis/build-wheels-dockcross.sh
travis/build-docker-winaarch64.sh (Executable file, 5 lines)
@ -0,0 +1,5 @@
#!/bin/bash

set -e -x
docker build --file Dockerfile.winaarch64 --tag alphacep/kaldi-winaarch64:latest .
docker run --rm -v `realpath ..`:/io alphacep/kaldi-winaarch64 /io/travis/build-wheels-winaarch64.sh
@ -14,6 +14,11 @@ case $CROSS_TRIPLE in
export VOSK_MACHINE=armv7l
export VOSK_ARCHITECTURE=32bit
;;
*armv7l-linux-musleabihf*)
export VOSK_MACHINE=armv7l
export VOSK_ARCHITECTURE=32bit
export VOSK_VARIANT="-musl"
;;
*i686-*)
export VOSK_MACHINE=x86
export VOSK_ARCHITECTURE=32bit
@ -29,9 +34,9 @@ case $CROSS_TRIPLE in
esac

# Copy library to output folder
mkdir -p /io/wheelhouse/vosk-linux-$VOSK_MACHINE
cp /opt/vosk-api/src/*.so /opt/vosk-api/src/vosk_api.h /io/wheelhouse/vosk-linux-$VOSK_MACHINE
mkdir -p /io/wheelhouse/vosk-linux-${VOSK_MACHINE}${VOSK_VARIANT}
cp /opt/vosk-api/src/*.so /opt/vosk-api/src/vosk_api.h /io/wheelhouse/vosk-linux-$VOSK_MACHINE${VOSK_VARIANT}

# Build wheel
python3 -m pip install requests tqdm srt websockets wheel
python3 -m pip install requests tqdm srt websockets wheel --break-system-packages
python3 -m pip wheel /opt/vosk-api/python --no-deps -w /io/wheelhouse
travis/build-wheels-winaarch64.sh (Executable file, 18 lines)
@ -0,0 +1,18 @@
#!/bin/bash
set -e -x

# Build libvosk
cd /opt
git clone https://github.com/alphacep/vosk-api
cd vosk-api/src
EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=aarch64-w64-mingw32-g++ EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc)

# Copy dlls to output folder
mkdir -p /io/wheelhouse/vosk-winaarch64
cp /opt/vosk-api/src/*.{dll,lib} /opt/vosk-api/src/vosk_api.h /io/wheelhouse/vosk-winaarch64

# Build wheel and put to the output folder
export VOSK_SOURCE=/opt/vosk-api
export VOSK_SYSTEM=Windows
export VOSK_ARCHITECTURE=64bit
python3 -m pip -v wheel /opt/vosk-api/python --no-deps -w /io/wheelhouse
@ -1,6 +1,6 @@
{
  "name": "vosk-js",
  "version": "0.3.45",
  "version": "0.3.75",
  "description": "Node binding for continuous voice recognition through vosk-api.",
  "repository": {
    "type": "git",