Compare commits

...

27 Commits

Author SHA1 Message Date
Nickolay Shmyrev
625e44c626 Bump JNA requirements to 5.18.1 and update version 2025-12-08 21:47:46 +01:00
Nickolay Shmyrev
488fc44d71 Add missing file to git, fixes #2001 2025-10-24 13:10:51 +02:00
Adam Jaso
a428d65966
Add Golang GPU Batch API and example (#1983)
Co-authored-by: ajaso <github@ajaso.net>
2025-09-10 16:13:51 +03:00
Nickolay Shmyrev
056a2ad548 Updates for recent android 2025-09-03 09:46:07 +02:00
Nickolay Shmyrev
07a0ddb467 Add endpointer mode to java and csharp API 2025-09-03 07:14:40 +02:00
resignedScientist
24f0e0cff8
add 16 KB page size support (#1976)
Co-authored-by: Norman Laudien <kontakt@norman-laudien.de>
2025-08-25 02:08:01 +03:00
Utkarsh Gual
0f364e3a44
Fix typo in encoding in transcribe_scp.py (#1955)
Fix typo in encoding in transcribe_scp.py. uft-8 -> utf-8
2025-07-11 23:02:00 +03:00
Nickolay Shmyrev
eabd80a848 Support MUSL variant in packaging 2025-05-01 11:20:40 +02:00
Nickolay Shmyrev
780ba2f0b7 Add MUSL build 2025-05-01 11:06:56 +02:00
Nickolay Shmyrev
47509f7f9c Properly rebuilt fst 2025-05-01 09:05:17 +02:00
Md Husain Thekiya
4bf3370826
Fix: Raise exception in Node.js binding when model creation fails (#716) (#1714) 2025-03-05 19:16:56 +03:00
Matt Kenefick
cf67ed6cd9
Support ARM 64 in NodeJS (#1655)
Identify and support ARM64 in the NodeJS vosk-api
2024-11-14 00:23:16 +03:00
Nickolay Shmyrev
0979c46766 Add speaker vector to n-best results. Fixes issue #1647 2024-10-24 15:34:05 +02:00
Nickolay V. Shmyrev
eeab22ed98
Added postprocessor. Fixes #1641 2024-10-11 00:05:08 +03:00
Paschalis M
a9f27eb11d
Update README.md with training details (#1637) 2024-09-26 16:29:42 +03:00
Nickolay Shmyrev
1b308a3017 Added Win ARM64 build 2024-08-23 21:36:40 +02:00
Nickolay Shmyrev
f5540085b5 Add text processor wrapper for java 2024-08-22 03:13:44 +02:00
Clocks
c64c3daa3b
Kotlin updates (#1596)
* Add TextProcessor.kt

* Add Recognizer end pointer functions

* Properly close input stream

Mirror of 5d09ee8

* Bump android compile sdk to 34

* Update everything

All dependencies, etc
2024-06-19 22:22:50 +03:00
Nickolay Shmyrev
cc48ff9567 Properly count active states 2024-06-17 07:24:51 +02:00
johngebbie
7358c799b1
go: Add SetEndpointerDelays (#1570) 2024-05-06 08:33:26 +03:00
Nickolay Shmyrev
a7bf6a51e2 Bump version 2024-04-22 14:39:57 +02:00
Nickolay Shmyrev
72797111db Fix timeouts for endpointer 2024-04-22 14:29:39 +02:00
__Rylex__
40937b6bcb
Kaldi remove lm target because rnnlm has lm (#1543)
* Update Dockerfile.win

* Update Dockerfile.dockcross

* Update Dockerfile.manylinux

* Update Dockerfile.dockcross-manylinux

* Update Dockerfile.manylinux-mkl

* Update Dockerfile.win32

* Update build-vosk.sh
2024-04-01 15:58:09 +03:00
Nickolay Shmyrev
7da70c6107 Add test for ITN 2024-03-29 12:57:01 +01:00
Nickolay Shmyrev
2426225d74 Add postprocessor 2024-03-29 12:44:54 +01:00
Nickolay V. Shmyrev
c4d32a2293
Inverse text normalization with FSTs (#1545)
* Add ITN from Wetext
2024-03-29 14:32:05 +03:00
__Rylex__
6f7fe0e417
fix typo (#1519) 2024-02-21 11:00:19 +03:00
64 changed files with 1085 additions and 103 deletions

View File

@ -10,6 +10,7 @@ add_library(vosk
src/recognizer.cc
src/spk_model.cc
src/vosk_api.cc
src/postprocessor.cc
)
find_package(kaldi REQUIRED)

View File

@ -4,13 +4,13 @@ buildscript {
mavenCentral()
}
dependencies {
classpath 'com.android.tools.build:gradle:7.4.0'
classpath 'com.vanniktech:gradle-maven-publish-plugin:0.24.0'
classpath 'com.android.tools.build:gradle:8.13.0'
classpath 'com.vanniktech:gradle-maven-publish-plugin:0.34.0'
}
}
allprojects {
version = '0.3.47'
version = '0.3.75'
}
subprojects {
@ -24,7 +24,7 @@ subprojects {
}
mavenPublishing {
publishToMavenCentral(com.vanniktech.maven.publish.SonatypeHost.S01, false)
publishToMavenCentral()
signAllPublications()
}

View File

@ -29,7 +29,7 @@ set -x
OS_NAME=`echo $(uname -s) | tr '[:upper:]' '[:lower:]'`
ANDROID_TOOLCHAIN_PATH=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/${OS_NAME}-x86_64
WORKDIR_BASE=`pwd`/build
PATH=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/${OS_NAME}-x86_64/bin:$PATH
PATH=$ANDROID_TOOLCHAIN_PATH/bin:$PATH
OPENFST_VERSION=1.8.0
for arch in armeabi-v7a arm64-v8a x86_64 x86; do
@ -45,6 +45,7 @@ case $arch in
CC=armv7a-linux-androideabi21-clang
CXX=armv7a-linux-androideabi21-clang++
ARCHFLAGS="-mfloat-abi=softfp -mfpu=neon"
PAGESIZE_LDFLAGS=""
;;
arm64-v8a)
BLAS_ARCH=ARMV8
@ -54,6 +55,8 @@ case $arch in
CC=aarch64-linux-android21-clang
CXX=aarch64-linux-android21-clang++
ARCHFLAGS=""
# Ensure compatibility with 16KiB page size devices
PAGESIZE_LDFLAGS="-Wl,-z,common-page-size=4096 -Wl,-z,max-page-size=16384"
;;
x86_64)
BLAS_ARCH=ATOM
@ -63,6 +66,7 @@ case $arch in
CC=x86_64-linux-android21-clang
CXX=x86_64-linux-android21-clang++
ARCHFLAGS=""
PAGESIZE_LDFLAGS=""
;;
x86)
BLAS_ARCH=ATOM
@ -72,6 +76,7 @@ case $arch in
CC=i686-linux-android21-clang
CXX=i686-linux-android21-clang++
ARCHFLAGS=""
PAGESIZE_LDFLAGS=""
;;
esac
@ -79,16 +84,16 @@ mkdir -p $WORKDIR/local/lib
# openblas first
cd $WORKDIR
git clone -b v0.3.13 --single-branch https://github.com/xianyi/OpenBLAS
make -C OpenBLAS TARGET=$BLAS_ARCH ONLY_CBLAS=1 AR=$AR CC=$CC HOSTCC=gcc ARM_SOFTFP_ABI=1 USE_THREAD=0 NUM_THREADS=1 -j4
git clone -b v0.3.20 --single-branch https://github.com/xianyi/OpenBLAS
make -C OpenBLAS TARGET=$BLAS_ARCH ONLY_CBLAS=1 AR=$AR CC=$CC HOSTCC=gcc ARM_SOFTFP_ABI=1 USE_THREAD=0 NUM_THREADS=1 -j 8
make -C OpenBLAS install PREFIX=$WORKDIR/local
# CLAPACK
cd $WORKDIR
git clone -b v3.2.1 --single-branch https://github.com/alphacep/clapack
mkdir -p clapack/BUILD && cd clapack/BUILD
cmake -DCMAKE_C_FLAGS=$ARCHFLAGS -DCMAKE_C_COMPILER_TARGET=$HOST \
-DCMAKE_C_COMPILER=$CC -DCMAKE_SYSTEM_NAME=Generic -DCMAKE_AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/${OS_NAME}-x86_64/bin/$AR \
cmake -DCMAKE_C_FLAGS="$ARCHFLAGS" -DCMAKE_C_COMPILER_TARGET=$HOST \
-DCMAKE_C_COMPILER=$CC -DCMAKE_SYSTEM_NAME=Generic -DCMAKE_AR=$ANDROID_TOOLCHAIN_PATH/bin/$AR \
-DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \
-DCMAKE_CROSSCOMPILING=True ..
make -j 8 -C F2CLIBS/libf2c
@ -118,7 +123,7 @@ CXX=$CXX AR=$AR RANLIB=$RANLIB CXXFLAGS="$ARCHFLAGS -O3 -DFST_NO_DYNAMIC_LINKING
--fst-root=${WORKDIR}/local --fst-version=${OPENFST_VERSION}
make -j 8 depend
cd $WORKDIR/kaldi/src
make -j 8 online2 lm rnnlm
make -j 8 online2 rnnlm
# Vosk-api
cd $WORKDIR
@ -129,7 +134,7 @@ make -j 8 -C ${WORKDIR_BASE}/../../../src \
OPENFST_ROOT=${WORKDIR}/local \
OPENBLAS_ROOT=${WORKDIR}/local \
CXX=$CXX \
EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so"
EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so ${PAGESIZE_LDFLAGS}"
cp $WORKDIR/vosk/libvosk.so $WORKDIR/../../src/main/jniLibs/$arch/libvosk.so
done

View File

@ -4,14 +4,14 @@ def pomDescription = "Vosk speech recognition library for Android"
android {
namespace 'org.vosk'
compileSdkVersion 33
compileSdkVersion 36
defaultConfig {
minSdkVersion 21
targetSdkVersion 33
targetSdkVersion 36
versionCode 10
versionName = version
archivesBaseName = archiveName
ndkVersion = "25.2.9519653"
ndkVersion = "28.2.13676358"
}
compileOptions {
sourceCompatibility JavaVersion.VERSION_1_8
@ -25,7 +25,7 @@ task buildVosk(type: Exec) {
}
dependencies {
api 'net.java.dev.jna:jna:5.13.0@aar'
api 'net.java.dev.jna:jna:5.18.1@aar'
}
//preBuild.dependsOn buildVosk

View File

@ -56,8 +56,18 @@ public class LibVosk {
public static native void vosk_recognizer_reset(Pointer recognizer);
public static native void vosk_recognizer_set_endpointer_mode(Pointer recognizer, int mode);
public static native void vosk_recognizer_set_endpointer_delays(Pointer recognizer, float t_start_max, float t_end, float t_max);
public static native void vosk_recognizer_free(Pointer recognizer);
public static native Pointer vosk_text_processor_new(String tagger, String verbalizer);
public static native void vosk_text_processor_free(Pointer processor);
public static native String vosk_text_processor_itn(Pointer processor, String input);
/**
* Set log level for Kaldi messages.
*

View File

@ -236,6 +236,34 @@ public class Recognizer extends PointerType implements AutoCloseable {
LibVosk.vosk_recognizer_reset(this.getPointer());
}
/**
* Endpointer delay mode
*/
public class EndpointerMode {
public static final int DEFAULT = 0;
public static final int SHORT = 1;
public static final int LONG = 2;
public static final int VERY_LONG = 3;
}
/**
* Configures endpointer mode for recognizer
*/
public void setEndpointerMode(int mode) {
LibVosk.vosk_recognizer_set_endpointer_mode(this.getPointer(), mode);
}
/**
* Set endpointer delays
*
* @param t_start_max timeout for stopping recognition in case of initial silence (usually around 5.0)
* @param t_end timeout for stopping recognition in seconds after we recognized something (usually around 0.5 - 1.0)
* @param t_max timeout for forcing utterance end in seconds (usually around 20-30)
**/
public void setEndpointerDelays(float t_start_max, float t_end, float t_max) {
LibVosk.vosk_recognizer_set_endpointer_delays(this.getPointer(), t_start_max, t_end, t_max);
}
/**
* Releases recognizer object.
* Underlying model is also unreferenced and if needed, released.

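The new endpointer API is easiest to see end to end. Below is a minimal, hedged Java sketch of the intended usage; the model path is a placeholder and the audio loop is elided, but the calls mirror the test added to DecoderTest further down in this diff:

```java
import org.vosk.Model;
import org.vosk.Recognizer;
import org.vosk.Recognizer.EndpointerMode;

public class EndpointerDemo {
    public static void main(String[] args) throws Exception {
        try (Model model = new Model("model");                 // placeholder path
             Recognizer rec = new Recognizer(model, 16000.0f)) {
            // Preset: wait longer before closing an utterance (e.g. dictation).
            rec.setEndpointerMode(EndpointerMode.LONG);
            // Or tune the raw delays, all in seconds:
            // 5 s initial-silence timeout, 1 s trailing-silence timeout,
            // 30 s hard utterance limit.
            rec.setEndpointerDelays(5.0f, 1.0f, 30.0f);
            // ... feed audio chunks to the recognizer as usual ...
        }
    }
}
```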
View File

@ -0,0 +1,21 @@
package org.vosk;
import com.sun.jna.PointerType;
public class TextProcessor extends PointerType implements AutoCloseable {
public TextProcessor() {
}
public TextProcessor(String tagger, String verbalizer) {
super(LibVosk.vosk_text_processor_new(tagger, verbalizer));
}
@Override
public void close() {
LibVosk.vosk_text_processor_free(this.getPointer());
}
public String itn(String input) {
return LibVosk.vosk_text_processor_itn(this.getPointer(), input);
}
}

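A short usage sketch of this wrapper (hedged: the FST paths below are illustrative and taken from the ITN test added to DecoderTest later in this diff; the tagger comes first, then the verbalizer, matching the C API):

```java
import org.vosk.TextProcessor;

public class ItnDemo {
    public static void main(String[] args) {
        // Paths are placeholders borrowed from the DecoderTest example.
        try (TextProcessor tp = new TextProcessor(
                "model/itn/en_itn_tagger.fst",
                "model/itn/en_itn_verbalizer.fst")) {
            // Inverse text normalization: spoken-form text in, written-form text out.
            System.out.println(tp.itn("as easy as one two three"));
        }
    }
}
```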
View File

@ -4,10 +4,10 @@ def pomDescription = "Small English model for Android"
android {
namespace "org.vosk"
compileSdkVersion 33
compileSdkVersion 36
defaultConfig {
minSdkVersion 21
targetSdkVersion 33
targetSdkVersion 36
versionCode 10
versionName = version
archivesBaseName = archiveName

View File

@ -28,6 +28,9 @@ public class VoskDemo
{
// Demo float array
VoskRecognizer rec = new VoskRecognizer(model, 16000.0f);
rec.SetEndpointerMode(EndpointerMode.LONG);
using(Stream source = File.OpenRead("test.wav")) {
byte[] buffer = new byte[4096];
int bytesRead;

View File

@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net5.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>VoskDemo</RootNamespace>
</PropertyGroup>
@ -11,7 +11,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Vosk" Version="0.3.45" />
<PackageReference Include="Vosk" Version="0.3.75" />
</ItemGroup>
</Project>

csharp/nuget/Vosk.csproj Normal file
View File

@ -0,0 +1,17 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<PackageId>Vosk</PackageId>
<Version>0.3.75</Version>
<authors>Alpha Cephei Inc</authors>
<owners>Alpha Cephei Inc</owners>
</PropertyGroup>
<Target Name="CopyFiles" AfterTargets="Build">
<Copy SourceFiles="bin/Release/net8.0/Vosk.dll" DestinationFolder="lib/net8.0" />
</Target>
</Project>

View File

@ -2,7 +2,7 @@
<package>
<metadata>
<id>Vosk</id>
<version>0.3.45</version>
<version>0.3.75</version>
<authors>Alpha Cephei Inc</authors>
<owners>Alpha Cephei Inc</owners>
<license type="expression">Apache-2.0</license>
@ -23,10 +23,10 @@ Vosk scales from small devices like Raspberry Pi or Android smartphone to big cl
<copyright>Copyright 2020-2050 Alpha Cephei Inc</copyright>
<tags>speech recognition voice stt asr speech-to-text ai offline privacy</tags>
<dependencies>
<group targetFramework=".NETStandard2.0"/>
<group targetFramework="net8.0"/>
</dependencies>
</metadata>
<files>
<file src="**" exclude="src/*.cs;build.sh;**/.keep-me;*.nupkg" />
<file src="**" exclude="bin/**;obj/**;build.sh;src/*.cs;*.nupkg;**/.keep-me" />
</files>
</package>

View File

@ -1,2 +1,2 @@
mcs -out:lib/netstandard2.0/Vosk.dll -target:library src/*.cs
nuget pack
rm -rf bin lib obj
/home/shmyrev/local/dotnet/dotnet pack Vosk.csproj -p:NuspecFile=Vosk.nuspec -o .

View File

@ -65,6 +65,12 @@ class VoskPINVOKE {
[global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint="vosk_recognizer_reset")]
public static extern void VoskRecognizer_Reset(global::System.Runtime.InteropServices.HandleRef jarg1);
[global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint="vosk_recognizer_set_endpointer_mode")]
public static extern void VoskRecognizer_SetEndpointerMode(global::System.Runtime.InteropServices.HandleRef jarg1, int jarg2);
[global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint="vosk_recognizer_set_endpointer_delays")]
public static extern void VoskRecognizer_SetEndpointerDelays(global::System.Runtime.InteropServices.HandleRef jarg1, float jarg2, float jarg3, float jarg4);
[global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint="vosk_set_log_level")]
public static extern void SetLogLevel(int jarg1);
@ -107,7 +113,6 @@ class VoskPINVOKE {
[global::System.Runtime.InteropServices.DllImport("libvosk", EntryPoint = "vosk_batch_recognizer_get_pending_chunks")]
public static extern int VoskBatchRecognizer_GetPendingChunks(global::System.Runtime.InteropServices.HandleRef jarg1);
}
}
}

View File

@ -1,5 +1,12 @@
namespace Vosk {
public enum EndpointerMode {
DEFAULT = 0,
SHORT = 1,
LONG = 2,
VERY_LONG = 3
}
public class VoskRecognizer : System.IDisposable {
private System.Runtime.InteropServices.HandleRef handle;
@ -91,6 +98,14 @@ public class VoskRecognizer : System.IDisposable {
VoskPINVOKE.VoskRecognizer_Reset(handle);
}
public void SetEndpointerMode(EndpointerMode mode) {
VoskPINVOKE.VoskRecognizer_SetEndpointerMode(handle, (int) mode);
}
public void SetEndpointerDelays(float t_start_max, float t_end, float t_max) {
VoskPINVOKE.VoskRecognizer_SetEndpointerDelays(handle, t_start_max, t_end, t_max);
}
}
}

go/batch.go Normal file
View File

@ -0,0 +1,99 @@
package vosk
// #cgo CPPFLAGS: -I ${SRCDIR}/../src
// #cgo !windows LDFLAGS: -L ${SRCDIR}/../src -lvosk -ldl -lpthread
// #cgo windows LDFLAGS: -L ${SRCDIR}/../src -lvosk -lpthread
// #include <stdlib.h>
// #include <vosk_api.h>
import "C"
import "unsafe"
// VoskBatchModel contains a reference to the C VoskBatchModel
type VoskBatchModel struct {
model *C.struct_VoskBatchModel
}
// NewBatchModel creates a new VoskBatchModel instance
func NewBatchModel(modelPath string) (*VoskBatchModel, error) {
cmodelPath := C.CString(modelPath)
defer C.free(unsafe.Pointer(cmodelPath))
internal := C.vosk_batch_model_new(cmodelPath)
model := &VoskBatchModel{model: internal}
return model, nil
}
func (m *VoskBatchModel) Free() {
C.vosk_batch_model_free(m.model)
}
func (m *VoskBatchModel) Wait() {
C.vosk_batch_model_wait(m.model)
}
func freeBatchModel(model *VoskBatchModel) {
C.vosk_batch_model_free(model.model)
}
// VoskBatchRecognizer contains a reference to the C VoskBatchRecognizer
type VoskBatchRecognizer struct {
rec *C.struct_VoskBatchRecognizer
}
func freeBatchRecognizer(recognizer *VoskBatchRecognizer) {
C.vosk_batch_recognizer_free(recognizer.rec)
}
func (r *VoskBatchRecognizer) Free() {
C.vosk_batch_recognizer_free(r.rec)
}
// NewBatchRecognizer creates a new VoskBatchRecognizer instance
func NewBatchRecognizer(model *VoskBatchModel, sampleRate float64) (*VoskBatchRecognizer, error) {
internal := C.vosk_batch_recognizer_new(model.model, C.float(sampleRate))
rec := &VoskBatchRecognizer{rec: internal}
return rec, nil
}
// AcceptWaveform accepts and processes a new chunk of the voice data.
func (r *VoskBatchRecognizer) AcceptWaveform(buffer []byte) {
cbuf := C.CBytes(buffer)
defer C.free(cbuf)
C.vosk_batch_recognizer_accept_waveform(r.rec, (*C.char)(cbuf), C.int(len(buffer)))
}
/** Set NLSML output
* @param nlsml - boolean value
*/
//void vosk_batch_recognizer_set_nlsml(VoskBatchRecognizer *recognizer, int nlsml);
func (r *VoskBatchRecognizer) SetNlsml(nlsml int) {
C.vosk_batch_recognizer_set_nlsml(r.rec, C.int(nlsml))
}
/** Closes the stream */
//void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer);
func (r *VoskBatchRecognizer) FinishStream() {
C.vosk_batch_recognizer_finish_stream(r.rec)
}
/** Return results */
//const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer);
func (r *VoskBatchRecognizer) FrontResult() string {
return C.GoString(C.vosk_batch_recognizer_front_result(r.rec))
}
/** Release and free first retrieved result */
//void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer);
func (r *VoskBatchRecognizer) Pop() {
C.vosk_batch_recognizer_pop(r.rec)
}
/** Get amount of pending chunks for more intelligent waiting */
//int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer);
func (r *VoskBatchRecognizer) GetPendingChunks() int {
i := C.vosk_batch_recognizer_get_pending_chunks(r.rec)
return int(i)
}

View File

@ -0,0 +1,5 @@
This example expects an `s16le`-converted audio file and transcribes it in a
manner that imitates the Python example [test_gpu_batch.py](../python/example/test_gpu_batch.py).
Note that `libvosk.so` must be in the library path. This was successfully tested on
Ubuntu 24.04 with Go 1.18, gcc-11, and NVIDIA driver 570.172.08.

View File

@ -0,0 +1,54 @@
package main
import (
"flag"
"fmt"
"io"
"log"
"os"
vosk "github.com/alphacep/vosk-api/go"
)
func main() {
var filename string
flag.StringVar(&filename, "f", "", "file to transcribe")
flag.Parse()
vosk.GPUInit()
model, err := vosk.NewBatchModel("model")
if err != nil {
log.Fatal(err)
}
rec, err := vosk.NewBatchRecognizer(model, 16000.0)
if err != nil {
log.Fatal(err)
}
file, err := os.Open(filename)
if err != nil {
panic(err)
}
defer file.Close()
buf := make([]byte, 8000)
for {
n, err := file.Read(buf)
if err != nil {
if err != io.EOF {
log.Fatal(err)
}
break
}
// Pass only the bytes actually read; a short final read would
// otherwise resubmit stale data from the previous iteration.
rec.AcceptWaveform(buf[:n])
model.Wait()
if rec.FrontResult() != "" {
fmt.Println(rec.FrontResult())
rec.Pop()
}
}
// Is this needed? rec.FinishStream()
}

View File

@ -16,6 +16,8 @@ func main() {
flag.StringVar(&filename, "f", "", "file to transcribe")
flag.Parse()
vosk.GPUInit()
model, err := vosk.NewModel("model")
if err != nil {
log.Fatal(err)

View File

@ -125,6 +125,16 @@ func (r *VoskRecognizer) SetPartialWords(words int) {
C.vosk_recognizer_set_partial_words(r.rec, C.int(words))
}
// SetEndpointerDelays sets the recognition timeouts, where startMax
// is the timeout for stopping recognition in case of initial silence
// (usually around 5), end is the timeout for stopping recognition
// in seconds after we recognized something (usually around 0.5-1.0),
// and max is the timeout for forcing utterance end in seconds
// (usually around 20-30).
func (r *VoskRecognizer) SetEndpointerDelays(startMax, end, max float64) {
C.vosk_recognizer_set_endpointer_delays(r.rec, C.float(startMax), C.float(end), C.float(max))
}
// AcceptWaveform accepts and processes a new chunk of the voice data.
func (r *VoskRecognizer) AcceptWaveform(buffer []byte) int {
cbuf := C.CBytes(buffer)

View File

@ -11,5 +11,5 @@ repositories {
}
dependencies {
implementation group: 'com.alphacephei', name: 'vosk', version: '0.3.45'
implementation group: 'com.alphacephei', name: 'vosk', version: '0.3.75'
}

View File

@ -16,7 +16,7 @@ repositories {
archivesBaseName = 'vosk'
group = 'com.alphacephei'
version = '0.3.45'
version = '0.3.75'
mavenPublish {
group = 'com.alphacephei'
@ -25,7 +25,7 @@ mavenPublish {
}
dependencies {
api group: 'net.java.dev.jna', name: 'jna', version: '5.13.0'
api group: 'net.java.dev.jna', name: 'jna', version: '5.18.1'
testImplementation 'junit:junit:4.13'
}

View File

@ -82,8 +82,18 @@ public class LibVosk {
public static native void vosk_recognizer_reset(Pointer recognizer);
public static native void vosk_recognizer_set_endpointer_mode(Pointer recognizer, int mode);
public static native void vosk_recognizer_set_endpointer_delays(Pointer recognizer, float t_start_max, float t_end, float t_max);
public static native void vosk_recognizer_free(Pointer recognizer);
public static native Pointer vosk_text_processor_new(String verbalizer, String tagger);
public static native void vosk_text_processor_free(Pointer processor);
public static native String vosk_text_processor_itn(Pointer processor, String input);
/**
* Set log level for Kaldi messages.
*

View File

@ -236,6 +236,34 @@ public class Recognizer extends PointerType implements AutoCloseable {
LibVosk.vosk_recognizer_reset(this.getPointer());
}
/**
* Endpointer delay mode
*/
public class EndpointerMode {
public static final int DEFAULT = 0;
public static final int SHORT = 1;
public static final int LONG = 2;
public static final int VERY_LONG = 3;
}
/**
* Configures endpointer mode for recognizer
*/
public void setEndpointerMode(int mode) {
LibVosk.vosk_recognizer_set_endpointer_mode(this.getPointer(), mode);
}
/**
* Set endpointer delays
*
* @param t_start_max timeout for stopping recognition in case of initial silence (usually around 5.0)
* @param t_end timeout for stopping recognition in seconds after we recognized something (usually around 0.5 - 1.0)
* @param t_max timeout for forcing utterance end in seconds (usually around 20-30)
**/
public void setEndpointerDelays(float t_start_max, float t_end, float t_max) {
LibVosk.vosk_recognizer_set_endpointer_delays(this.getPointer(), t_start_max, t_end, t_max);
}
/**
* Releases recognizer object.
* Underlying model is also unreferenced and if needed, released.

View File

@ -15,8 +15,10 @@ import javax.sound.sampled.UnsupportedAudioFileException;
import org.vosk.LogLevel;
import org.vosk.Recognizer;
import org.vosk.Recognizer.EndpointerMode;
import org.vosk.LibVosk;
import org.vosk.Model;
import org.vosk.TextProcessor;
public class DecoderTest {
@ -95,9 +97,24 @@ public class DecoderTest {
Assert.assertTrue(true);
}
@Test
public void decoderEndpointerDelays() throws IOException, UnsupportedAudioFileException {
try (Model model = new Model("model");
Recognizer recognizer = new Recognizer(model, 16000)) {
recognizer.setEndpointerMode(EndpointerMode.VERY_LONG);
recognizer.setEndpointerDelays(5.0f, 3.0f, 50.0f);
}
Assert.assertTrue(true);
}
@Test(expected = IOException.class)
public void decoderTestException() throws IOException {
Model model = new Model("model_missing");
}
@Test
public void testItn() throws IOException {
TextProcessor p = new TextProcessor("model/itn/en_itn_tagger.fst", "model/itn/en_itn_verbalizer.fst");
System.out.println(p.itn("as easy as one two three"));
}
}

View File

@ -1,5 +1,6 @@
import org.jetbrains.dokka.gradle.DokkaTask
import org.jetbrains.kotlin.config.JvmTarget
import org.jetbrains.kotlin.gradle.ExperimentalKotlinGradlePluginApi
import org.jetbrains.kotlin.gradle.dsl.JvmTarget
/*
* Copyright 2020 Alpha Cephei Inc. & Doomsdayrs
@ -18,15 +19,15 @@ import org.jetbrains.kotlin.config.JvmTarget
*/
plugins {
kotlin("multiplatform") version "1.8.10"
kotlin("multiplatform") version "2.0.0"
id("com.android.library")
`maven-publish`
id("org.jetbrains.dokka") version "1.7.20"
kotlin("plugin.serialization") version "1.8.10"
id("org.jetbrains.dokka") version "1.9.20"
kotlin("plugin.serialization") version "2.0.0"
}
group = "com.alphacephei"
version = "0.4.0-alpha0"
version = "0.3.75"
repositories {
google()
@ -67,9 +68,11 @@ fun org.jetbrains.kotlin.gradle.dsl.KotlinMultiplatformExtension.native(
kotlin {
jvm {
compilations.all {
kotlinOptions.jvmTarget = JvmTarget.JVM_11.description
@OptIn(ExperimentalKotlinGradlePluginApi::class)
compilerOptions {
jvmTarget.set(JvmTarget.JVM_17)
}
testRuns["test"].executionTask.configure {
useJUnitPlatform()
environment("MODEL", "VOSK_MODEL")
@ -80,7 +83,7 @@ kotlin {
}
}
android {
androidTarget {
publishAllLibraryVariants()
}
@ -102,6 +105,16 @@ kotlin {
}
}
@OptIn(ExperimentalKotlinGradlePluginApi::class)
applyDefaultHierarchyTemplate {
withJvm()
withAndroidTarget()
if (enableNative)
withNative()
}
publishing {
publications {
withType<MavenPublication> {
@ -130,13 +143,13 @@ kotlin {
}
}
val jna_version = "5.13.0"
val coroutines_version = "1.6.4"
val jna_version = "5.14.0"
val coroutines_version = "1.7.3"
sourceSets {
val commonMain by getting {
dependencies {
api("org.jetbrains.kotlinx:kotlinx-serialization-json:1.4.1")
api("org.jetbrains.kotlinx:kotlinx-serialization-json:1.7.0")
api("org.jetbrains.kotlinx:kotlinx-coroutines-core:$coroutines_version")
}
}
@ -161,7 +174,7 @@ kotlin {
api("net.java.dev.jna:jna:$jna_version@aar")
}
}
val androidTest by getting {
val androidUnitTest by getting {
dependencies {
implementation("junit:junit:4.13.2")
}
@ -170,15 +183,16 @@ kotlin {
}
android {
compileSdk = 33
namespace = "com.alphacephei.library"
compileSdk = 34
sourceSets["main"].manifest.srcFile("src/androidMain/AndroidManifest.xml")
defaultConfig {
minSdk = 24
targetSdk = 33
targetSdk = 34
}
compileOptions {
sourceCompatibility = JavaVersion.VERSION_11
targetCompatibility = JavaVersion.VERSION_11
sourceCompatibility = JavaVersion.VERSION_17
targetCompatibility = JavaVersion.VERSION_17
}
publishing {
multipleVariants {

View File

@ -23,7 +23,7 @@ pluginManagement {
resolutionStrategy {
eachPlugin {
if (requested.id.namespace == "com.android") {
useModule("com.android.tools.build:gradle:7.3.0")
useModule("com.android.tools.build:gradle:8.3.0")
}
}
}

View File

@ -0,0 +1,27 @@
/*
* Copyright 2024 Alpha Cephei Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.vosk
/**
* VoskEpMode
*/
enum class EndPointerMode {
ANSWER_DEFAULT,
ANSWER_SHORT,
ANSWER_LONG,
ANSWER_VERY_LONG
}

View File

@ -151,6 +151,7 @@ expect class Recognizer : Freeable {
* }],
* </pre>
*
* C equivalent = vosk_recognizer_set_words
* @param words - boolean value
*/
fun setOutputWordTimes(words: Boolean)
@ -168,6 +169,23 @@ expect class Recognizer : Freeable {
*/
fun setNLSML(nlsml: Boolean)
/**
* Set endpointer scaling factor
*
* @param mode Endpointer mode
**/
fun setEndPointerMode(mode: EndPointerMode)
/**
* Set endpointer delays
*
* @param tStartMax timeout for stopping recognition in case of initial silence (usually around 5.0)
* @param tEnd timeout for stopping recognition in seconds after we recognized something (usually around 0.5 - 1.0)
* @param tMax timeout for forcing utterance end in seconds (usually around 20-30)
**/
fun setEndPointerDelays(tStartMax: Float, tEnd: Float, tMax: Float)
/**
* Accept voice data
*

View File

@ -0,0 +1,32 @@
/*
* Copyright 2024 Alpha Cephei Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.vosk
/**
* Inverse text normalization
*
* @since 2024/06/19
* @constructor Create text processor
*/
expect class TextProcessor constructor(tagger: String, verbalizer: String) : Freeable {
/** Release text processor */
override fun free()
/** Convert string */
fun itn(input: String): String
}

View File

@ -19,4 +19,4 @@ package org.vosk.exception
/**
* Internal common IO exception. On JVM this is just a type alias.
*/
expect open class IOException(message: String? = null) : Exception
expect open class IOException(message: String?) : Exception

View File

@ -42,14 +42,13 @@ internal object LibVosk {
@Throws(IOException::class)
private fun unpackDll(targetDir: File, lib: String) {
val source: InputStream =
Vosk::class.java.getResourceAsStream("/win32-x86-64/$lib.dll")!!
Files.copy(
source,
File(targetDir, "$lib.dll").toPath(),
StandardCopyOption.REPLACE_EXISTING
)
Vosk::class.java.getResourceAsStream("/win32-x86-64/$lib.dll")!!.use {
Files.copy(
it,
File(targetDir, "$lib.dll").toPath(),
StandardCopyOption.REPLACE_EXISTING
)
}
}
init {
@ -57,6 +56,7 @@ internal object LibVosk {
Platform.isAndroid() -> {
Native.register(LibVosk::class.java, "vosk")
}
Platform.isWindows() -> {
// We have to unpack dependencies
try {
@ -79,6 +79,7 @@ internal object LibVosk {
Native.register(LibVosk::class.java, "libvosk");
}
}
else -> {
Native.register(LibVosk::class.java, "vosk");
}
@ -194,4 +195,19 @@ internal object LibVosk {
external fun vosk_batch_recognizer_pop(recognizer: BatchRecognizer)
external fun vosk_batch_recognizer_get_pending_chunks(recognizer: BatchRecognizer): Int
external fun vosk_text_processor_new(tagger: String, verbalizer: String): Pointer
external fun vosk_text_processor_free(processor: TextProcessor)
external fun vosk_text_processor_itn(processor: TextProcessor, input: String): String
external fun vosk_recognizer_set_endpointer_mode(recognizer: Recognizer, ordinal: Int)
external fun vosk_recognizer_set_endpointer_delays(
recognizer: Recognizer,
tStartMax: Float,
tEnd: Float,
tMax: Float
)
}

View File

@ -327,4 +327,27 @@ actual class Recognizer : Freeable, PointerType, AutoCloseable {
free()
}
/**
* Set endpointer scaling factor
*
* @param mode Endpointer mode
**/
actual fun setEndPointerMode(mode: EndPointerMode) {
LibVosk.vosk_recognizer_set_endpointer_mode(this, mode.ordinal)
}
/**
* Set endpointer delays
*
* @param tStartMax timeout for stopping recognition in case of initial silence (usually around 5.0)
* @param tEnd timeout for stopping recognition in seconds after we recognized something (usually around 0.5 - 1.0)
* @param tMax timeout for forcing utterance end in seconds (usually around 20-30)
**/
actual fun setEndPointerDelays(
tStartMax: Float,
tEnd: Float,
tMax: Float
) {
LibVosk.vosk_recognizer_set_endpointer_delays(this, tStartMax, tEnd, tMax)
}
}

View File

@ -0,0 +1,51 @@
/*
* Copyright 2024 Alpha Cephei Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.vosk
import com.sun.jna.PointerType
/**
* Inverse text normalization
*
* @since 2024/06/19
*/
actual class TextProcessor :
Freeable, PointerType, AutoCloseable {
/**
* Create text processor
*/
actual constructor(tagger: String, verbalizer: String) :
super(LibVosk.vosk_text_processor_new(tagger, verbalizer))
/** Release text processor */
actual override fun free() {
LibVosk.vosk_text_processor_free(this)
}
/** Convert string */
actual fun itn(input: String): String =
LibVosk.vosk_text_processor_itn(this, input)
/**
* @see free
*/
override fun close() {
free()
}
}

View File

@ -22,4 +22,4 @@ linkerOpts.linux = \
linkerOpts.linux_x64 = \
-L/usr/lib64/ \
-L/usr/local/lib64/
-L/usr/local/lib64/

View File

@ -69,16 +69,18 @@ const vosk_recognizer_ptr = ref.refType(vosk_recognizer);
let soname;
if (os.platform() == 'win32') {
// Update path to load dependent dlls
let currentPath = process.env.Path;
let dllDirectory = path.resolve(path.join(__dirname, "lib", "win-x86_64"));
process.env.Path = dllDirectory + path.delimiter + currentPath;
// Update path to load dependent dlls
let currentPath = process.env.Path;
let dllDirectory = path.resolve(path.join(__dirname, 'lib', 'win-x86_64'));
process.env.Path = dllDirectory + path.delimiter + currentPath;
soname = path.join(__dirname, "lib", "win-x86_64", "libvosk.dll")
soname = path.join(__dirname, 'lib', 'win-x86_64', 'libvosk.dll');
} else if (os.platform() == 'darwin') {
soname = path.join(__dirname, "lib", "osx-universal", "libvosk.dylib")
soname = path.join(__dirname, 'lib', 'osx-universal', 'libvosk.dylib');
} else if (os.platform() == 'linux' && os.arch() == 'arm64') {
soname = path.join(__dirname, 'lib', 'linux-arm64', 'libvosk.so');
} else {
soname = path.join(__dirname, "lib", "linux-x86_64", "libvosk.so")
soname = path.join(__dirname, 'lib', 'linux-x86_64', 'libvosk.so');
}
const libvosk = ffi.Library(soname, {
@ -128,6 +130,9 @@ class Model {
* @type {unknown}
*/
this.handle = libvosk.vosk_model_new(modelPath);
if (!this.handle) {
throw new Error('Failed to create a model.');
}
}
/**
@ -161,6 +166,9 @@ class SpeakerModel {
* @type {unknown}
*/
this.handle = libvosk.vosk_spk_model_new(modelPath);
if (!this.handle) {
throw new Error('Failed to create a speaker model.');
}
}
/**
@ -235,6 +243,10 @@ class Recognizer {
: hasOwnProperty(param, 'grammar')
? libvosk.vosk_recognizer_new_grm(model.handle, sampleRate, JSON.stringify(param.grammar))
: libvosk.vosk_recognizer_new(model.handle, sampleRate);
if (!this.handle) {
throw new Error('Failed to create a recognizer.');
}
}
/**

View File

@ -1,6 +1,6 @@
{
"name": "vosk",
"version": "0.3.45",
"version": "0.3.75",
"description": "Node binding for continuous offline voice recoginition with Vosk library.",
"repository": {
"type": "git",

python/example/test_itn.py Executable file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env python3
from vosk import Processor
proc = Processor("ru_itn_tagger.fst", "ru_itn_verbalizer.fst")
print (proc.process("у нас десять яблок"))
print (proc.process("у нас десять яблок и десять миллилитров воды точка"))
print (proc.process("мы пришли в восемь часов пять минут"))

View File

@ -45,7 +45,7 @@ with open("README.md", "rb") as fh:
setuptools.setup(
name="vosk",
version="0.3.46",
version="0.3.75",
author="Alpha Cephei Inc",
author_email="contact@alphacephei.com",
description="Offline open source speech recognition API based on Kaldi and Vosk",

View File

@ -28,7 +28,7 @@ def recognize(line):
def main():
p = Pool(8)
texts = p.map(recognize, open(sys.argv[1], encoding="uft-8").readlines())
texts = p.map(recognize, open(sys.argv[1], encoding="utf-8").readlines())
print ("\n".join(texts))
main()

View File

@ -287,3 +287,17 @@ class BatchRecognizer:
def GetPendingChunks(self):
return _c.vosk_batch_recognizer_get_pending_chunks(self._handle)
class Processor:
def __init__(self, *args):
self._handle = _c.vosk_text_processor_new(args[0].encode('utf-8'), args[1].encode('utf-8'))
if self._handle == _ffi.NULL:
raise Exception("Failed to create processor")
def __del__(self):
_c.vosk_text_processor_free(self._handle)
def process(self, text):
return _ffi.string(_c.vosk_text_processor_itn(self._handle, text.encode('utf-8'))).decode('utf-8')

View File

@ -1,6 +1,6 @@
Gem::Specification.new do |s|
s.name = "vosk"
s.version = "0.3.45"
s.version = "0.3.75"
s.summary = "Offline speech recognition API"
s.description = "Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto, Hindi, Czech, Polish. More to come."
s.authors = ["Alpha Cephei Inc"]

View File

@ -23,17 +23,18 @@ VOSK_SOURCES= \
language_model.cc \
model.cc \
spk_model.cc \
vosk_api.cc
vosk_api.cc \
postprocessor.cc
VOSK_HEADERS= \
recognizer.h \
language_model.h \
model.h \
spk_model.h \
vosk_api.h
vosk_api.h \
postprocessor.h
CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LINKING \
-I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS)
CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LINKING -I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS)
LDFLAGS=

View File

@ -46,9 +46,6 @@ void LanguageModelEstimator::AddCounts(const std::vector<int32> &sentence) {
void LanguageModelEstimator::IncrementCount(const std::vector<int32> &history,
int32 next_phone) {
int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history);
if (lm_states_[lm_state_index].tot_count == 0) {
num_active_lm_states_++;
}
lm_states_[lm_state_index].AddCount(next_phone, 1);
}
@ -106,6 +103,7 @@ int32 LanguageModelEstimator::FindOrCreateLmStateIndexForHistory(
int32 backoff_lm_state = FindOrCreateLmStateIndexForHistory(backoff_hist);
lm_states_[ans].backoff_lmstate_index = backoff_lm_state;
}
num_active_lm_states_++;
return ans;
}
@ -156,12 +154,13 @@ int32 LanguageModelEstimator::FindInitialFstState() const {
void LanguageModelEstimator::OutputToFst(
int32 num_states,
fst::StdVectorFst *fst) const {
fst::StdVectorFst *out_fst) const {
KALDI_ASSERT(num_states == num_active_lm_states_);
fst->DeleteStates();
fst::StdVectorFst fst;
for (int32 i = 0; i < num_states; i++)
fst->AddState();
fst->SetStart(FindInitialFstState());
fst.AddState();
fst.SetStart(FindInitialFstState());
int64 tot_count = 0;
double tot_logprob = 0.0;
@ -184,28 +183,29 @@ void LanguageModelEstimator::OutputToFst(
tot_count += count;
tot_logprob += logprob * count;
if (phone == 0) { // Go to final state
fst->SetFinal(lm_state.fst_state, fst::TropicalWeight(-logprob));
fst.SetFinal(lm_state.fst_state, fst::TropicalWeight(-logprob));
} else { // It becomes a transition.
std::vector<int32> next_history(lm_state.history);
next_history.push_back(phone);
int32 dest_lm_state = FindNonzeroLmStateIndexForHistory(next_history),
dest_fst_state = lm_states_[dest_lm_state].fst_state;
KALDI_ASSERT(dest_fst_state != -1);
fst->AddArc(lm_state.fst_state,
fst.AddArc(lm_state.fst_state,
fst::StdArc(phone, phone, fst::TropicalWeight(-logprob),
dest_fst_state));
}
}
if (lm_state.backoff_lmstate_index >= 0) {
fst->AddArc(lm_state.fst_state, fst::StdArc(0, 0, fst::TropicalWeight(-log(1 - opts_.discount)), lm_states_[lm_state.backoff_lmstate_index].fst_state));
fst.AddArc(lm_state.fst_state, fst::StdArc(0, 0, fst::TropicalWeight(-log(1 - opts_.discount)), lm_states_[lm_state.backoff_lmstate_index].fst_state));
}
}
fst::Connect(fst);
// Make sure that Connect does not delete any states.
int32 num_states_connected = fst->NumStates();
KALDI_ASSERT(num_states_connected == num_states);
fst::DeterminizeOptions<fst::StdArc> opts;
fst::Determinize(fst, out_fst, opts);
fst::Connect(out_fst);
// arc-sort. ilabel or olabel doesn't matter, it's an acceptor.
fst::ArcSort(fst, fst::ILabelCompare<fst::StdArc>());
KALDI_LOG << "Created language model with " << num_states
<< " states and " << fst::NumArcs(*fst) << " arcs.";
fst::ArcSort(out_fst, fst::ILabelCompare<fst::StdArc>());
KALDI_LOG << "Created language model with " << out_fst->NumStates()
<< " states and " << fst::NumArcs(*out_fst) << " arcs.";
KALDI_LOG << "Originally language model with " << fst.NumStates()
<< " states and " << fst::NumArcs(fst) << " arcs.";
}

src/postprocessor.cc Normal file
View File

@ -0,0 +1,66 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "postprocessor.h"
using fst::TokenType;
Processor::Processor(const std::string& tagger_path,
const std::string& verbalizer_path) {
tagger_.reset(StdVectorFst::Read(tagger_path));
verbalizer_.reset(StdVectorFst::Read(verbalizer_path));
compiler_ = std::make_shared<StringCompiler<StdArc>>(TokenType::BYTE);
printer_ = std::make_shared<StringPrinter<StdArc>>(TokenType::BYTE);
}
std::string Processor::ShortestPath(const StdVectorFst& lattice) {
StdVectorFst shortest_path;
fst::ShortestPath(lattice, &shortest_path, 1, true);
std::string output;
printer_->operator()(shortest_path, &output);
return output;
}
std::string Processor::Compose(const std::string& input,
const StdVectorFst* fst) {
StdVectorFst input_fst;
compiler_->operator()(input, &input_fst);
StdVectorFst lattice;
fst::Compose(input_fst, *fst, &lattice);
return ShortestPath(lattice);
}
std::string Processor::Tag(const std::string& input) {
if (input.empty()) {
return "";
}
return Compose(input, tagger_.get());
}
std::string Processor::Verbalize(const std::string& input) {
if (input.empty()) {
return "";
}
std::string output = Compose(input, verbalizer_.get());
output.erase(std::remove(output.begin(), output.end(), '\0'), output.end());
return output;
}
std::string Processor::Normalize(const std::string& input) {
return Verbalize(Tag(input));
}

src/postprocessor.h Normal file
View File

@ -0,0 +1,45 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PROCESSOR_WETEXT_PROCESSOR_H_
#define PROCESSOR_WETEXT_PROCESSOR_H_
#include <memory>
#include <string>
#include "fst/fstlib.h"
using fst::StdArc;
using fst::StdVectorFst;
using fst::StringCompiler;
using fst::StringPrinter;
class Processor {
public:
Processor(const std::string& tagger_path, const std::string& verbalizer_path);
std::string Tag(const std::string& input);
std::string Verbalize(const std::string& input);
std::string Normalize(const std::string& input);
private:
std::string ShortestPath(const StdVectorFst& lattice);
std::string Compose(const std::string& input, const StdVectorFst* fst);
std::shared_ptr<StdVectorFst> tagger_ = nullptr;
std::shared_ptr<StdVectorFst> verbalizer_ = nullptr;
std::shared_ptr<StringCompiler<StdArc>> compiler_ = nullptr;
std::shared_ptr<StringPrinter<StdArc>> printer_ = nullptr;
};
#endif // PROCESSOR_WETEXT_PROCESSOR_H_

View File

@ -247,8 +247,8 @@ void Recognizer::SetEndpointerDelays(float t_start_max, float t_end, float t_max
rule1 = t_start_max;
rule2 = t_end;
rule3 = t_end * 1.5;
rule4 = t_end * 2;
rule3 = t_end + 0.5;
rule4 = t_end + 1.0;
rule5 = t_max;
KALDI_LOG << "Updating endpointer delays " << rule1 << "," << rule2 << "," << rule3 << "," << rule4 << "," << rule5;
@ -275,7 +275,7 @@ void Recognizer::SetSpkModel(SpkModel *spk_model)
void Recognizer::SetGrm(char const *grammar)
{
if (state_ == RECOGNIZER_RUNNING) {
KALDI_ERR << "Can't add speaker model to already running recognizer";
KALDI_ERR << "Can't add grammar to already running recognizer";
return;
}
@ -355,6 +355,7 @@ void Recognizer::UpdateGrammarFst(char const *grammar)
}
estimator.AddCounts(sentence);
}
delete g_fst_;
g_fst_ = new StdVectorFst();
estimator.Estimate(g_fst_);
@ -698,6 +699,17 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
obj["alternatives"].append(entry);
}
if (spk_model_) {
Vector<BaseFloat> xvector;
int num_spk_frames;
if (GetSpkVector(xvector, &num_spk_frames)) {
for (int i = 0; i < xvector.Dim(); i++) {
obj["spk"].append(xvector(i));
}
obj["spk_frames"] = num_spk_frames;
}
}
return StoreReturn(obj.dump());
}

View File

@ -17,6 +17,7 @@
#include "recognizer.h"
#include "model.h"
#include "spk_model.h"
#include "postprocessor.h"
#if HAVE_CUDA
#include "cudamatrix/cu-device.h"
@ -304,3 +305,28 @@ int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer)
return 0;
#endif
}
VoskTextProcessor *vosk_text_processor_new(const char *tagger, const char *verbalizer)
{
try {
return (VoskTextProcessor *)new Processor(tagger, verbalizer);
} catch (...) {
return nullptr;
}
}
void vosk_text_processor_free(VoskTextProcessor *processor)
{
delete ((Processor *)processor);
}
char *vosk_text_processor_itn(VoskTextProcessor *processor, const char *input)
{
Processor *wprocessor = (Processor *)processor;
std::string sinput(input);
std::string tagged_text = wprocessor->Tag(sinput);
std::string normalized_text = wprocessor->Verbalize(tagged_text);
return strdup(normalized_text.c_str());
}

View File

@ -39,6 +39,8 @@ typedef struct VoskSpkModel VoskSpkModel;
* speaker information and so on */
typedef struct VoskRecognizer VoskRecognizer;
/** Inverse text normalization */
typedef struct VoskTextProcessor VoskTextProcessor;
/**
* Batch model object
@ -376,6 +378,15 @@ void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer);
/** Get amount of pending chunks for more intelligent waiting */
int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer);
/** Create text processor */
VoskTextProcessor *vosk_text_processor_new(const char *tagger, const char *verbalizer);
/** Release text processor */
void vosk_text_processor_free(VoskTextProcessor *processor);
/** Convert string. The result is allocated with strdup and must be freed by the caller */
char *vosk_text_processor_itn(VoskTextProcessor *processor, const char *input);
#ifdef __cplusplus
}
#endif

View File

@ -1,3 +1,120 @@
A proper simple setup to train a Vosk model
# Vosk API Training
More documentation later
This directory contains scripts and tools for training speech recognition models using the Kaldi toolkit.
## Table of Contents
1. [Overview](#overview)
2. [Directory Structure](#directory-structure)
3. [Installation](#installation)
4. [Training Process](#training-process)
- [Data Preparation](#data-preparation)
- [Dictionary Preparation](#dictionary-preparation)
- [MFCC Feature Extraction](#mfcc-feature-extraction)
- [Acoustic Model Training](#acoustic-model-training)
- [TDNN Chain Model Training](#tdnn-chain-model-training)
- [Decoding](#decoding)
5. [Results](#results)
6. [Contributing](#contributing)
## Overview
This repository provides tools for training custom speech recognition models using Kaldi. It supports acoustic model training, language model creation, and decoding pipelines.
## Directory Structure
```plaintext
.
├── cmd.sh # Command configuration for training and decoding
├── conf/
│ ├── mfcc.conf # Configuration for MFCC feature extraction
│ └── online_cmvn.conf # Online Cepstral Mean Variance Normalization (currently empty)
├── local/
│ ├── chain/
│ │ ├── run_ivector_common.sh # Script for i-vector extraction during chain model training
│ │ └── run_tdnn.sh # Script for training a TDNN model
│ ├── data_prep.sh # Data preparation script for creating Kaldi data directories
│ ├── download_and_untar.sh # Script for downloading and extracting datasets
│ ├── download_lm.sh # Downloads language models
│ ├── prepare_dict.sh # Prepares the pronunciation dictionary
│ └── score.sh # Scoring script for evaluation
├── path.sh # Script for setting Kaldi paths
├── RESULTS # Script for printing the best WER results
├── RESULTS.txt # Contains WER results from decoding
├── run.sh # Main script for the entire training pipeline
├── steps -> ../../wsj/s5/steps/ # Link to Kaldi's WSJ steps for acoustic model training
└── utils -> ../../wsj/s5/utils/ # Link to Kaldi's utility scripts
```
### Key Files:
- **cmd.sh**: Defines commands for running training and decoding tasks.
- **path.sh**: Sets up paths for Kaldi binaries and scripts.
- **run.sh**: Main entry point for the training pipeline, running tasks in stages.
- **RESULTS**: Displays Word Error Rate (WER) for the trained models.
## Installation
### Prerequisites
- [Kaldi](https://github.com/kaldi-asr/kaldi): Kaldi toolkit must be installed and configured.
- Required tools: `ffmpeg`, `sox`, `sctk` for data preparation and scoring.
### Steps
1. Clone the Vosk API repository.
2. Install Kaldi and ensure the `KALDI_ROOT` is correctly set in `path.sh`.
3. Set environment variables using `cmd.sh` and `path.sh`.
## Training Process
### Data Preparation
Run the data preparation stage in `run.sh`:
```bash
bash run.sh --stage 0 --stop_stage 0
```
This stage downloads and prepares the LibriSpeech dataset.
### Dictionary Preparation
Prepare the pronunciation dictionary with:
```bash
bash run.sh --stage 1 --stop_stage 1
```
This step generates the necessary files for Kaldi's `prepare_lang.sh` script.
### MFCC Feature Extraction
Run the MFCC extraction process:
```bash
bash run.sh --stage 2 --stop_stage 2
```
This step extracts Mel-frequency cepstral coefficients (MFCC) features and computes Cepstral Mean Variance Normalization (CMVN).
### Acoustic Model Training
Train monophone, LDA+MLLT, and SAT models:
```bash
bash run.sh --stage 3 --stop_stage 3
```
This stage trains GMM-based models and aligns the data for TDNN training.
### TDNN Chain Model Training
Train a Time-Delay Neural Network (TDNN) chain model:
```bash
bash run.sh --stage 4 --stop_stage 4
```
The chain model uses i-vectors for speaker adaptation.
### Decoding
After training, decode the test data:
```bash
bash run.sh --stage 5 --stop_stage 5
```
This step decodes using the trained model and evaluates the Word Error Rate (WER).
## Results
WER can be evaluated by running:
```bash
bash RESULTS
```
Example of `RESULTS.txt`:
```plaintext
%WER 14.10 [ 2839 / 20138, 214 ins, 487 del, 2138 sub ] exp/chain/tdnn/decode_test/wer_11_0.0
%WER 12.67 [ 2552 / 20138, 215 ins, 406 del, 1931 sub ] exp/chain/tdnn/decode_test_rescore/wer_11_0.0
```

View File

@ -44,6 +44,7 @@ RUN cd /opt \
&& cd /opt/kaldi/src \
&& sed -i "s:TARGET_ARCH=\"\`uname -m\`\":TARGET_ARCH=$(echo $CROSS_TRIPLE|cut -d - -f 1):g" configure \
&& sed -i "s: -O1 : -O3 :g" makefiles/linux_openblas_arm.mk \
&& sed -i "s: -O1 : -O3 :g" makefiles/linux_openblas_arm.mk \
&& ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
&& make -j 10 online2 lm rnnlm \
&& make -j 10 online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;

View File

@ -35,5 +35,5 @@ RUN cd /opt \
&& sed -i "s:TARGET_ARCH=\"\`uname -m\`\":TARGET_ARCH=$(echo $CROSS_TRIPLE|cut -d - -f 1):g" configure \
&& sed -i "s: -O1 : -O3 :g" makefiles/linux_openblas_arm.mk \
&& ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
&& make -j 10 online2 lm rnnlm \
&& make -j 10 online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;

View File

@ -0,0 +1,50 @@
ARG DOCKCROSS_IMAGE=alphacep/dockcross-linux-armv7
FROM ${DOCKCROSS_IMAGE}
LABEL description="A docker image for building portable Python linux binary wheels and Kaldi on other architectures"
LABEL maintainer="contact@alphacephei.com"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
libffi-dev \
libpcre3-dev \
zlib1g-dev \
automake \
autoconf \
libtool \
cmake \
python3 \
python3-pip \
python3-wheel \
python3-setuptools \
python3-cffi \
&& rm -rf /var/lib/apt/lists/*
ARG OPENBLAS_ARGS=
RUN cd /opt \
&& git clone -b vosk --single-branch https://github.com/alphacep/kaldi \
&& cd kaldi/tools \
&& git clone -b v0.3.20 --single-branch https://github.com/xianyi/OpenBLAS \
&& git clone -b v3.2.1 --single-branch https://github.com/alphacep/clapack \
&& echo ${OPENBLAS_ARGS} \
&& make -C OpenBLAS ONLY_CBLAS=1 ${OPENBLAS_ARGS} HOSTCC=gcc USE_LOCKING=1 USE_THREAD=0 all \
&& make -C OpenBLAS ${OPENBLAS_ARGS} HOSTCC=gcc USE_LOCKING=1 USE_THREAD=0 PREFIX=$(pwd)/OpenBLAS/install install \
&& mkdir -p clapack/BUILD && cd clapack/BUILD && cmake .. \
&& make -j 10 -C F2CLIBS \
&& make -j 10 -C BLAS \
&& make -j 10 -C SRC \
&& find . -name "*.a" | xargs cp -t ../../OpenBLAS/install/lib \
&& cd /opt/kaldi/tools \
&& git clone --single-branch https://github.com/alphacep/openfst openfst \
&& cd openfst \
&& autoreconf -i \
&& CFLAGS="-g -O3" ./configure --prefix=/opt/kaldi/tools/openfst --enable-static --enable-shared --enable-far --enable-ngram-fsts --enable-lookahead-fsts --with-pic --disable-bin --host=${CROSS_TRIPLE} --build=x86-linux-gnu \
&& make -j 10 && make install \
&& cd /opt/kaldi/src \
&& sed -i "s:TARGET_ARCH=\"\`uname -m\`\":TARGET_ARCH=$(echo $CROSS_TRIPLE|cut -d - -f 1):g" configure \
&& sed -i "s: -O1 : -O3 :g" makefiles/linux_openblas_arm.mk \
&& sed -i "s:-DHAVE_EXECINFO_H=1::g" makefiles/linux_openblas_arm.mk \
&& ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
&& make -j 10 online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;

View File

@ -30,5 +30,5 @@ RUN cd /opt \
&& ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
&& sed -i 's:-msse -msse2:-msse -msse2:g' kaldi.mk \
&& sed -i 's: -O1 : -O3 :g' kaldi.mk \
&& make -j $(nproc) online2 lm rnnlm \
&& make -j $(nproc) online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;

View File

@ -27,5 +27,5 @@ RUN cd /opt \
&& ./configure --mathlib=MKL --shared --use-cuda=no \
&& sed -i 's:-msse -msse2:-msse -msse2 -mavx -mavx2:g' kaldi.mk \
&& sed -i 's: -O1 : -O3 :g' kaldi.mk \
&& make -j $(nproc) online2 lm rnnlm \
&& make -j $(nproc) online2 rnnlm \
&& find /opt/kaldi -name "*.o" -exec rm {} \;

View File

@ -62,4 +62,4 @@ RUN cd /opt/kaldi \
--host=x86_64-w64-mingw32 --openblas-clapack-root=/opt/kaldi/local \
--fst-root=/opt/kaldi/local --fst-version=1.8.0 \
&& make depend -j \
&& make -j $(nproc) online2 lm rnnlm
&& make -j $(nproc) online2 rnnlm

View File

@ -61,4 +61,4 @@ RUN cd /opt/kaldi \
--host=i686-w64-mingw32 --openblas-clapack-root=/opt/kaldi/local \
--fst-root=/opt/kaldi/local --fst-version=1.8.0 \
&& make depend -j \
&& make -j $(nproc) online2 lm rnnlm
&& make -j $(nproc) online2 rnnlm

View File

@ -0,0 +1,70 @@
FROM ubuntu:20.04
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y --no-install-recommends \
ca-certificates \
g++ \
bzip2 \
unzip \
make \
wget \
git \
python3 \
python3-pip \
python3-wheel \
python3-setuptools \
python3-cffi \
zlib1g-dev \
patch \
cmake \
xz-utils \
automake \
autoconf \
libtool \
pkg-config \
sudo \
&& rm -rf /var/lib/apt/lists/*
RUN cd /opt && \
wget https://github.com/mstorsjo/llvm-mingw/releases/download/20240820/llvm-mingw-20240820-msvcrt-ubuntu-20.04-x86_64.tar.xz \
&& tar xf llvm-mingw-20240820-msvcrt-ubuntu-20.04-x86_64.tar.xz \
&& mv llvm-mingw-20240820-msvcrt-ubuntu-20.04-x86_64 llvm-mingw
ENV PATH="$PATH:/opt/llvm-mingw/bin"
RUN mkdir /opt/kaldi \
&& git clone https://github.com/alphacep/openfst \
&& cd openfst \
&& autoreconf -i \
&& CXX=aarch64-w64-mingw32-g++ CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" \
./configure --prefix=/opt/kaldi/local \
--enable-shared --enable-static --with-pic --disable-bin \
--enable-lookahead-fsts --enable-ngram-fsts --host=aarch64-w64-mingw32 \
&& make -j $(nproc) \
&& make install
RUN cd /opt/kaldi \
&& git clone -b v0.3.20 --single-branch https://github.com/xianyi/OpenBLAS \
&& cd OpenBLAS \
&& make HOSTCC=gcc CC=aarch64-w64-mingw32-gcc ONLY_CBLAS=1 USE_LOCKING=1 USE_THREAD=0 USE_OPENMP=0 DYNAMIC_ARCH=0 TARGET=ARMV8 ARCH=arm64 BINARY=64 -j $(nproc) \
&& make PREFIX=/opt/kaldi/local install
RUN cd /opt/kaldi \
&& git clone -b v3.2.1 --single-branch https://github.com/alphacep/clapack \
&& mkdir clapack/BUILD \
&& cd clapack/BUILD \
&& cmake -DCMAKE_C_COMPILER_TARGET=aarch64-w64-mingw32 -DCMAKE_C_COMPILER=aarch64-w64-mingw32-gcc -DCMAKE_SYSTEM_NAME=Windows -DCMAKE_CROSSCOMPILING=True .. \
&& make -C F2CLIBS/libf2c \
&& make -C BLAS \
&& make -C SRC \
&& find . -name *.a -exec cp {} /opt/kaldi/local/lib \;
RUN cd /opt/kaldi \
&& git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \
&& cd kaldi/src \
&& CXX=aarch64-w64-mingw32-g++ CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \
--mathlib=OPENBLAS_CLAPACK \
--host=aarch64-w64-mingw32 --openblas-clapack-root=/opt/kaldi/local \
--fst-root=/opt/kaldi/local --fst-version=1.8.0 \
&& make depend -j \
&& make LLVM_BUILD=1 -j $(nproc) online2 rnnlm

travis/build-dockcross-musl.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
set -e
set -x
docker build --build-arg="DOCKCROSS_IMAGE=dockcross/linux-armv7l-musl" --build-arg="OPENBLAS_ARGS=TARGET=ARMV7" --file Dockerfile.dockcross-musl --tag alphacep/kaldi-dockcross-armv7-musl:latest .
docker run --rm -v /home/shmyrev/travis/vosk-api/:/io alphacep/kaldi-dockcross-armv7-musl /io/travis/build-wheels-dockcross.sh

View File

@ -0,0 +1,5 @@
#!/bin/bash
set -e -x
docker build --file Dockerfile.winaarch64 --tag alphacep/kaldi-winaarch64:latest .
docker run --rm -v `realpath ..`:/io alphacep/kaldi-winaarch64 /io/travis/build-wheels-winaarch64.sh

View File

@ -14,6 +14,11 @@ case $CROSS_TRIPLE in
export VOSK_MACHINE=armv7l
export VOSK_ARCHITECTURE=32bit
;;
*armv7l-linux-musleabihf*)
export VOSK_MACHINE=armv7l
export VOSK_ARCHITECTURE=32bit
export VOSK_VARIANT="-musl"
;;
*i686-*)
export VOSK_MACHINE=x86
export VOSK_ARCHITECTURE=32bit
@ -29,9 +34,9 @@ case $CROSS_TRIPLE in
esac
# Copy library to output folder
mkdir -p /io/wheelhouse/vosk-linux-$VOSK_MACHINE
cp /opt/vosk-api/src/*.so /opt/vosk-api/src/vosk_api.h /io/wheelhouse/vosk-linux-$VOSK_MACHINE
mkdir -p /io/wheelhouse/vosk-linux-${VOSK_MACHINE}${VOSK_VARIANT}
cp /opt/vosk-api/src/*.so /opt/vosk-api/src/vosk_api.h /io/wheelhouse/vosk-linux-$VOSK_MACHINE${VOSK_VARIANT}
# Build wheel
python3 -m pip install requests tqdm srt websockets wheel
python3 -m pip install requests tqdm srt websockets wheel --break-system-packages
python3 -m pip wheel /opt/vosk-api/python --no-deps -w /io/wheelhouse

View File

@ -0,0 +1,18 @@
#!/bin/bash
set -e -x
# Build libvosk
cd /opt
git clone https://github.com/alphacep/vosk-api
cd vosk-api/src
EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=aarch64-w64-mingw32-g++ EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc)
# Copy dlls to output folder
mkdir -p /io/wheelhouse/vosk-winaarch64
cp /opt/vosk-api/src/*.{dll,lib} /opt/vosk-api/src/vosk_api.h /io/wheelhouse/vosk-winaarch64
# Build wheel and put to the output folder
export VOSK_SOURCE=/opt/vosk-api
export VOSK_SYSTEM=Windows
export VOSK_ARCHITECTURE=64bit
python3 -m pip -v wheel /opt/vosk-api/python --no-deps -w /io/wheelhouse

View File

@ -1,6 +1,6 @@
{
"name": "vosk-js",
"version": "0.3.45",
"version": "0.3.75",
"description": "Node binding for continuous voice recoginition through vosk-api.",
"repository": {
"type": "git",