Add a C++ example for simulated streaming ASR (#2607)

Fangjun Kuang 2025-09-18 14:59:50 +08:00 committed by GitHub
parent 9102f34179
commit 86af28157b
19 changed files with 425 additions and 7 deletions

View File

@@ -66,12 +66,17 @@ jobs:
./gitcompile
popd
p=$PWD
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_INSTALL_PREFIX=./install \
-DSHERPA_ONNX_ENABLE_GPU=ON \
@@ -113,13 +118,18 @@ jobs:
./gitcompile
popd
p=$PWD
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_INSTALL_PREFIX=./install \
..
@@ -245,7 +255,7 @@ jobs:
file: sherpa-onnx-*linux-aarch64*.tar.bz2
# repo_name: k2-fsa/sherpa-onnx
# repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
# tag: v1.11.5
# tag: v1.12.13
- name: Test offline Moonshine
if: matrix.build_type != 'Debug'

View File

@@ -67,11 +67,16 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_INSTALL_PREFIX=./install \
..

View File

@@ -46,7 +46,15 @@ jobs:
echo "PWD"
ls -lh /project/alsa-lib/src/.libs
CIBW_ENVIRONMENT: CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs LD_LIBRARY_PATH=/project/build/bdist.linux-x86_64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR SHERPA_ONNX_MAKE_ARGS="VERBOSE=1" SHERPA_ONNX_ENABLE_ALSA=1 SHERPA_ONNX_ENABLE_GPU=ON
CIBW_ENVIRONMENT: >
CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH
C_INCLUDE_PATH=/project/alsa-lib/include:$C_INCLUDE_PATH
SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs
LD_LIBRARY_PATH=/project/build/bdist.linux-x86_64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR
SHERPA_ONNX_MAKE_ARGS="VERBOSE=1"
SHERPA_ONNX_ENABLE_ALSA=1
SHERPA_ONNX_ENABLE_GPU=ON
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_GPU=ON -DALSA_INCLUDE_DIR=/project/alsa-lib/include -DALSA_LIBRARY=/project/alsa-lib/src/.libs/libasound.so"
CIBW_BUILD: "${{ matrix.python-version}}-* "
CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686"
CIBW_BUILD_VERBOSITY: 3

View File

@@ -109,6 +109,7 @@ jobs:
strings $PWD/alsa-lib/src/.libs/libasound.so.2.0.0 | grep "^GLIBC"
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
export SHERPA_ONNX_RKNN_TOOLKIT2_PATH=$PWD/rknn-toolkit2
@@ -117,7 +118,7 @@ jobs:
export SHERPA_ONNX_ENABLE_ALSA=1
export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_RKNN=ON"
export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_RKNN=ON -DALSA_INCLUDE_DIR=$p/alsa-lib/include -DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so"
python3 setup.py bdist_wheel
mv dist wheelhouse

View File

@@ -326,12 +326,13 @@ jobs:
CIBW_ENVIRONMENT: >
SHERPA_ONNX_SPLIT_PYTHON_PACKAGE=ON
C_INCLUDE_PATH=/project/alsa-lib/include:$C_INCLUDE_PATH
CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH
SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs
LD_LIBRARY_PATH=/project/build/bdist.linux-aarch64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR
SHERPA_ONNX_MAKE_ARGS="VERBOSE=1"
SHERPA_ONNX_ENABLE_ALSA=1
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_BINARY=OFF -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF -DSHERPA_ONNX_ENABLE_C_API=OFF -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF"
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_BINARY=OFF -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF -DSHERPA_ONNX_ENABLE_C_API=OFF -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF -DALSA_INCLUDE_DIR=$p/alsa-lib/include -DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so"
CIBW_BUILD: "${{ matrix.python-version}}-* "
CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686"

View File

@@ -58,6 +58,7 @@ jobs:
shell: bash
run: |
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
export LD_LIBRARY_PATH=$SHERPA_ONNX_ALSA_LIB_DIR:$LD_LIBRARY_PATH
@@ -66,9 +67,11 @@ jobs:
echo "---"
ls -lh $PWD/alsa-lib/src/.libs
p=$PWD
export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1"
export SHERPA_ONNX_ENABLE_ALSA=1
export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_GPU=ON"
export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_GPU=ON -DALSA_INCLUDE_DIR=$p/alsa-lib/include -DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so"
onnxruntime_version=${{ matrix.onnxruntime_version }}
if [[ $onnxruntime_version == "1.22.0" ]]; then

View File

@@ -82,6 +82,7 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
mkdir build
@@ -326,11 +327,12 @@ jobs:
CIBW_ENVIRONMENT: >
SHERPA_ONNX_SPLIT_PYTHON_PACKAGE=ON
CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH
C_INCLUDE_PATH=/project/alsa-lib/include:$C_INCLUDE_PATH
SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs
LD_LIBRARY_PATH=/project/build/bdist.linux-x86_64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR
SHERPA_ONNX_MAKE_ARGS="VERBOSE=1"
SHERPA_ONNX_ENABLE_ALSA=1
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_BINARY=OFF -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF -DSHERPA_ONNX_ENABLE_C_API=OFF -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF"
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_BINARY=OFF -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF -DSHERPA_ONNX_ENABLE_C_API=OFF -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF -DALSA_INCLUDE_DIR=$p/alsa-lib/include -DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so"
CIBW_BUILD: "${{ matrix.python-version}}-* "
CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686"

View File

@@ -82,12 +82,17 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-D CMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-D CMAKE_INSTALL_PREFIX=./install \
-D BUILD_SHARED_LIBS=ON \

View File

@@ -80,12 +80,16 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-D SHERPA_ONNX_ENABLE_TTS=ON \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_SHARED_LIBS=ON \

View File

@@ -113,12 +113,17 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-D SHERPA_ONNX_ENABLE_TTS=ON \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_SHARED_LIBS=ON \

View File

@@ -110,12 +110,17 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-D SHERPA_ONNX_ENABLE_TTS=${{ matrix.with_tts }} \
-D CMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} \

View File

@@ -100,7 +100,9 @@ jobs:
strings $PWD/alsa-lib/src/.libs/libasound.so.2.0.0 | grep "^GLIBC"
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
export SHERPA_ONNX_RKNN_TOOLKIT2_PATH=$PWD/rknn-toolkit2
export SHERPA_ONNX_RKNN_TOOLKIT2_LIB_DIR=$SHERPA_ONNX_RKNN_TOOLKIT2_PATH/rknpu2/runtime/Linux/librknn_api/aarch64
@@ -114,6 +116,8 @@ jobs:
BUILD_SHARED_LIBS=${{ matrix.shared }}
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_INSTALL_PREFIX=./install \
-DSHERPA_ONNX_ENABLE_RKNN=ON \

View File

@@ -310,7 +310,7 @@ if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME
Could not find alsa/asoundlib.h !
We won't build sherpa-onnx-alsa
To fix that, please do:
(1) sudo apt-get install alsa-utils libasound2-dev
(1) sudo apt-get install alsa-utils libasound2-dev pkg-config
(2) rm -rf build
(3) re-try
")

View File

@@ -72,6 +72,7 @@ if [ ! -f alsa-lib/src/.libs/libasound.so ]; then
fi
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
if [[ x"$BUILD_SHARED_LIBS" == x"" ]]; then

View File

@@ -73,9 +73,11 @@ def get_binaries():
"sherpa-onnx-vad",
"sherpa-onnx-vad-microphone",
"sherpa-onnx-vad-microphone-offline-asr",
"sherpa-onnx-vad-microphone-simulated-streaming-asr",
"sherpa-onnx-vad-with-offline-asr",
"sherpa-onnx-vad-with-online-asr",
"sherpa-onnx-version",
"sherpa-onnx-pa-devs",
]
if enable_alsa():

View File

@@ -29,6 +29,7 @@ function(download_portaudio)
# Always use static build
set(PA_BUILD_SHARED OFF CACHE BOOL "" FORCE)
set(PA_BUILD_STATIC ON CACHE BOOL "" FORCE)
set(PA_BUILD_EXAMPLES ON CACHE BOOL "" FORCE)
FetchContent_Declare(portaudio
URL
@@ -50,6 +51,15 @@ function(download_portaudio)
endif()
add_subdirectory(${portaudio_SOURCE_DIR} ${portaudio_BINARY_DIR} EXCLUDE_FROM_ALL)
if(CMAKE_SYSTEM_NAME STREQUAL Linux)
if(PA_USE_ALSA)
message(STATUS "portaudio with ALSA")
else()
message(STATUS "portaudio without ALSA")
endif()
endif()
set_target_properties(pa_devs PROPERTIES OUTPUT_NAME "sherpa-onnx-pa-devs")
set_target_properties(portaudio_static PROPERTIES OUTPUT_NAME "sherpa-onnx-portaudio_static")
if(NOT WIN32)
@@ -62,6 +72,11 @@ function(download_portaudio)
DESTINATION lib)
endif()
install(TARGETS
pa_devs
DESTINATION bin)
add_custom_target(build_pa_devs ALL DEPENDS pa_devs)
endfunction()
download_portaudio()

View File

@@ -518,6 +518,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
microphone.cc
)
add_executable(sherpa-onnx-vad-microphone-simulated-streaming-asr
sherpa-onnx-vad-microphone-simulated-streaming-asr.cc
microphone.cc
)
add_executable(sherpa-onnx-vad-with-offline-asr
sherpa-onnx-vad-with-offline-asr.cc
)
@@ -548,6 +553,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
sherpa-onnx-microphone-offline-audio-tagging
sherpa-onnx-microphone-offline-speaker-identification
sherpa-onnx-vad-microphone
sherpa-onnx-vad-microphone-simulated-streaming-asr
sherpa-onnx-vad-microphone-offline-asr
sherpa-onnx-vad-with-offline-asr
sherpa-onnx-vad-with-online-asr

View File

@@ -0,0 +1,73 @@
// sherpa-onnx/csrc/sherpa-display.h
//
// Copyright (c) 2025 Xiaomi Corporation
#pragma once
#include <stdio.h>   // for printf
#include <stdlib.h>  // for system
#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace sherpa_onnx {
class SherpaDisplay {
public:
void UpdateText(const std::string &text) { current_text_ = text; }
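// Move the current text into the list of finished sentences (with a
// timestamp), unless it is empty or just a single space.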
void FinalizeCurrentSentence() {
if (!current_text_.empty() &&
(current_text_[0] != ' ' || current_text_.size() > 1)) {
sentences_.push_back({GetCurrentDateTime(), std::move(current_text_)});
}
}
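// Clear the terminal and redraw all finished sentences followed by the
// sentence that is still being recognized.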
void Display() const {
if (!sentences_.empty() || !current_text_.empty()) {
ClearScreen();
}
printf("=== Speech Recognition with Next-gen Kaldi ===\n");
printf("------------------------------\n");
if (!sentences_.empty()) {
int32_t i = 1;
for (const auto &p : sentences_) {
printf("[%s] %d. %s\n", p.first.c_str(), i, p.second.c_str());
i += 1;
}
printf("------------------------------\n");
}
if (!current_text_.empty()) {
printf("Recognizing: %s\n", current_text_.c_str());
}
}
private:
static void ClearScreen() {
#ifdef _MSC_VER
auto ret = system("cls");
#else
auto ret = system("clear");
#endif
(void)ret;
}
static std::string GetCurrentDateTime() {
std::ostringstream os;
auto t = std::time(nullptr);
auto tm = std::localtime(&t);
os << std::put_time(tm, "%Y-%m-%d %H:%M:%S");
return os.str();
}
private:
std::vector<std::pair<std::string, std::string>> sentences_;
std::string current_text_;
};
} // namespace sherpa_onnx
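For orientation, here is a minimal usage sketch of SherpaDisplay. It is not part of the commit; it only assumes that the header above is on the include path, and the strings stand in for real recognition results:

// sketch.cc (hypothetical): drive SherpaDisplay the way the example binary does.
#include "sherpa-onnx/csrc/sherpa-display.h"

int main() {
  sherpa_onnx::SherpaDisplay display;

  // While a speech segment is being decoded, keep refreshing the partial text.
  display.UpdateText("hello");
  display.Display();
  display.UpdateText("hello world");
  display.Display();

  // When the VAD reports the end of the segment, move the text into the
  // time-stamped list of finished sentences and redraw.
  display.FinalizeCurrentSentence();
  display.Display();
  return 0;
}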

View File

@@ -0,0 +1,268 @@
// sherpa-onnx/csrc/sherpa-onnx-vad-microphone-simulated-streaming-asr.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <queue>
#include <string>
#include <vector>
#include "portaudio.h" // NOLINT
#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/microphone.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/resample.h"
#include "sherpa-onnx/csrc/sherpa-display.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include "sherpa-onnx/csrc/wave-writer.h"
std::queue<std::vector<float>> samples_queue;
std::condition_variable condition_variable;
std::mutex mutex;
bool stop = false;
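// Called by PortAudio on its audio thread: copy the captured frames into the
// shared queue and wake up the main loop.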
static int32_t RecordCallback(const void *input_buffer,
void * /*output_buffer*/,
unsigned long frames_per_buffer, // NOLINT
const PaStreamCallbackTimeInfo * /*time_info*/,
PaStreamCallbackFlags /*status_flags*/,
void * /*user_data*/) {
std::lock_guard<std::mutex> lock(mutex);
samples_queue.emplace(
reinterpret_cast<const float *>(input_buffer),
reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
condition_variable.notify_one();
return stop ? paComplete : paContinue;
}
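// Ctrl-C handler: signal both the audio callback and the main loop to stop.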
static void Handler(int32_t /*sig*/) {
stop = true;
condition_variable.notify_one();
fprintf(stdout, "\nCaught Ctrl + C. Exiting...\n");
}
int32_t main(int32_t argc, char *argv[]) {
signal(SIGINT, Handler);
const char *kUsageMessage = R"usage(
This program shows how to use a streaming VAD together with a non-streaming
ASR model in sherpa-onnx to simulate real-time (streaming) speech recognition.
(1) SenseVoice
cd /path/to/sherpa-onnx/build
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17.tar.bz2
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
./bin/sherpa-onnx-vad-microphone-simulated-streaming-asr \
--silero-vad-model=./silero_vad.onnx \
--sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17/model.int8.onnx \
--tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17/tokens.txt
(2) Parakeet TDT 0.6b v2
cd /path/to/sherpa-onnx/build
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
./bin/sherpa-onnx-vad-microphone-simulated-streaming-asr \
--silero-vad-model=./silero_vad.onnx \
--encoder=./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/encoder.int8.onnx \
--decoder=./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/decoder.int8.onnx \
--joiner=./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/joiner.int8.onnx \
--tokens=./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/tokens.txt
(3) Please refer to our documentation for more non-streaming ASR models,
e.g., zipformer, paraformer, whisper, etc.
Please first use ./bin/sherpa-onnx-offline to measure the RTF of the model.
A model with an RTF below 0.2 should work with this program.
)usage";
sherpa_onnx::ParseOptions po(kUsageMessage);
sherpa_onnx::VadModelConfig vad_config;
sherpa_onnx::OfflineRecognizerConfig asr_config;
vad_config.Register(&po);
asr_config.Register(&po);
int32_t user_device_index = -1;  // -1 means to use the default value
int32_t user_sample_rate = -1;   // -1 means to use the default value
po.Register("mic-device-index", &user_device_index,
"If provided, we use it to replace the default device index."
"You can use sherpa-onnx-pa-devs to list available devices");
po.Register("mic-sample-rate", &user_sample_rate,
"If provided, we use it to replace the default sample rate."
"You can use sherpa-onnx-pa-devs to list sample rate of "
"available devices");
if (argc == 1) {
po.PrintUsage();
exit(EXIT_FAILURE);
}
po.Read(argc, argv);
if (po.NumArgs() != 0) {
po.PrintUsage();
exit(EXIT_FAILURE);
}
fprintf(stdout, "%s\n", vad_config.ToString().c_str());
fprintf(stdout, "%s\n", asr_config.ToString().c_str());
if (!vad_config.Validate()) {
fprintf(stdout, "Errors in vad_config!\n");
return -1;
}
if (!asr_config.Validate()) {
fprintf(stdout, "Errors in asr_config!\n");
return -1;
}
fprintf(stdout, "Creating recognizer ...\n");
sherpa_onnx::OfflineRecognizer recognizer(asr_config);
fprintf(stdout, "Recognizer created!\n");
sherpa_onnx::Microphone mic;
int32_t device_index = Pa_GetDefaultInputDevice();
if (device_index == paNoDevice) {
fprintf(stdout, "No default input device found\n");
exit(EXIT_FAILURE);
}
if (user_device_index >= 0) {
fprintf(stdout, "Use specified device: %d\n", user_device_index);
device_index = user_device_index;
} else {
fprintf(stdout, "Use default device: %d\n", device_index);
}
mic.PrintDevices(device_index);
float mic_sample_rate = 16000;
if (user_sample_rate > 0) {
fprintf(stdout, "Use sample rate %d for mic\n", user_sample_rate);
mic_sample_rate = user_sample_rate;
}
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
nullptr)) {
fprintf(stdout, "Failed to open device %d\n", device_index);
exit(EXIT_FAILURE);
}
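// The VAD and the decoding pipeline below operate on 16 kHz audio; if the
// microphone runs at a different rate, resample before buffering.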
float sample_rate = 16000;
std::unique_ptr<sherpa_onnx::LinearResample> resampler;
if (mic_sample_rate != sample_rate) {
float min_freq = std::min(mic_sample_rate, sample_rate);
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
int32_t lowpass_filter_width = 6;
resampler = std::make_unique<sherpa_onnx::LinearResample>(
mic_sample_rate, sample_rate, lowpass_cutoff, lowpass_filter_width);
}
auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(vad_config);
int32_t window_size = vad_config.silero_vad.window_size;
int32_t offset = 0;
bool speech_started = false;
std::vector<float> buffer;
auto started_time = std::chrono::steady_clock::now();
sherpa_onnx::SherpaDisplay display;
fprintf(stdout, "Started. Please speak\n");
std::vector<float> resampled;
while (!stop) {
{
std::unique_lock<std::mutex> lock(mutex);
while (samples_queue.empty() && !stop) {
condition_variable.wait(lock);
}
if (stop) {
break;
}
const auto &s = samples_queue.front();
if (!resampler) {
buffer.insert(buffer.end(), s.begin(), s.end());
} else {
resampler->Resample(s.data(), s.size(), false, &resampled);
buffer.insert(buffer.end(), resampled.begin(), resampled.end());
}
samples_queue.pop();
}
for (; offset + window_size < buffer.size(); offset += window_size) {
vad->AcceptWaveform(buffer.data() + offset, window_size);
if (!speech_started && vad->IsSpeechDetected()) {
speech_started = true;
started_time = std::chrono::steady_clock::now();
}
}
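// No speech detected yet: keep only the most recent 10 windows so the
// buffer (and the later decoding latency) stays bounded.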
if (!speech_started) {
if (buffer.size() > 10 * window_size) {
offset -= buffer.size() - 10 * window_size;
buffer = {buffer.end() - 10 * window_size, buffer.end()};
}
}
auto current_time = std::chrono::steady_clock::now();
const float elapsed_seconds =
std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
started_time)
.count() /
1000.;
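// While speech is in progress, re-decode the whole accumulated segment
// roughly every 0.2 seconds with the non-streaming recognizer, so the
// partial result is refreshed as if the model were streaming.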
if (speech_started && elapsed_seconds > 0.2) {
auto s = recognizer.CreateStream();
s->AcceptWaveform(sample_rate, buffer.data(), buffer.size());
recognizer.DecodeStream(s.get());
const auto &result = s->GetResult();
display.UpdateText(result.text);
display.Display();
started_time = std::chrono::steady_clock::now();
}
while (!vad->Empty()) {
// This loop runs when the speaker stops and the VAD emits finished segments:
// finalize the current sentence and reset the state for the next one.
vad->Pop();
display.FinalizeCurrentSentence();
display.Display();
buffer.clear();
offset = 0;
speech_started = false;
}
}
return 0;
}