Add a C++ example for simulated streaming ASR (#2607)

Fangjun Kuang 2025-09-18 14:59:50 +08:00 committed by GitHub
parent 9102f34179
commit 86af28157b
19 changed files with 425 additions and 7 deletions

View File

@@ -66,12 +66,17 @@ jobs:
./gitcompile
popd
p=$PWD
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_INSTALL_PREFIX=./install \
-DSHERPA_ONNX_ENABLE_GPU=ON \
@@ -113,13 +118,18 @@ jobs:
./gitcompile
popd
p=$PWD
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_INSTALL_PREFIX=./install \
..
@@ -245,7 +255,7 @@ jobs:
file: sherpa-onnx-*linux-aarch64*.tar.bz2
# repo_name: k2-fsa/sherpa-onnx
# repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
# tag: v1.11.5
# tag: v1.12.13
- name: Test offline Moonshine
if: matrix.build_type != 'Debug'

View File

@@ -67,11 +67,16 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_INSTALL_PREFIX=./install \
..

View File

@@ -46,7 +46,15 @@ jobs:
echo "PWD"
ls -lh /project/alsa-lib/src/.libs
CIBW_ENVIRONMENT: CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs LD_LIBRARY_PATH=/project/build/bdist.linux-x86_64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR SHERPA_ONNX_MAKE_ARGS="VERBOSE=1" SHERPA_ONNX_ENABLE_ALSA=1 SHERPA_ONNX_ENABLE_GPU=ON
CIBW_ENVIRONMENT: >
CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH
C_INCLUDE_PATH=/project/alsa-lib/include:$C_INCLUDE_PATH
SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs
LD_LIBRARY_PATH=/project/build/bdist.linux-x86_64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR
SHERPA_ONNX_MAKE_ARGS="VERBOSE=1"
SHERPA_ONNX_ENABLE_ALSA=1
SHERPA_ONNX_ENABLE_GPU=ON
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_GPU=ON -DALSA_INCLUDE_DIR=/project/alsa-lib/include -DALSA_LIBRARY=/project/alsa-lib/src/.libs/libasound.so"
CIBW_BUILD: "${{ matrix.python-version}}-* "
CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686"
CIBW_BUILD_VERBOSITY: 3

View File

@@ -109,6 +109,7 @@ jobs:
strings $PWD/alsa-lib/src/.libs/libasound.so.2.0.0 | grep "^GLIBC"
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
export SHERPA_ONNX_RKNN_TOOLKIT2_PATH=$PWD/rknn-toolkit2
@@ -117,7 +118,7 @@ jobs:
export SHERPA_ONNX_ENABLE_ALSA=1
export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_RKNN=ON"
export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_RKNN=ON -DALSA_INCLUDE_DIR=$p/alsa-lib/include -DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so"
python3 setup.py bdist_wheel
mv dist wheelhouse

View File

@@ -326,12 +326,13 @@ jobs:
CIBW_ENVIRONMENT: >
SHERPA_ONNX_SPLIT_PYTHON_PACKAGE=ON
C_INCLUDE_PATH=/project/alsa-lib/include:$C_INCLUDE_PATH
CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH
SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs
LD_LIBRARY_PATH=/project/build/bdist.linux-aarch64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR
SHERPA_ONNX_MAKE_ARGS="VERBOSE=1"
SHERPA_ONNX_ENABLE_ALSA=1
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_BINARY=OFF -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF -DSHERPA_ONNX_ENABLE_C_API=OFF -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF"
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_BINARY=OFF -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF -DSHERPA_ONNX_ENABLE_C_API=OFF -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF -DALSA_INCLUDE_DIR=$p/alsa-lib/include -DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so"
CIBW_BUILD: "${{ matrix.python-version}}-* "
CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686"

View File

@@ -58,6 +58,7 @@ jobs:
shell: bash
run: |
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
export LD_LIBRARY_PATH=$SHERPA_ONNX_ALSA_LIB_DIR:$LD_LIBRARY_PATH
@@ -66,9 +67,11 @@ jobs:
echo "---"
ls -lh $PWD/alsa-lib/src/.libs
p=$PWD
export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1"
export SHERPA_ONNX_ENABLE_ALSA=1
export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_GPU=ON"
export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_GPU=ON -DALSA_INCLUDE_DIR=$p/alsa-lib/include -DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so"
onnxruntime_version=${{ matrix.onnxruntime_version }}
if [[ $onnxruntime_version == "1.22.0" ]]; then

View File

@@ -82,6 +82,7 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
mkdir build
@@ -326,11 +327,12 @@ jobs:
CIBW_ENVIRONMENT: >
SHERPA_ONNX_SPLIT_PYTHON_PACKAGE=ON
CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH
C_INCLUDE_PATH=/project/alsa-lib/include:$C_INCLUDE_PATH
SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs
LD_LIBRARY_PATH=/project/build/bdist.linux-x86_64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR
SHERPA_ONNX_MAKE_ARGS="VERBOSE=1"
SHERPA_ONNX_ENABLE_ALSA=1
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_BINARY=OFF -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF -DSHERPA_ONNX_ENABLE_C_API=OFF -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF"
SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_BINARY=OFF -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF -DSHERPA_ONNX_ENABLE_C_API=OFF -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF -DALSA_INCLUDE_DIR=$p/alsa-lib/include -DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so"
CIBW_BUILD: "${{ matrix.python-version}}-* "
CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686"

View File

@@ -82,12 +82,17 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-D CMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-D CMAKE_INSTALL_PREFIX=./install \
-D BUILD_SHARED_LIBS=ON \

View File

@@ -80,12 +80,16 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-D SHERPA_ONNX_ENABLE_TTS=ON \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_SHARED_LIBS=ON \

View File

@@ -113,12 +113,17 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-D SHERPA_ONNX_ENABLE_TTS=ON \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_SHARED_LIBS=ON \

View File

@@ -110,12 +110,17 @@ jobs:
popd
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
mkdir build
cd build
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-D SHERPA_ONNX_ENABLE_TTS=${{ matrix.with_tts }} \
-D CMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} \

View File

@@ -100,7 +100,9 @@ jobs:
strings $PWD/alsa-lib/src/.libs/libasound.so.2.0.0 | grep "^GLIBC"
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
p=$PWD
export SHERPA_ONNX_RKNN_TOOLKIT2_PATH=$PWD/rknn-toolkit2
export SHERPA_ONNX_RKNN_TOOLKIT2_LIB_DIR=$SHERPA_ONNX_RKNN_TOOLKIT2_PATH/rknpu2/runtime/Linux/librknn_api/aarch64
@@ -114,6 +116,8 @@ jobs:
BUILD_SHARED_LIBS=${{ matrix.shared }}
cmake \
-DALSA_INCLUDE_DIR=$p/alsa-lib/include \
-DALSA_LIBRARY=$p/alsa-lib/src/.libs/libasound.so \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_INSTALL_PREFIX=./install \
-DSHERPA_ONNX_ENABLE_RKNN=ON \

View File

@@ -310,7 +310,7 @@ if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME
Could not find alsa/asoundlib.h !
We won't build sherpa-onnx-alsa
To fix that, please do:
(1) sudo apt-get install alsa-utils libasound2-dev
(1) sudo apt-get install alsa-utils libasound2-dev pkg-config
(2) rm -rf build
(3) re-try
")

View File

@@ -72,6 +72,7 @@ if [ ! -f alsa-lib/src/.libs/libasound.so ]; then
fi
export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$PWD/alsa-lib/include:$C_INCLUDE_PATH
export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs
if [[ x"$BUILD_SHARED_LIBS" == x"" ]]; then

View File

@@ -73,9 +73,11 @@ def get_binaries():
"sherpa-onnx-vad",
"sherpa-onnx-vad-microphone",
"sherpa-onnx-vad-microphone-offline-asr",
"sherpa-onnx-vad-microphone-simulated-streaming-asr",
"sherpa-onnx-vad-with-offline-asr",
"sherpa-onnx-vad-with-online-asr",
"sherpa-onnx-version",
"sherpa-onnx-pa-devs",
]
if enable_alsa():

View File

@@ -29,6 +29,7 @@ function(download_portaudio)
# Always use static build
set(PA_BUILD_SHARED OFF CACHE BOOL "" FORCE)
set(PA_BUILD_STATIC ON CACHE BOOL "" FORCE)
set(PA_BUILD_EXAMPLES ON CACHE BOOL "" FORCE)
FetchContent_Declare(portaudio
URL
@@ -50,6 +51,15 @@ function(download_portaudio)
endif()
add_subdirectory(${portaudio_SOURCE_DIR} ${portaudio_BINARY_DIR} EXCLUDE_FROM_ALL)
if(CMAKE_SYSTEM_NAME STREQUAL Linux)
if(PA_USE_ALSA)
message(STATUS "portaudio with ALSA")
else()
message(STATUS "portaudio without ALSA")
endif()
endif()
set_target_properties(pa_devs PROPERTIES OUTPUT_NAME "sherpa-onnx-pa-devs")
set_target_properties(portaudio_static PROPERTIES OUTPUT_NAME "sherpa-onnx-portaudio_static")
if(NOT WIN32)
@@ -62,6 +72,11 @@ function(download_portaudio)
DESTINATION lib)
endif()
install(TARGETS
pa_devs
DESTINATION bin)
add_custom_target(build_pa_devs ALL DEPENDS pa_devs)
endfunction()
download_portaudio()

View File

@@ -518,6 +518,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
microphone.cc
)
add_executable(sherpa-onnx-vad-microphone-simulated-streaming-asr
sherpa-onnx-vad-microphone-simulated-streaming-asr.cc
microphone.cc
)
add_executable(sherpa-onnx-vad-with-offline-asr
sherpa-onnx-vad-with-offline-asr.cc
)
@@ -548,6 +553,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
sherpa-onnx-microphone-offline-audio-tagging
sherpa-onnx-microphone-offline-speaker-identification
sherpa-onnx-vad-microphone
sherpa-onnx-vad-microphone-simulated-streaming-asr
sherpa-onnx-vad-microphone-offline-asr
sherpa-onnx-vad-with-offline-asr
sherpa-onnx-vad-with-online-asr

View File

@@ -0,0 +1,73 @@
// sherpa-onnx/csrc/sherpa-display.h
//
// Copyright (c) 2025 Xiaomi Corporation
#pragma once
#include <stdio.h>   // for printf
#include <stdlib.h>  // for system
#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace sherpa_onnx {
class SherpaDisplay {
public:
void UpdateText(const std::string &text) { current_text_ = text; }
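// Move the current text into the list of finished sentences (with a
// timestamp), unless it is empty or just a single space.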
void FinalizeCurrentSentence() {
if (!current_text_.empty() &&
(current_text_[0] != ' ' || current_text_.size() > 1)) {
sentences_.push_back({GetCurrentDateTime(), std::move(current_text_)});
}
}
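// Clear the terminal and redraw all finished sentences followed by the
// sentence that is still being recognized.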
void Display() const {
if (!sentences_.empty() || !current_text_.empty()) {
ClearScreen();
}
printf("=== Speech Recognition with Next-gen Kaldi ===\n");
printf("------------------------------\n");
if (!sentences_.empty()) {
int32_t i = 1;
for (const auto &p : sentences_) {
printf("[%s] %d. %s\n", p.first.c_str(), i, p.second.c_str());
i += 1;
}
printf("------------------------------\n");
}
if (!current_text_.empty()) {
printf("Recognizing: %s\n", current_text_.c_str());
}
}
private:
static void ClearScreen() {
#ifdef _MSC_VER
auto ret = system("cls");
#else
auto ret = system("clear");
#endif
(void)ret;
}
static std::string GetCurrentDateTime() {
std::ostringstream os;
auto t = std::time(nullptr);
auto tm = std::localtime(&t);
os << std::put_time(tm, "%Y-%m-%d %H:%M:%S");
return os.str();
}
private:
std::vector<std::pair<std::string, std::string>> sentences_;
std::string current_text_;
};
} // namespace sherpa_onnx
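For orientation, here is a minimal usage sketch of SherpaDisplay. It is not part of the commit; it only assumes that the header above is on the include path, and the strings stand in for real recognition results:

// sketch.cc (hypothetical): drive SherpaDisplay the way the example binary does.
#include "sherpa-onnx/csrc/sherpa-display.h"

int main() {
  sherpa_onnx::SherpaDisplay display;

  // While a speech segment is being decoded, keep refreshing the partial text.
  display.UpdateText("hello");
  display.Display();
  display.UpdateText("hello world");
  display.Display();

  // When the VAD reports the end of the segment, move the text into the
  // time-stamped list of finished sentences and redraw.
  display.FinalizeCurrentSentence();
  display.Display();
  return 0;
}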

View File

@@ -0,0 +1,268 @@
// sherpa-onnx/csrc/sherpa-onnx-vad-microphone-simulated-streaming-asr.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <queue>
#include <string>
#include <vector>
#include "portaudio.h" // NOLINT
#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/microphone.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/resample.h"
#include "sherpa-onnx/csrc/sherpa-display.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include "sherpa-onnx/csrc/wave-writer.h"
std::queue<std::vector<float>> samples_queue;
std::condition_variable condition_variable;
std::mutex mutex;
bool stop = false;
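// Called by PortAudio on its audio thread: copy the captured frames into the
// shared queue and wake up the main loop.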
static int32_t RecordCallback(const void *input_buffer,
void * /*output_buffer*/,
unsigned long frames_per_buffer, // NOLINT
const PaStreamCallbackTimeInfo * /*time_info*/,
PaStreamCallbackFlags /*status_flags*/,
void * /*user_data*/) {
std::lock_guard<std::mutex> lock(mutex);
samples_queue.emplace(
reinterpret_cast<const float *>(input_buffer),
reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
condition_variable.notify_one();
return stop ? paComplete : paContinue;
}
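// Ctrl-C handler: signal both the audio callback and the main loop to stop.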
static void Handler(int32_t /*sig*/) {
stop = true;
condition_variable.notify_one();
fprintf(stdout, "\nCaught Ctrl + C. Exiting...\n");
}
int32_t main(int32_t argc, char *argv[]) {
signal(SIGINT, Handler);
const char *kUsageMessage = R"usage(
This program shows how to use a streaming VAD together with a non-streaming
ASR model in sherpa-onnx to simulate real-time (streaming) speech recognition.
(1) SenseVoice
cd /path/to/sherpa-onnx/build
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17.tar.bz2
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
./bin/sherpa-onnx-vad-microphone-simulated-streaming-asr \
--silero-vad-model=./silero_vad.onnx \
--sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17/model.int8.onnx \
--tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17/tokens.txt
(2) Parakeet TDT 0.6b v2
cd /path/to/sherpa-onnx/build
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
./bin/sherpa-onnx-vad-microphone-simulated-streaming-asr \
--silero-vad-model=./silero_vad.onnx \
--encoder=./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/encoder.int8.onnx \
--decoder=./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/decoder.int8.onnx \
--joiner=./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/joiner.int8.onnx \
--tokens=./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/tokens.txt
(3) Please refer to our documentation for more non-streaming ASR models,
e.g., zipformer, paraformer, whisper, etc.
Please first use ./bin/sherpa-onnx-offline to measure the RTF of the model.
A model with an RTF below 0.2 should work with this program.
)usage";
sherpa_onnx::ParseOptions po(kUsageMessage);
sherpa_onnx::VadModelConfig vad_config;
sherpa_onnx::OfflineRecognizerConfig asr_config;
vad_config.Register(&po);
asr_config.Register(&po);
int32_t user_device_index = -1;  // -1 means to use the default value
int32_t user_sample_rate = -1;   // -1 means to use the default value
po.Register("mic-device-index", &user_device_index,
"If provided, we use it to replace the default device index."
"You can use sherpa-onnx-pa-devs to list available devices");
po.Register("mic-sample-rate", &user_sample_rate,
"If provided, we use it to replace the default sample rate."
"You can use sherpa-onnx-pa-devs to list sample rate of "
"available devices");
if (argc == 1) {
po.PrintUsage();
exit(EXIT_FAILURE);
}
po.Read(argc, argv);
if (po.NumArgs() != 0) {
po.PrintUsage();
exit(EXIT_FAILURE);
}
fprintf(stdout, "%s\n", vad_config.ToString().c_str());
fprintf(stdout, "%s\n", asr_config.ToString().c_str());
if (!vad_config.Validate()) {
fprintf(stdout, "Errors in vad_config!\n");
return -1;
}
if (!asr_config.Validate()) {
fprintf(stdout, "Errors in asr_config!\n");
return -1;
}
fprintf(stdout, "Creating recognizer ...\n");
sherpa_onnx::OfflineRecognizer recognizer(asr_config);
fprintf(stdout, "Recognizer created!\n");
sherpa_onnx::Microphone mic;
int32_t device_index = Pa_GetDefaultInputDevice();
if (device_index == paNoDevice) {
fprintf(stdout, "No default input device found\n");
exit(EXIT_FAILURE);
}
if (user_device_index >= 0) {
fprintf(stdout, "Use specified device: %d\n", user_device_index);
device_index = user_device_index;
} else {
fprintf(stdout, "Use default device: %d\n", device_index);
}
mic.PrintDevices(device_index);
float mic_sample_rate = 16000;
if (user_sample_rate > 0) {
fprintf(stdout, "Use sample rate %d for mic\n", user_sample_rate);
mic_sample_rate = user_sample_rate;
}
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
nullptr)) {
fprintf(stdout, "Failed to open device %d\n", device_index);
exit(EXIT_FAILURE);
}
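// The VAD and the decoding pipeline below operate on 16 kHz audio; if the
// microphone runs at a different rate, resample before buffering.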
float sample_rate = 16000;
std::unique_ptr<sherpa_onnx::LinearResample> resampler;
if (mic_sample_rate != sample_rate) {
float min_freq = std::min(mic_sample_rate, sample_rate);
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
int32_t lowpass_filter_width = 6;
resampler = std::make_unique<sherpa_onnx::LinearResample>(
mic_sample_rate, sample_rate, lowpass_cutoff, lowpass_filter_width);
}
auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(vad_config);
int32_t window_size = vad_config.silero_vad.window_size;
int32_t offset = 0;
bool speech_started = false;
std::vector<float> buffer;
auto started_time = std::chrono::steady_clock::now();
sherpa_onnx::SherpaDisplay display;
fprintf(stdout, "Started. Please speak\n");
std::vector<float> resampled;
while (!stop) {
{
std::unique_lock<std::mutex> lock(mutex);
while (samples_queue.empty() && !stop) {
condition_variable.wait(lock);
}
if (stop) {
break;
}
const auto &s = samples_queue.front();
if (!resampler) {
buffer.insert(buffer.end(), s.begin(), s.end());
} else {
resampler->Resample(s.data(), s.size(), false, &resampled);
buffer.insert(buffer.end(), resampled.begin(), resampled.end());
}
samples_queue.pop();
}
for (; offset + window_size < buffer.size(); offset += window_size) {
vad->AcceptWaveform(buffer.data() + offset, window_size);
if (!speech_started && vad->IsSpeechDetected()) {
speech_started = true;
started_time = std::chrono::steady_clock::now();
}
}
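// No speech detected yet: keep only the most recent 10 windows so the
// buffer (and the later decoding latency) stays bounded.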
if (!speech_started) {
if (buffer.size() > 10 * window_size) {
offset -= buffer.size() - 10 * window_size;
buffer = {buffer.end() - 10 * window_size, buffer.end()};
}
}
auto current_time = std::chrono::steady_clock::now();
const float elapsed_seconds =
std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
started_time)
.count() /
1000.;
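// While speech is in progress, re-decode the whole accumulated segment
// roughly every 0.2 seconds with the non-streaming recognizer, so the
// partial result is refreshed as if the model were streaming.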
if (speech_started && elapsed_seconds > 0.2) {
auto s = recognizer.CreateStream();
s->AcceptWaveform(sample_rate, buffer.data(), buffer.size());
recognizer.DecodeStream(s.get());
const auto &result = s->GetResult();
display.UpdateText(result.text);
display.Display();
started_time = std::chrono::steady_clock::now();
}
while (!vad->Empty()) {
// This loop runs when the speaker stops and the VAD emits finished segments:
// finalize the current sentence and reset the state for the next one.
vad->Pop();
display.FinalizeCurrentSentence();
display.Display();
buffer.clear();
offset = 0;
speech_started = false;
}
}
return 0;
}