Generate tts samples for MatchaTTS (English). (#2527)

2026-01-09 07:41:06 +08:00 · 2025-08-25 16:04:50 +08:00 · 2025-08-25 16:04:50 +08:00 · f1f8149a47
commit f1f8149a47
parent 4694d675bd
3 changed files with 75 additions and 3 deletions
--- a/.github/workflows/generate-tts-samples.yaml
+++ b/.github/workflows/generate-tts-samples.yaml
@@ -32,7 +32,7 @@ jobs:
          pip install "numpy<=1.26.4" sherpa-onnx soundfile

      - name: kitten
-        if: true
+        if: false
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -68,3 +68,37 @@ jobs:
          git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
          popd
          rm -rf hf
+
+      - name: matcha en (ljspeech)
+        if: true
+        shell: bash
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git config --global user.email "csukuangfj@gmail.com"
+          git config --global user.name "Fangjun Kuang"
+
+          cd scripts/matcha-tts/en/
+          pwd=$PWD
+
+          export GIT_LFS_SKIP_SMUDGE=1
+          export GIT_CLONE_PROTECTION_ACTIVE=false
+          git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
+
+          mkdir -p ./hf/matcha/icefall-en-ljspeech/mp3
+          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+          tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+          rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
+
+          python3 ./generate_samples.py
+
+          pushd hf
+          git pull
+          git add .
+          git commit -m 'add matcha tts en (ljspeech) samples'
+          git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
+          popd
+
+          rm -rf hf
--- a/scripts/matcha-tts/en/generate_samples.py
+++ b/scripts/matcha-tts/en/generate_samples.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+"""
+Generate samples for
+https://k2-fsa.github.io/sherpa/onnx/tts/all/
+"""
+
+
+import sherpa_onnx
+import soundfile as sf
+
+config = sherpa_onnx.OfflineTtsConfig(
+    model=sherpa_onnx.OfflineTtsModelConfig(
+        matcha=sherpa_onnx.OfflineTtsMatchaModelConfig(
+            acoustic_model="matcha-icefall-en_US-ljspeech/model-steps-3.onnx",
+            vocoder="vocos-22khz-univ.onnx",
+            tokens="matcha-icefall-en_US-ljspeech/tokens.txt",
+            lexicon="",
+            data_dir="matcha-icefall-en_US-ljspeech/espeak-ng-data",
+        ),
+        num_threads=2,
+    ),
+    max_num_sentences=1,
+)
+
+if not config.validate():
+    raise ValueError("Please check your config")
+
+tts = sherpa_onnx.OfflineTts(config)
+text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
+
+audio = tts.generate(text, sid=0, speed=1.0)
+
+sf.write(
+    "./hf/matcha/icefall-en-ljspeech/mp3/0.mp3",
+    audio.samples,
+    samplerate=audio.sample_rate,
+)
--- a/sherpa-onnx/python/csrc/offline-tts-matcha-model-config.cc
+++ b/sherpa-onnx/python/csrc/offline-tts-matcha-model-config.cc
@@ -18,8 +18,8 @@ void PybindOfflineTtsMatchaModelConfig(py::module *m) {
      .def(py::init<const std::string &, const std::string &,
                    const std::string &, const std::string &,
                    const std::string &, const std::string &, float, float>(),
-           py::arg("acoustic_model"), py::arg("vocoder"), py::arg("lexicon"),
-           py::arg("tokens"), py::arg("data_dir") = "",
+           py::arg("acoustic_model"), py::arg("vocoder"),
+           py::arg("lexicon") = "", py::arg("tokens"), py::arg("data_dir") = "",
           py::arg("dict_dir") = "", py::arg("noise_scale") = 1.0,
           py::arg("length_scale") = 1.0)
      .def_readwrite("acoustic_model", &PyClass::acoustic_model)