Add Swift API for MatchaTTS models. (#1684)

2026-01-09 07:41:06 +08:00 · 2025-01-06 07:23:45 +08:00 · 2025-01-06 07:23:45 +08:00 · 6f085babcc
commit 6f085babcc
parent 1fe5fe495f
12 changed files with 271 additions and 18 deletions
--- a/.github/scripts/test-swift.sh
+++ b/.github/scripts/test-swift.sh
@ -7,6 +7,18 @@ echo "pwd: $PWD"
 cd swift-api-examples
 ls -lh

+./run-tts-vits.sh
+ls -lh
+rm -rf vits-piper-*
+
+./run-tts-matcha-zh.sh
+ls -lh
+rm -rf matcha-icefall-*
+
+./run-tts-matcha-en.sh
+ls -lh
+rm -rf matcha-icefall-*
+
 ./run-speaker-diarization.sh
 rm -rf *.onnx
 rm -rf sherpa-onnx-pyannote-segmentation-3-0
@ -38,8 +50,9 @@ popd
 ls -lh /Users/fangjun/Desktop
 cat /Users/fangjun/Desktop/Obama.srt

-./run-tts.sh
-ls -lh
+rm -rf sherpa-onnx-whisper*
+rm -f *.onnx
+rm /Users/fangjun/Desktop/Obama.wav

 ./run-decode-file.sh
 rm decode-file
@ -48,5 +61,4 @@ sed -i.bak  '20d' ./decode-file.swift

 ./run-decode-file-non-streaming.sh

-
 ls -lh
--- a/java-api-examples/run-non-streaming-tts-matcha-en.sh
+++ b/java-api-examples/run-non-streaming-tts-matcha-en.sh
@ -31,7 +31,7 @@ fi
 # to download more models
 if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
-  tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
 fi

--- a/nodejs-addon-examples/README.md
+++ b/nodejs-addon-examples/README.md
@ -350,7 +350,7 @@ node ./test_vad_asr_non_streaming_sense_voice_microphone.js
 ### Text-to-speech with MatchaTTS models (English TTS)
 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
-tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+tar xf matcha-icefall-en_US-ljspeech.tar.bz2
 rm matcha-icefall-en_US-ljspeech.tar.bz2

 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
--- a/nodejs-examples/README.md
+++ b/nodejs-examples/README.md
@ -70,7 +70,7 @@ You can use the following command to run it:

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
-tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+tar xf matcha-icefall-en_US-ljspeech.tar.bz2
 rm matcha-icefall-en_US-ljspeech.tar.bz2

 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
--- a/swift-api-examples/.gitignore
+++ b/swift-api-examples/.gitignore
@ -2,7 +2,7 @@ decode-file
 decode-file-non-streaming
 generate-subtitles
 spoken-language-identification
-tts
+tts-vits
 vits-vctk
 sherpa-onnx-paraformer-zh-2023-09-14
 !*.sh
@ -10,3 +10,5 @@ sherpa-onnx-paraformer-zh-2023-09-14
 streaming-hlg-decode-file
 keyword-spotting-from-file
 add-punctuations
+tts-matcha-zh
+tts-matcha-en
--- a/swift-api-examples/SherpaOnnx.swift
+++ b/swift-api-examples/SherpaOnnx.swift
@ -719,9 +719,9 @@ class SherpaOnnxVoiceActivityDetectorWrapper {

 // offline tts
 func sherpaOnnxOfflineTtsVitsModelConfig(
-  model: String,
-  lexicon: String,
-  tokens: String,
+  model: String = "",
+  lexicon: String = "",
+  tokens: String = "",
  dataDir: String = "",
  noiseScale: Float = 0.667,
  noiseScaleW: Float = 0.8,
@ -739,8 +739,30 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
    dict_dir: toCPointer(dictDir))
 }

+func sherpaOnnxOfflineTtsMatchaModelConfig(
+  acousticModel: String = "",
+  vocoder: String = "",
+  lexicon: String = "",
+  tokens: String = "",
+  dataDir: String = "",
+  noiseScale: Float = 0.667,
+  lengthScale: Float = 1.0,
+  dictDir: String = ""
+) -> SherpaOnnxOfflineTtsMatchaModelConfig {
+  return SherpaOnnxOfflineTtsMatchaModelConfig(
+    acoustic_model: toCPointer(acousticModel),
+    vocoder: toCPointer(vocoder),
+    lexicon: toCPointer(lexicon),
+    tokens: toCPointer(tokens),
+    data_dir: toCPointer(dataDir),
+    noise_scale: noiseScale,
+    length_scale: lengthScale,
+    dict_dir: toCPointer(dictDir))
+}
+
 func sherpaOnnxOfflineTtsModelConfig(
-  vits: SherpaOnnxOfflineTtsVitsModelConfig,
+  vits: SherpaOnnxOfflineTtsVitsModelConfig = sherpaOnnxOfflineTtsVitsModelConfig(),
+  matcha: SherpaOnnxOfflineTtsMatchaModelConfig = sherpaOnnxOfflineTtsMatchaModelConfig(),
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
@ -749,7 +771,8 @@ func sherpaOnnxOfflineTtsModelConfig(
    vits: vits,
    num_threads: Int32(numThreads),
    debug: Int32(debug),
-    provider: toCPointer(provider)
+    provider: toCPointer(provider),
+    matcha: matcha
  )
 }

--- a/swift-api-examples/run-tts-matcha-en.sh
+++ b/swift-api-examples/run-tts-matcha-en.sh
@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -d ../build-swift-macos ]; then
+  echo "Please run ../build-swift-macos.sh first!"
+  exit 1
+fi
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+# to download more models
+if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
+  rm matcha-icefall-en_US-ljspeech.tar.bz2
+fi
+
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+if [ ! -e ./tts-matcha-en ]; then  # build only when this example's own binary is missing
+  # Note: We use -lc++ to link against libc++ instead of libstdc++
+  swiftc \
+    -lc++ \
+    -I ../build-swift-macos/install/include \
+    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
+    ./tts-matcha-en.swift  ./SherpaOnnx.swift \
+    -L ../build-swift-macos/install/lib/ \
+    -l sherpa-onnx \
+    -l onnxruntime \
+    -o tts-matcha-en
+
+  strip tts-matcha-en
+else
+  echo "./tts-matcha-en exists - skip building"
+fi
+
+export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
+./tts-matcha-en
--- a/swift-api-examples/run-tts-matcha-zh.sh
+++ b/swift-api-examples/run-tts-matcha-zh.sh
@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -d ../build-swift-macos ]; then
+  echo "Please run ../build-swift-macos.sh first!"
+  exit 1
+fi
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+# to download more models
+if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then  # download the model only once
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+  tar xf matcha-icefall-zh-baker.tar.bz2  # quiet extraction, same as run-tts-matcha-en.sh
+  rm matcha-icefall-zh-baker.tar.bz2
+fi
+
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+if [ ! -e ./tts-matcha-zh ]; then  # build only when this example's own binary is missing
+  # Note: We use -lc++ to link against libc++ instead of libstdc++
+  swiftc \
+    -lc++ \
+    -I ../build-swift-macos/install/include \
+    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
+    ./tts-matcha-zh.swift  ./SherpaOnnx.swift \
+    -L ../build-swift-macos/install/lib/ \
+    -l sherpa-onnx \
+    -l onnxruntime \
+    -o tts-matcha-zh
+
+  strip tts-matcha-zh
+else
+  echo "./tts-matcha-zh exists - skip building"
+fi
+
+export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
+./tts-matcha-zh
--- a/swift-api-examples/run-tts-vits.sh
+++ b/swift-api-examples/run-tts-vits.sh
@ -21,16 +21,16 @@ if [ ! -e ./tts ]; then
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
-    ./tts.swift  ./SherpaOnnx.swift \
+    ./tts-vits.swift  ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
-    -o tts
+    -o tts-vits

-  strip tts
+  strip tts-vits
 else
-  echo "./tts exists - skip building"
+  echo "./tts-vits exists - skip building"
 fi

 export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
-./tts
+./tts-vits
--- a/swift-api-examples/tts-matcha-en.swift
+++ b/swift-api-examples/tts-matcha-en.swift
@ -0,0 +1,65 @@
+class MyClass {
+  func playSamples(samples: [Float]) {
+    print("Play \(samples.count) samples")
+  }
+}
+
+func run() {
+  let acousticModel = "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx"
+  let vocoder = "./hifigan_v2.onnx"
+  let tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt"
+  let dataDir = "./matcha-icefall-en_US-ljspeech/espeak-ng-data"
+  let matcha = sherpaOnnxOfflineTtsMatchaModelConfig(
+    acousticModel: acousticModel,
+    vocoder: vocoder,
+    tokens: tokens,
+    dataDir: dataDir
+  )
+  let modelConfig = sherpaOnnxOfflineTtsModelConfig(matcha: matcha, debug: 0)
+  var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig)
+
+  let myClass = MyClass()
+
+  // We use Unretained here so myClass must be kept alive as the callback is invoked
+  //
+  // See also
+  // https://medium.com/codex/swift-c-callback-interoperability-6d57da6c8ee6
+  let arg = Unmanaged<MyClass>.passUnretained(myClass).toOpaque()
+
+  let callback: TtsCallbackWithArg = { samples, n, arg in
+    let o = Unmanaged<MyClass>.fromOpaque(arg!).takeUnretainedValue()
+    var savedSamples: [Float] = []
+    for index in 0..<n {
+      savedSamples.append(samples![Int(index)])
+    }
+
+    o.playSamples(samples: savedSamples)
+
+    // return 1 so that it continues generating
+    return 1
+  }
+
+  let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig)
+
+  let text =
+    "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
+  let sid = 0
+  let speed: Float = 1.0
+
+  let audio = tts.generateWithCallbackWithArg(
+    text: text, callback: callback, arg: arg, sid: sid, speed: speed)
+  let filename = "test-matcha-en.wav"
+  let ok = audio.save(filename: filename)
+  if ok == 1 {
+    print("\nSaved to:\(filename)")
+  } else {
+    print("Failed to save to \(filename)")
+  }
+}
+
+@main
+struct App {
+  static func main() {
+    run()
+  }
+}
--- a/swift-api-examples/tts-matcha-zh.swift
+++ b/swift-api-examples/tts-matcha-zh.swift
@ -0,0 +1,68 @@
+class MyClass {
+  func playSamples(samples: [Float]) {
+    print("Play \(samples.count) samples")
+  }
+}
+
+func run() {
+  let acousticModel = "./matcha-icefall-zh-baker/model-steps-3.onnx"
+  let vocoder = "./hifigan_v2.onnx"
+  let lexicon = "./matcha-icefall-zh-baker/lexicon.txt"
+  let tokens = "./matcha-icefall-zh-baker/tokens.txt"
+  let dictDir = "./matcha-icefall-zh-baker/dict"
+  let ruleFsts =
+    "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst"
+  let matcha = sherpaOnnxOfflineTtsMatchaModelConfig(
+    acousticModel: acousticModel,
+    vocoder: vocoder,
+    lexicon: lexicon,
+    tokens: tokens,
+    dictDir: dictDir
+  )
+  let modelConfig = sherpaOnnxOfflineTtsModelConfig(matcha: matcha, debug: 0)
+  var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig, ruleFsts: ruleFsts)
+
+  let myClass = MyClass()
+
+  // We use Unretained here so myClass must be kept alive as the callback is invoked
+  //
+  // See also
+  // https://medium.com/codex/swift-c-callback-interoperability-6d57da6c8ee6
+  let arg = Unmanaged<MyClass>.passUnretained(myClass).toOpaque()
+
+  let callback: TtsCallbackWithArg = { samples, n, arg in
+    let o = Unmanaged<MyClass>.fromOpaque(arg!).takeUnretainedValue()
+    var savedSamples: [Float] = []
+    for index in 0..<n {
+      savedSamples.append(samples![Int(index)])
+    }
+
+    o.playSamples(samples: savedSamples)
+
+    // return 1 so that it continues generating
+    return 1
+  }
+
+  let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig)
+
+  let text = "某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。"
+  let sid = 0
+  let speed: Float = 1.0
+
+  let audio = tts.generateWithCallbackWithArg(
+    text: text, callback: callback, arg: arg, sid: sid, speed: speed)
+  let filename = "test-matcha-zh.wav"
+  let ok = audio.save(filename: filename)
+  if ok == 1 {
+    print("\nSaved to:\(filename)")
+  } else {
+    print("Failed to save to \(filename)")
+  }
+}
+
+@main
+struct App {
+  static func main() {
+    run()
+  }
+}
--- a/swift-api-examples/tts-vits.swift
+++ b/swift-api-examples/tts-vits.swift
@ -47,7 +47,7 @@ func run() {

  let audio = tts.generateWithCallbackWithArg(
    text: text, callback: callback, arg: arg, sid: sid, speed: speed)
-  let filename = "test.wav"
+  let filename = "test-vits-en.wav"
  let ok = audio.save(filename: filename)
  if ok == 1 {
    print("\nSaved to:\(filename)")