mirror of
https://github.com/k2-fsa/sherpa-onnx.git
synced 2026-01-09 07:41:06 +08:00
Export https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3 to sherpa-onnx (#2500)
This PR adds support for the newer version (v3) of NVIDIA's parakeet-tdt-0.6b model by exporting it to sherpa-onnx format. The v3 model supports 25 languages, maintaining the same usage pattern as v2 but with improved language coverage.
This commit is contained in:
parent
4dfb39c509
commit
091e6ff695
@ -1,4 +1,4 @@
|
||||
name: export-nemo-parakeet-tdt-0.6b-v2
|
||||
name: export-nemo-parakeet-tdt-0.6b
|
||||
|
||||
on:
|
||||
push:
|
||||
@ -10,80 +10,110 @@ concurrency:
|
||||
group: export-nemo-parakeet-tdt-0.6b-v2-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
HF_HUB_ENABLE_HF_TRANSFER: "0"
|
||||
|
||||
jobs:
|
||||
export-nemo-parakeet-tdt-0_6b-v2:
|
||||
export-nemo-parakeet-tdt-0_6b:
|
||||
if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
|
||||
name: parakeet tdt 0.6b v2
|
||||
name: parakeet tdt 0.6b ${{ matrix.version }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [macos-latest]
|
||||
python-version: ["3.10"]
|
||||
version: ["v2", "v3"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Show disk space
|
||||
run: |
|
||||
df -h
|
||||
|
||||
# See https://github.com/vlayer-xyz/vlayer/pull/543/files
|
||||
# Free up disk space as the macOS runners end up using most for Xcode
|
||||
# versions we don't need and for iOS simulators.
|
||||
- name: Free up disk space
|
||||
run: |
|
||||
echo '*** Delete iOS simulators and their caches'
|
||||
xcrun simctl delete all
|
||||
sudo rm -rf ~/Library/Developer/CoreSimulator/Caches/*
|
||||
|
||||
- name: Show disk space
|
||||
run: |
|
||||
df -h
|
||||
|
||||
- name: Setup Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Run
|
||||
- name: Run ${{ matrix.version }}
|
||||
if: matrix.version == 'v2'
|
||||
shell: bash
|
||||
run: |
|
||||
cd scripts/nemo/parakeet-tdt-0.6b-v2
|
||||
./run.sh
|
||||
|
||||
ls -lh *.onnx
|
||||
ls -lh *.weights
|
||||
|
||||
mv -v *.onnx ../../..
|
||||
mv -v *.weights ../../..
|
||||
mv -v tokens.txt ../../..
|
||||
mv 2086-149220-0033.wav ../../../0.wav
|
||||
|
||||
- name: Run ${{ matrix.version }}
|
||||
if: matrix.version == 'v3'
|
||||
shell: bash
|
||||
run: |
|
||||
cd scripts/nemo/parakeet-tdt-0.6b-v3
|
||||
./run.sh
|
||||
|
||||
ls -lh *.onnx
|
||||
mv -v *.onnx ../../..
|
||||
mv -v *.weights ../../..
|
||||
mv -v tokens.txt ../../..
|
||||
mv *.wav ../../../
|
||||
|
||||
- name: Collect files (fp32)
|
||||
shell: bash
|
||||
run: |
|
||||
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
|
||||
version=${{ matrix.version }}
|
||||
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-$version
|
||||
mkdir -p $d
|
||||
cp encoder.int8.onnx $d
|
||||
cp decoder.onnx $d
|
||||
cp joiner.onnx $d
|
||||
cp tokens.txt $d
|
||||
cp -v encoder.onnx $d
|
||||
cp -v encoder.weights $d
|
||||
cp -v decoder.onnx $d
|
||||
cp -v joiner.onnx $d
|
||||
cp -v tokens.txt $d
|
||||
|
||||
mkdir $d/test_wavs
|
||||
cp 0.wav $d/test_wavs
|
||||
cp -v *.wav $d/test_wavs
|
||||
|
||||
tar cjfv $d.tar.bz2 $d
|
||||
# tar cjfv $d.tar.bz2 $d
|
||||
|
||||
# ls -lh *.tar.bz2
|
||||
|
||||
- name: Collect files (int8)
|
||||
shell: bash
|
||||
run: |
|
||||
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
|
||||
version=${{ matrix.version }}
|
||||
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-$version-int8
|
||||
mkdir -p $d
|
||||
cp encoder.int8.onnx $d
|
||||
cp decoder.int8.onnx $d
|
||||
cp joiner.int8.onnx $d
|
||||
cp tokens.txt $d
|
||||
cp -v encoder.int8.onnx $d
|
||||
cp -v decoder.int8.onnx $d
|
||||
cp -v joiner.int8.onnx $d
|
||||
cp -v tokens.txt $d
|
||||
|
||||
mkdir $d/test_wavs
|
||||
cp 0.wav $d/test_wavs
|
||||
cp -v *.wav $d/test_wavs
|
||||
|
||||
tar cjfv $d.tar.bz2 $d
|
||||
|
||||
- name: Collect files (fp16)
|
||||
shell: bash
|
||||
run: |
|
||||
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
|
||||
mkdir -p $d
|
||||
cp encoder.fp16.onnx $d
|
||||
cp decoder.fp16.onnx $d
|
||||
cp joiner.fp16.onnx $d
|
||||
cp tokens.txt $d
|
||||
|
||||
mkdir $d/test_wavs
|
||||
cp 0.wav $d/test_wavs
|
||||
|
||||
tar cjfv $d.tar.bz2 $d
|
||||
ls -lh *.tar.bz2
|
||||
|
||||
- name: Publish to huggingface
|
||||
env:
|
||||
@ -94,13 +124,13 @@ jobs:
|
||||
timeout_seconds: 200
|
||||
shell: bash
|
||||
command: |
|
||||
version=${{ matrix.version }}
|
||||
git config --global user.email "csukuangfj@gmail.com"
|
||||
git config --global user.name "Fangjun Kuang"
|
||||
|
||||
models=(
|
||||
sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
|
||||
sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
|
||||
sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
|
||||
sherpa-onnx-nemo-parakeet-tdt-0.6b-$version
|
||||
sherpa-onnx-nemo-parakeet-tdt-0.6b-$version-int8
|
||||
)
|
||||
|
||||
for m in ${models[@]}; do
|
||||
@ -112,6 +142,7 @@ jobs:
|
||||
cd huggingface
|
||||
git lfs track "*.onnx"
|
||||
git lfs track "*.wav"
|
||||
git lfs track "*.weights"
|
||||
git status
|
||||
git add .
|
||||
git status
|
||||
|
||||
@ -678,6 +678,22 @@ def get_models():
|
||||
|
||||
ls -lh
|
||||
|
||||
popd
|
||||
""",
|
||||
),
|
||||
Model(
|
||||
model_name="sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8",
|
||||
idx=40,
|
||||
lang="multi",
|
||||
lang2="25_languages",
|
||||
short_name="parakeet_tdt_0.6b_v3",
|
||||
cmd="""
|
||||
pushd $model_name
|
||||
|
||||
rm -rfv test_wavs
|
||||
|
||||
ls -lh
|
||||
|
||||
popd
|
||||
""",
|
||||
),
|
||||
|
||||
@ -1,34 +1,16 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
import os
|
||||
|
||||
import nemo.collections.asr as nemo_asr
|
||||
import onnx
|
||||
import onnxmltools
|
||||
import torch
|
||||
from onnxmltools.utils.float16_converter import (
|
||||
convert_float_to_float16,
|
||||
convert_float_to_float16_model_path,
|
||||
)
|
||||
from onnxruntime.quantization import QuantType, quantize_dynamic
|
||||
|
||||
|
||||
def export_onnx_fp16(onnx_fp32_path, onnx_fp16_path):
|
||||
onnx_fp32_model = onnxmltools.utils.load_model(onnx_fp32_path)
|
||||
onnx_fp16_model = convert_float_to_float16(onnx_fp32_model, keep_io_types=True)
|
||||
onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path)
|
||||
|
||||
|
||||
def export_onnx_fp16_large_2gb(onnx_fp32_path, onnx_fp16_path):
|
||||
onnx_fp16_model = convert_float_to_float16_model_path(
|
||||
onnx_fp32_path, keep_io_types=True
|
||||
)
|
||||
onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path)
|
||||
|
||||
|
||||
def add_meta_data(filename: str, meta_data: Dict[str, str]):
|
||||
"""Add meta data to an ONNX model. It is changed in-place.
|
||||
|
||||
@ -47,14 +29,29 @@ def add_meta_data(filename: str, meta_data: Dict[str, str]):
|
||||
meta.key = key
|
||||
meta.value = str(value)
|
||||
|
||||
onnx.save(model, filename)
|
||||
if filename == "encoder.onnx":
|
||||
external_filename = "encoder"
|
||||
onnx.save(
|
||||
model,
|
||||
filename,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=True,
|
||||
location=external_filename + ".weights",
|
||||
)
|
||||
else:
|
||||
onnx.save(model, filename)
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def main():
|
||||
asr_model = nemo_asr.models.ASRModel.from_pretrained(
|
||||
model_name="nvidia/parakeet-tdt-0.6b-v2"
|
||||
)
|
||||
if Path("./parakeet-tdt-0.6b-v2.nemo").is_file():
|
||||
asr_model = nemo_asr.models.ASRModel.restore_from(
|
||||
restore_path="./parakeet-tdt-0.6b-v2.nemo"
|
||||
)
|
||||
else:
|
||||
asr_model = nemo_asr.models.ASRModel.from_pretrained(
|
||||
model_name="nvidia/parakeet-tdt-0.6b-v2"
|
||||
)
|
||||
|
||||
asr_model.eval()
|
||||
|
||||
@ -95,13 +92,8 @@ def main():
|
||||
)
|
||||
os.system("ls -lh *.onnx")
|
||||
|
||||
if m == "encoder":
|
||||
export_onnx_fp16_large_2gb(f"{m}.onnx", f"{m}.fp16.onnx")
|
||||
else:
|
||||
export_onnx_fp16(f"{m}.onnx", f"{m}.fp16.onnx")
|
||||
|
||||
add_meta_data("encoder.int8.onnx", meta_data)
|
||||
add_meta_data("encoder.fp16.onnx", meta_data)
|
||||
add_meta_data("encoder.onnx", meta_data)
|
||||
print("meta_data", meta_data)
|
||||
|
||||
|
||||
|
||||
@ -9,8 +9,9 @@ log() {
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
|
||||
curl -SL -O https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2/resolve/main/parakeet-tdt-0.6b-v2.nemo
|
||||
|
||||
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
|
||||
|
||||
|
||||
pip install \
|
||||
@ -20,7 +21,6 @@ pip install \
|
||||
kaldi-native-fbank \
|
||||
librosa \
|
||||
onnx==1.17.0 \
|
||||
onnxmltools \
|
||||
onnxruntime==1.17.1 \
|
||||
soundfile
|
||||
|
||||
@ -42,11 +42,3 @@ python3 ./test_onnx.py \
|
||||
--joiner ./joiner.int8.onnx \
|
||||
--tokens ./tokens.txt \
|
||||
--wav 2086-149220-0033.wav
|
||||
|
||||
echo "---fp16----"
|
||||
python3 ./test_onnx.py \
|
||||
--encoder ./encoder.fp16.onnx \
|
||||
--decoder ./decoder.fp16.onnx \
|
||||
--joiner ./joiner.fp16.onnx \
|
||||
--tokens ./tokens.txt \
|
||||
--wav 2086-149220-0033.wav
|
||||
|
||||
101
scripts/nemo/parakeet-tdt-0.6b-v3/export_onnx.py
Executable file
101
scripts/nemo/parakeet-tdt-0.6b-v3/export_onnx.py
Executable file
@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
import os
|
||||
|
||||
import nemo.collections.asr as nemo_asr
|
||||
import onnx
|
||||
import torch
|
||||
from onnxruntime.quantization import QuantType, quantize_dynamic
|
||||
|
||||
|
||||
def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Replace the metadata stored in an ONNX model file.

    The model is loaded from ``filename``, every existing metadata entry is
    removed, the entries from ``meta_data`` are added, and the model is
    written back to the same ``filename``.

    Args:
      filename:
        Filename of the ONNX model to rewrite.
      meta_data:
        Key-value pairs to store. Each value is converted with ``str()``.
    """
    model = onnx.load(filename)

    # Start from a clean slate: drop whatever metadata is already there.
    props = model.metadata_props
    while len(props):
        props.pop()

    for name, val in meta_data.items():
        entry = props.add()
        entry.key = name
        entry.value = str(val)

    if filename != "encoder.onnx":
        onnx.save(model, filename)
        return

    # Only the file named exactly "encoder.onnx" is saved with all of its
    # tensors in a single external file, "encoder.weights".
    # NOTE(review): presumably because the fp32 encoder is too large to be
    # stored as one protobuf file -- confirm.
    weights_stem = "encoder"
    onnx.save(
        model,
        filename,
        save_as_external_data=True,
        all_tensors_to_one_file=True,
        location=weights_stem + ".weights",
    )
|
||||
|
||||
|
||||
@torch.no_grad()
def main():
    """Export nvidia/parakeet-tdt-0.6b-v3 to ONNX, quantize it, and tag it.

    The model is restored from ./parakeet-tdt-0.6b-v3.nemo when that file
    exists; otherwise it is fetched with ``from_pretrained``.

    All outputs are written to the current working directory:

      - tokens.txt: one "<symbol> <id>" line per vocabulary entry, followed
        by a "<blk>" line whose id equals the vocabulary size.
      - encoder.onnx, decoder.onnx, joiner.onnx: exported from the model's
        encoder, decoder and joint modules.
      - encoder.int8.onnx, decoder.int8.onnx, joiner.int8.onnx: dynamically
        quantized versions of the three files above.

    Metadata is attached to encoder.int8.onnx and encoder.onnx only.
    """
    if Path("./parakeet-tdt-0.6b-v3.nemo").is_file():
        asr_model = nemo_asr.models.ASRModel.restore_from(
            restore_path="./parakeet-tdt-0.6b-v3.nemo"
        )
    else:
        asr_model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v3"
        )

    asr_model.eval()

    vocabulary = asr_model.joint.vocabulary
    with open("./tokens.txt", "w", encoding="utf-8") as f:
        for i, s in enumerate(vocabulary):
            f.write(f"{s} {i}\n")
        # The blank symbol takes the id right after the last vocabulary
        # entry. len() is used instead of the loop variable so that an
        # empty vocabulary does not raise a NameError.
        f.write(f"<blk> {len(vocabulary)}\n")
        print("Saved to tokens.txt")

    asr_model.encoder.export("encoder.onnx")
    asr_model.decoder.export("decoder.onnx")
    asr_model.joint.export("joiner.onnx")
    os.system("ls -lh *.onnx")

    normalize_type = asr_model.cfg.preprocessor.normalize
    if normalize_type == "NA":
        # "NA" in the NeMo config is stored as an empty string in the metadata.
        normalize_type = ""

    meta_data = {
        "vocab_size": asr_model.decoder.vocab_size,
        "normalize_type": normalize_type,
        "pred_rnn_layers": asr_model.decoder.pred_rnn_layers,
        "pred_hidden": asr_model.decoder.pred_hidden,
        "subsampling_factor": 8,
        "model_type": "EncDecRNNTBPEModel",
        # NOTE(review): "version" is "2" although this is the v3 model;
        # presumably it is a metadata-format version, not the model
        # version -- confirm.
        "version": "2",
        "model_author": "NeMo",
        "url": "https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3",
        "comment": "Only the transducer branch is exported",
        "feat_dim": 128,
    }

    for m in ["encoder", "decoder", "joiner"]:
        quantize_dynamic(
            model_input=f"./{m}.onnx",
            model_output=f"./{m}.int8.onnx",
            # The encoder is quantized to unsigned 8-bit weights, the
            # decoder and joiner to signed 8-bit weights.
            weight_type=QuantType.QUInt8 if m == "encoder" else QuantType.QInt8,
        )
        os.system("ls -lh *.onnx")

    add_meta_data("encoder.int8.onnx", meta_data)
    add_meta_data("encoder.onnx", meta_data)
    print("meta_data", meta_data)


if __name__ == "__main__":
    main()
|
||||
51
scripts/nemo/parakeet-tdt-0.6b-v3/run.sh
Executable file
51
scripts/nemo/parakeet-tdt-0.6b-v3/run.sh
Executable file
@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
#
# Download the parakeet-tdt-0.6b-v3 checkpoint and four test waves, install
# the Python dependencies, export the model to ONNX with ./export_onnx.py,
# and run ./test_onnx.py on every test wave.

set -ex

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# --fail (-f) makes curl exit with a non-zero status on an HTTP error, so
# that "set -e" aborts the script instead of continuing with an error page
# saved under the name of the requested file.
curl -fSL -O https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3/resolve/main/parakeet-tdt-0.6b-v3.nemo

curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/en.wav
curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/fr.wav
curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/es.wav

ls -lh

# The requirement with extras is quoted so that the shell never treats the
# square brackets as a filename pattern.
pip install \
  "nemo_toolkit[asr]" \
  "numpy<2" \
  ipython \
  kaldi-native-fbank \
  librosa \
  onnx==1.17.0 \
  onnxruntime==1.17.1 \
  soundfile

python3 ./export_onnx.py
ls -lh *.onnx

for w in en.wav de.wav fr.wav es.wav; do
  # NOTE(review): this run is labelled fp32 but it uses the int8 encoder
  # together with the fp32 decoder and joiner -- confirm this is intended.
  echo "---fp32----"
  python3 ./test_onnx.py \
    --encoder ./encoder.int8.onnx \
    --decoder ./decoder.onnx \
    --joiner ./joiner.onnx \
    --tokens ./tokens.txt \
    --wav "$w"

  echo "---int8----"
  python3 ./test_onnx.py \
    --encoder ./encoder.int8.onnx \
    --decoder ./decoder.int8.onnx \
    --joiner ./joiner.int8.onnx \
    --tokens ./tokens.txt \
    --wav "$w"
done
|
||||
1
scripts/nemo/parakeet-tdt-0.6b-v3/test_onnx.py
Symbolic link
1
scripts/nemo/parakeet-tdt-0.6b-v3/test_onnx.py
Symbolic link
@ -0,0 +1 @@
|
||||
../parakeet-tdt-0.6b-v2/test_onnx.py
|
||||
@ -46,7 +46,7 @@ bool OfflineTtsModelConfig::Validate() const {
|
||||
return kitten.Validate();
|
||||
}
|
||||
|
||||
SHERPA_ONNX_LOGE("Please provide at exactly one tts model.");
|
||||
SHERPA_ONNX_LOGE("Please provide exactly one tts model.");
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -65,7 +65,7 @@ struct GeneratedAudio {
|
||||
|
||||
class OfflineTtsImpl;
|
||||
|
||||
// If the callback returns 0, then it stop generating
|
||||
// If the callback returns 0, then it stops generating
|
||||
// If the callback returns 1, then it keeps generating
|
||||
using GeneratedAudioCallback = std::function<int32_t(
|
||||
const float * /*samples*/, int32_t /*n*/, float /*progress*/)>;
|
||||
|
||||
@ -677,6 +677,19 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
|
||||
tokens = "$modelDir/tokens.txt",
|
||||
)
|
||||
}
|
||||
|
||||
40 -> {
|
||||
val modelDir = "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8"
|
||||
return OfflineModelConfig(
|
||||
transducer = OfflineTransducerModelConfig(
|
||||
encoder = "$modelDir/encoder.int8.onnx",
|
||||
decoder = "$modelDir/decoder.int8.onnx",
|
||||
joiner = "$modelDir/joiner.int8.onnx",
|
||||
),
|
||||
tokens = "$modelDir/tokens.txt",
|
||||
modelType = "nemo_transducer",
|
||||
)
|
||||
}
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user