Replace torchaudio with soundfile in python-api-examples (#765)

This commit is contained in:
gtf35 2024-04-13 23:39:07 +08:00 committed by GitHub
parent 983df28a83
commit b0265b258d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 27 additions and 9 deletions

View File

@@ -65,7 +65,7 @@ from typing import Dict, List, Tuple
import numpy as np
import sherpa_onnx
import torchaudio
import soundfile as sf
try:
import sounddevice as sd
@@ -357,8 +357,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]:
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
samples, sample_rate = torchaudio.load(filename)
return samples[0].contiguous().numpy(), sample_rate
data, sample_rate = sf.read(
filename,
always_2d=True,
dtype="float32",
)
data = data[:, 0] # use only the first channel
samples = np.ascontiguousarray(data)
return samples, sample_rate
def compute_speaker_embedding(

View File

@@ -60,7 +60,7 @@ from typing import Dict, List, Tuple
import numpy as np
import sherpa_onnx
import torchaudio
import soundfile as sf
try:
import sounddevice as sd
@@ -160,8 +160,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]:
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
samples, sample_rate = torchaudio.load(filename)
return samples[0].contiguous().numpy(), sample_rate
data, sample_rate = sf.read(
filename,
always_2d=True,
dtype="float32",
)
data = data[:, 0] # use only the first channel
samples = np.ascontiguousarray(data)
return samples, sample_rate
def compute_speaker_embedding(

View File

@@ -52,7 +52,7 @@ from typing import Dict, List, Tuple
import numpy as np
import sherpa_onnx
import torchaudio
import soundfile as sf
try:
import sounddevice as sd
@@ -145,8 +145,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]:
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
samples, sample_rate = torchaudio.load(filename)
return samples[0].contiguous().numpy(), sample_rate
data, sample_rate = sf.read(
filename,
always_2d=True,
dtype="float32",
)
data = data[:, 0] # use only the first channel
samples = np.ascontiguousarray(data)
return samples, sample_rate
def compute_speaker_embedding(