2025-12-29 12:45:09 +08:00

2944 lines
91 KiB
ObjectPascal

{ Copyright (c) 2024 Xiaomi Corporation
See
https://github.com/k2-fsa/sherpa-onnx/tree/master/pascal-api-examples
for examples of how to use the APIs in this file.
}
unit sherpa_onnx;
{$IFDEF FPC}
{$mode objfpc}
{$modeSwitch advancedRecords} { to support records with methods }
{$ENDIF}
{$LongStrings ON}
interface
uses
ctypes;
type
{ Dynamic array of single-precision samples. It is the return type of
TSherpaOnnxLinearResampler.Resample and TSherpaOnnxCircularBuffer.Get. }
TSherpaOnnxSamplesArray = array of Single;
{ Object wrapper around a linear resampler.
Handle is an opaque pointer; presumably it holds the object returned by the
external SherpaOnnxCreateLinearResampler (the method bodies are not part of
this section of the file). }
TSherpaOnnxLinearResampler = class
private
Handle: Pointer;
InputSampleRate: Integer;
OutputSampleRate: Integer;
public
{ SampleRateIn / SampleRateOut: source and target sample rates
(presumably in Hz, as in the native SampleRateInHz / SampleRateOutHz). }
constructor Create(SampleRateIn: Integer; SampleRateOut: Integer);
destructor Destroy; override;
{ Pointer variant: Samples points to N input samples.
NOTE(review): the exact effect of Flush is defined by the native library - confirm. }
function Resample(Samples: pcfloat;
N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray; overload;
{ Open-array variant of Resample; the length is taken from the array itself. }
function Resample(Samples: array of Single;
Flush: Boolean): TSherpaOnnxSamplesArray; overload;
procedure Reset;
{ Read-only views of the private sample-rate fields. }
property GetInputSampleRate: Integer Read InputSampleRate;
property GetOutputSampleRate: Integer Read OutputSampleRate;
end;
{ Callback types used by the callback overload of TSherpaOnnxOfflineTts.Generate
and by the external SherpaOnnxOfflineTtsGenerateWithCallbackWithArg.
The function type is cdecl and receives a sample pointer, a count N and the
opaque Arg pointer supplied by the caller.
NOTE(review): the meaning of the cint result is defined by the native library
and is not visible here - confirm against the C API documentation.
NOTE(review): the P-type is declared as a pointer to a procedural type, i.e. a
pointer to a function pointer in Pascal terms; confirm that callers pass the
plain address of the callback (for example with the address operator). }
PSherpaOnnxGeneratedAudioCallbackWithArg = ^TSherpaOnnxGeneratedAudioCallbackWithArg;
TSherpaOnnxGeneratedAudioCallbackWithArg = function(
Samples: pcfloat; N: cint32;
Arg: Pointer): cint; cdecl;
{ Pascal-side settings for a VITS text-to-speech model.
The fields match, in name and order, the C-layout record
SherpaOnnxOfflineTtsVitsModelConfig declared in the implementation section
(AnsiString there becomes PAnsiChar, Single becomes cfloat).
Initialize is a management operator (var parameter under FPC, out otherwise);
its body, and therefore the default field values, is not shown in this part
of the file. ToString returns an AnsiString, presumably a printable dump. }
TSherpaOnnxOfflineTtsVitsModelConfig = record
Model: AnsiString;
Lexicon: AnsiString;
Tokens: AnsiString;
DataDir: AnsiString;
NoiseScale: Single;
NoiseScaleW: Single;
LengthScale: Single;
DictDir: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
end;
{ Pascal-side settings for a Matcha text-to-speech model (acoustic model plus
vocoder). Field names and order match the C-layout record
SherpaOnnxOfflineTtsMatchaModelConfig in the implementation section.
Defaults are assigned by the Initialize management operator, whose body is
not shown in this part of the file. }
TSherpaOnnxOfflineTtsMatchaModelConfig = record
AcousticModel: AnsiString;
Vocoder: AnsiString;
Lexicon: AnsiString;
Tokens: AnsiString;
DataDir: AnsiString;
NoiseScale: Single;
LengthScale: Single;
DictDir: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
end;
{ Pascal-side settings for a Kokoro text-to-speech model.
Field names and order match the C-layout record
SherpaOnnxOfflineTtsKokoroModelConfig in the implementation section.
Defaults are assigned by the Initialize management operator, whose body is
not shown in this part of the file. }
TSherpaOnnxOfflineTtsKokoroModelConfig = record
Model: AnsiString;
Voices: AnsiString;
Tokens: AnsiString;
DataDir: AnsiString;
LengthScale: Single;
DictDir: AnsiString;
Lexicon: AnsiString;
Lang: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
end;
{ Pascal-side settings for a Kitten text-to-speech model.
Field names and order match the C-layout record
SherpaOnnxOfflineTtsKittenModelConfig in the implementation section.
Defaults are assigned by the Initialize management operator, whose body is
not shown in this part of the file. }
TSherpaOnnxOfflineTtsKittenModelConfig = record
Model: AnsiString;
Voices: AnsiString;
Tokens: AnsiString;
DataDir: AnsiString;
LengthScale: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKittenModelConfig);
end;
{ Pascal-side settings for a ZipVoice text-to-speech model.
Field names and order match the C-layout record
SherpaOnnxOfflineTtsZipVoiceModelConfig in the implementation section.
Defaults are assigned by the Initialize management operator, whose body is
not shown in this part of the file. }
TSherpaOnnxOfflineTtsZipVoiceModelConfig = record
Tokens: AnsiString;
Encoder: AnsiString;
Decoder: AnsiString;
Vocoder: AnsiString;
DataDir: AnsiString;
Lexicon: AnsiString;
FeatScale: Single;
Tshift: Single;
TargetRms: Single;
GuidanceScale: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsZipVoiceModelConfig);
end;
{ Aggregates the per-model TTS settings (Vits, Matcha, Kokoro, Kitten,
ZipVoice) with the shared runtime options NumThreads, Debug and Provider.
Field order matches the C-layout record SherpaOnnxOfflineTtsModelConfig in
the implementation section, where Debug is a cint32 instead of a Boolean.
NOTE(review): which sub-config is actually used is presumably decided by the
native library from the non-empty model fields - confirm. }
TSherpaOnnxOfflineTtsModelConfig = record
Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
NumThreads: Integer;
Debug: Boolean;
Provider: AnsiString;
Matcha: TSherpaOnnxOfflineTtsMatchaModelConfig;
Kokoro: TSherpaOnnxOfflineTtsKokoroModelConfig;
Kitten: TSherpaOnnxOfflineTtsKittenModelConfig;
ZipVoice: TSherpaOnnxOfflineTtsZipVoiceModelConfig;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
end;
{ Top-level configuration accepted by TSherpaOnnxOfflineTts.Create.
Field order matches the C-layout record SherpaOnnxOfflineTtsConfig in the
implementation section. Defaults come from the Initialize management
operator, whose body is not shown in this part of the file. }
TSherpaOnnxOfflineTtsConfig = record
Model: TSherpaOnnxOfflineTtsModelConfig;
RuleFsts: AnsiString;
MaxNumSentences: Integer;
RuleFars: AnsiString;
SilenceScale: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
end;
{ Result type of TSherpaOnnxOfflineTts.Generate: the samples as a managed
dynamic array plus their sample rate. }
TSherpaOnnxGeneratedAudio = record
Samples: array of Single;
SampleRate: Integer;
end;
{ Object wrapper for offline text-to-speech.
Handle is an opaque pointer; presumably it holds the object returned by the
external SherpaOnnxCreateOfflineTts (method bodies are not part of this
section of the file). }
TSherpaOnnxOfflineTts = class
private
Handle: Pointer;
SampleRate: Integer;
NumSpeakers: Integer;
_Config: TSherpaOnnxOfflineTtsConfig;
public
constructor Create(Config: TSherpaOnnxOfflineTtsConfig);
destructor Destroy; override;
{ Generates audio for Text.
NOTE(review): the valid range of SpeakerId and the scale of Speed are defined
by the native library / model - confirm. }
function Generate(Text: AnsiString; SpeakerId: Integer;
Speed: Single): TSherpaOnnxGeneratedAudio; overload;
{ Same as above, additionally passing a cdecl Callback and an opaque Arg that
is handed back to the callback as its Arg parameter. }
function Generate(Text: AnsiString; SpeakerId: Integer;
Speed: Single;
Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
Arg: Pointer
): TSherpaOnnxGeneratedAudio; overload;
{ Read-only views of the private fields. }
property GetHandle: Pointer Read Handle;
property GetSampleRate: Integer Read SampleRate;
property GetNumSpeakers: Integer Read NumSpeakers;
end;
{ Result type of SherpaOnnxReadWave: decoded samples and their sample rate. }
TSherpaOnnxWave = record
Samples: array of Single; { normalized to the range [-1, 1] }
SampleRate: Integer;
end;
{ Per-model settings for the online (streaming) recognizer. Each record has a
C-layout counterpart with the same field names in the implementation section
(AnsiString becomes PAnsiChar). ToString returns an AnsiString, presumably a
printable dump of the fields (bodies not shown here). }

{ Online transducer model: encoder, decoder and joiner. }
TSherpaOnnxOnlineTransducerModelConfig = record
Encoder: AnsiString;
Decoder: AnsiString;
Joiner: AnsiString;
function ToString: AnsiString;
end;
{ Online Paraformer model: encoder and decoder. }
TSherpaOnnxOnlineParaformerModelConfig = record
Encoder: AnsiString;
Decoder: AnsiString;
function ToString: AnsiString;
end;
{ Online Zipformer2 CTC model: a single model entry. }
TSherpaOnnxOnlineZipformer2CtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Online NeMo CTC model: a single model entry. }
TSherpaOnnxOnlineNemoCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Online Tone CTC model: a single model entry. }
TSherpaOnnxOnlineToneCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Aggregates the online per-model settings with shared options.
Field order matches the C-layout record SherpaOnnxOnlineModelConfig in the
implementation section, where Debug is a cint32 instead of a Boolean.
TokensBuf / TokensBufSize carry a buffer and its size.
NOTE(review): presumably TokensBuf is an in-memory alternative to the Tokens
entry - confirm against the native API. Defaults are set by Initialize, whose
body is not shown in this part of the file. }
TSherpaOnnxOnlineModelConfig = record
Transducer: TSherpaOnnxOnlineTransducerModelConfig;
Paraformer: TSherpaOnnxOnlineParaformerModelConfig;
Zipformer2Ctc: TSherpaOnnxOnlineZipformer2CtcModelConfig;
Tokens: AnsiString;
NumThreads: Integer;
Provider: AnsiString;
Debug: Boolean;
ModelType: AnsiString;
ModelingUnit: AnsiString;
BpeVocab: AnsiString;
TokensBuf: AnsiString;
TokensBufSize: Integer;
NemoCtc: TSherpaOnnxOnlineNemoCtcModelConfig;
ToneCtc: TSherpaOnnxOnlineToneCtcModelConfig;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
end;
{ Feature extraction settings shared by the online and offline recognizer
configurations (both have a FeatConfig field of this type).
Defaults are set by Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxFeatureConfig = record
SampleRate: Integer;
FeatureDim: Integer;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
end;
{ Settings for the CTC FST decoder used by TSherpaOnnxOnlineRecognizerConfig
(field CtcFstDecoderConfig). Defaults are set by Initialize, whose body is
not shown in this part of the file. }
TSherpaOnnxOnlineCtcFstDecoderConfig = record
Graph: AnsiString;
MaxActive: Integer;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
end;
{ Homophone replacer settings; used as the Hr field of both the online and
the offline recognizer configuration. It has no Initialize operator, so the
AnsiString fields start out empty. }
TSherpaOnnxHomophoneReplacerConfig = record
DictDir: AnsiString;
Lexicon: AnsiString;
RuleFsts: AnsiString;
function ToString: AnsiString;
end;
{ Top-level configuration accepted by TSherpaOnnxOnlineRecognizer.Create.
Field order matches the C-layout record SherpaOnnxOnlineRecognizerConfig in
the implementation section, where EnableEndpoint is a cint32 instead of a
Boolean. HotwordsBuf / HotwordsBufSize carry a buffer and its size.
NOTE(review): the units of the Rule* fields (presumably seconds) and the
default values are not visible here - confirm in Initialize and the C API docs. }
TSherpaOnnxOnlineRecognizerConfig = record
FeatConfig: TSherpaOnnxFeatureConfig;
ModelConfig: TSherpaOnnxOnlineModelConfig;
DecodingMethod: AnsiString;
MaxActivePaths: Integer;
EnableEndpoint: Boolean;
Rule1MinTrailingSilence: Single;
Rule2MinTrailingSilence: Single;
Rule3MinUtteranceLength: Single;
HotwordsFile: AnsiString;
HotwordsScore: Single;
CtcFstDecoderConfig: TSherpaOnnxOnlineCtcFstDecoderConfig;
RuleFsts: AnsiString;
RuleFars: AnsiString;
BlankPenalty: Single;
HotwordsBuf: AnsiString;
HotwordsBufSize: Integer;
Hr: TSherpaOnnxHomophoneReplacerConfig;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
end;
{ Result type of TSherpaOnnxOnlineRecognizer.GetResult: the recognized text,
its tokens and a timestamp array.
NOTE(review): presumably Timestamps has one entry per token - confirm in the
GetResult implementation. }
TSherpaOnnxOnlineRecognizerResult = record
Text: AnsiString;
Tokens: array of AnsiString;
Timestamps: array of Single;
function ToString: AnsiString;
end;
{ Object wrapper around an online (streaming) stream handle.
The constructor takes the raw pointer P; instances are returned by
TSherpaOnnxOnlineRecognizer.CreateStream. }
TSherpaOnnxOnlineStream = class
private
Handle: Pointer;
public
constructor Create(P: Pointer);
destructor Destroy; override;
{ Feeds Samples, recorded at SampleRate, into the stream. }
procedure AcceptWaveform(Samples: array of Single; SampleRate: Integer);
{ NOTE(review): presumably signals that no more audio will be supplied - confirm. }
procedure InputFinished;
property GetHandle: Pointer Read Handle;
end;
{ Object wrapper for the online (streaming) recognizer.
Handle is an opaque pointer to the native object; _Config keeps the
configuration and is exposed read-only through the Config property.
All per-stream operations take the TSherpaOnnxOnlineStream to act on. }
TSherpaOnnxOnlineRecognizer = class
private
Handle: Pointer;
_Config: TSherpaOnnxOnlineRecognizerConfig;
public
constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig);
destructor Destroy; override;
{ Stream factories; the second overload additionally takes a Hotwords string. }
function CreateStream: TSherpaOnnxOnlineStream; overload;
function CreateStream(Hotwords: AnsiString): TSherpaOnnxOnlineStream; overload;
{ NOTE(review): presumably IsReady must return True before Decode is called -
confirm against the native API documentation. }
function IsReady(Stream: TSherpaOnnxOnlineStream): Boolean;
procedure Decode(Stream: TSherpaOnnxOnlineStream);
procedure Reset(Stream: TSherpaOnnxOnlineStream);
function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean;
function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult;
property Config: TSherpaOnnxOnlineRecognizerConfig Read _Config;
property GetHandle: Pointer Read Handle;
end;
{ Per-model settings for the offline (non-streaming) recognizer. Each record
has a C-layout counterpart with the same field names in the implementation
section (AnsiString becomes PAnsiChar). None of them declares an Initialize
operator, so the AnsiString fields start out empty. }

{ Offline transducer model: encoder, decoder and joiner. }
TSherpaOnnxOfflineTransducerModelConfig = record
Encoder: AnsiString;
Decoder: AnsiString;
Joiner: AnsiString;
function ToString: AnsiString;
end;
{ Offline Paraformer model. }
TSherpaOnnxOfflineParaformerModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Offline NeMo EncDec CTC model. }
TSherpaOnnxOfflineNemoEncDecCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Offline Dolphin model. }
TSherpaOnnxOfflineDolphinModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Offline Zipformer CTC model. }
TSherpaOnnxOfflineZipformerCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Offline WeNet CTC model. }
TSherpaOnnxOfflineWenetCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Offline Omnilingual ASR CTC model. }
TSherpaOnnxOfflineOmnilingualAsrCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Offline MedAsr CTC model. }
TSherpaOnnxOfflineMedAsrCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Settings for an offline Whisper model. Field names and order match the
C-layout record SherpaOnnxOfflineWhisperModelConfig in the implementation
section. Defaults (for example for Task and TailPaddings) are set by
Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxOfflineWhisperModelConfig = record
Encoder: AnsiString;
Decoder: AnsiString;
Language: AnsiString;
Task: AnsiString;
TailPaddings: Integer;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
end;
{ Settings for an offline Canary model with source and target language.
Field names and order match the C-layout record
SherpaOnnxOfflineCanaryModelConfig in the implementation section, where
UsePnc is a cint32 instead of a Boolean. Defaults are set by Initialize,
whose body is not shown in this part of the file. }
TSherpaOnnxOfflineCanaryModelConfig = record
Encoder: AnsiString;
Decoder: AnsiString;
SrcLang: AnsiString;
TgtLang: AnsiString;
UsePnc: Boolean;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineCanaryModelConfig);
end;
{ Further offline per-model settings. Each record has a C-layout counterpart
with the same field names in the implementation section. }

{ Offline Moonshine model: preprocessor, encoder and two decoders. }
TSherpaOnnxOfflineMoonshineModelConfig = record
Preprocessor: AnsiString;
Encoder: AnsiString;
UncachedDecoder: AnsiString;
CachedDecoder: AnsiString;
function ToString: AnsiString;
end;
{ Offline FireRedAsr model: encoder and decoder. }
TSherpaOnnxOfflineFireRedAsrModelConfig = record
Encoder: AnsiString;
Decoder: AnsiString;
function ToString: AnsiString;
end;
{ Offline TDNN model. }
TSherpaOnnxOfflineTdnnModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Language-model settings used as the LMConfig field of
TSherpaOnnxOfflineRecognizerConfig. The default for Scale is set by
Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxOfflineLMConfig = record
Model: AnsiString;
Scale: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
end;
{ Settings for an offline SenseVoice model. Field names and order match the
C-layout record SherpaOnnxOfflineSenseVoiceModelConfig in the implementation
section, where UseItn is a cint32 instead of a Boolean. Defaults are set by
Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxOfflineSenseVoiceModelConfig = record
Model: AnsiString;
Language: AnsiString;
UseItn: Boolean;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
function ToString: AnsiString;
end;
{ Aggregates all offline per-model settings with the shared options Tokens,
NumThreads, Debug, Provider, ModelType, ModelingUnit and BpeVocab.
Field order matches the C-layout record SherpaOnnxOfflineModelConfig in the
implementation section, where Debug is a cint32 instead of a Boolean.
Note that TeleSpeechCtc is a plain string rather than a nested record.
Defaults are set by Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxOfflineModelConfig = record
Transducer: TSherpaOnnxOfflineTransducerModelConfig;
Paraformer: TSherpaOnnxOfflineParaformerModelConfig;
NeMoCtc: TSherpaOnnxOfflineNemoEncDecCtcModelConfig;
Whisper: TSherpaOnnxOfflineWhisperModelConfig;
Tdnn: TSherpaOnnxOfflineTdnnModelConfig;
Tokens: AnsiString;
NumThreads: Integer;
Debug: Boolean;
Provider: AnsiString;
ModelType: AnsiString;
ModelingUnit: AnsiString;
BpeVocab: AnsiString;
TeleSpeechCtc: AnsiString;
SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig;
Dolphin: TSherpaOnnxOfflineDolphinModelConfig;
ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig;
Canary: TSherpaOnnxOfflineCanaryModelConfig;
WenetCtc: TSherpaOnnxOfflineWenetCtcModelConfig;
Omnilingual: TSherpaOnnxOfflineOmnilingualAsrCtcModelConfig;
MedAsr: TSherpaOnnxOfflineMedAsrCtcModelConfig;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
function ToString: AnsiString;
end;
{ Top-level configuration accepted by TSherpaOnnxOfflineRecognizer.Create and
SetConfig. Field order matches the C-layout record
SherpaOnnxOfflineRecognizerConfig in the implementation section.
Defaults are set by Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxOfflineRecognizerConfig = record
FeatConfig: TSherpaOnnxFeatureConfig;
ModelConfig: TSherpaOnnxOfflineModelConfig;
LMConfig: TSherpaOnnxOfflineLMConfig;
DecodingMethod: AnsiString;
MaxActivePaths: Integer;
HotwordsFile: AnsiString;
HotwordsScore: Single;
RuleFsts: AnsiString;
RuleFars: AnsiString;
BlankPenalty: Single;
Hr: TSherpaOnnxHomophoneReplacerConfig;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
function ToString: AnsiString;
end;
{ Result type of TSherpaOnnxOfflineRecognizer.GetResult: the recognized text,
its tokens and a timestamp array.
NOTE(review): presumably Timestamps has one entry per token - confirm in the
GetResult implementation. }
TSherpaOnnxOfflineRecognizerResult = record
Text: AnsiString;
Tokens: array of AnsiString;
Timestamps: array of Single;
function ToString: AnsiString;
end;
{ Object wrapper around an offline stream handle.
The constructor takes the raw pointer P; instances are returned by
TSherpaOnnxOfflineRecognizer.CreateStream. }
TSherpaOnnxOfflineStream = class
private
Handle: Pointer;
public
constructor Create(P: Pointer);
destructor Destroy; override;
{ Feeds Samples, recorded at SampleRate, into the stream. }
procedure AcceptWaveform(Samples: array of Single; SampleRate: Integer);
property GetHandle: Pointer Read Handle;
end;
{ Object wrapper for the offline (non-streaming) recognizer.
Handle is an opaque pointer to the native object; _Config keeps the
configuration and is exposed read-only through the Config property. }
TSherpaOnnxOfflineRecognizer = class
private
Handle: Pointer;
_Config: TSherpaOnnxOfflineRecognizerConfig;
public
constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig);
destructor Destroy; override;
function CreateStream: TSherpaOnnxOfflineStream;
procedure Decode(Stream: TSherpaOnnxOfflineStream);
{ NOTE(review): presumably applies a new configuration to the existing native
recognizer - confirm which fields may be changed after creation. }
procedure SetConfig(Config: TSherpaOnnxOfflineRecognizerConfig);
function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult;
property Config: TSherpaOnnxOfflineRecognizerConfig Read _Config;
property GetHandle: Pointer Read Handle;
end;
{ Settings for the Silero voice-activity-detection model. Field names and
order match the C-layout record SherpaOnnxSileroVadModelConfig in the
implementation section.
NOTE(review): the *Duration fields are presumably in seconds and WindowSize
in samples - confirm. Defaults are set by Initialize, whose body is not shown
in this part of the file. }
TSherpaOnnxSileroVadModelConfig = record
Model: AnsiString;
Threshold: Single;
MinSilenceDuration: Single;
MinSpeechDuration: Single;
WindowSize: Integer;
MaxSpeechDuration: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
end;
{ Settings for the Ten voice-activity-detection model; it has the same fields
as TSherpaOnnxSileroVadModelConfig. Field names and order match the C-layout
record SherpaOnnxTenVadModelConfig in the implementation section.
Defaults are set by Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxTenVadModelConfig = record
Model: AnsiString;
Threshold: Single;
MinSilenceDuration: Single;
MinSpeechDuration: Single;
WindowSize: Integer;
MaxSpeechDuration: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig);
end;
{ Configuration accepted by TSherpaOnnxVoiceActivityDetector.Create: the two
per-model settings (SileroVad, TenVad) plus shared options.
Field order matches the C-layout record SherpaOnnxVadModelConfig in the
implementation section, where Debug is a cint32 instead of a Boolean.
Defaults are set by Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxVadModelConfig = record
SileroVad: TSherpaOnnxSileroVadModelConfig;
SampleRate: Integer;
NumThreads: Integer;
Provider: AnsiString;
Debug: Boolean;
TenVad: TSherpaOnnxTenVadModelConfig;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
end;
{ Object wrapper around a circular sample buffer of the given Capacity.
Handle is an opaque pointer to the underlying object (method bodies are not
part of this section of the file). }
TSherpaOnnxCircularBuffer = class
private
Handle: Pointer;
public
constructor Create(Capacity: Integer);
destructor Destroy; override;
{ Appends samples, either from an open array or from a pointer plus count. }
procedure Push(Samples: array of Single); overload;
procedure Push(Samples: pcfloat; N: Integer); overload;
{ Returns N samples starting at StartIndex as a new dynamic array.
NOTE(review): presumably StartIndex is relative to the same origin as Head -
confirm. }
function Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
{ Removes N samples. }
procedure Pop(N: Integer);
procedure Reset;
function Size: Integer;
function Head: Integer;
property GetHandle: Pointer Read Handle;
end;
{ Result type of TSherpaOnnxVoiceActivityDetector.Front: the samples of one
segment and its Start value.
NOTE(review): Start is presumably a sample index into the input audio - confirm. }
TSherpaOnnxSpeechSegment = record
Samples: array of Single;
Start: Integer;
end;
{ Object wrapper for the voice activity detector.
Handle is an opaque pointer to the native object; _Config keeps the
configuration and is exposed read-only through the Config property.
The IsEmpty / Front / Pop / Clear methods form a queue-style interface over
the segments.
NOTE(review): that reading is based on the method names only - confirm. }
TSherpaOnnxVoiceActivityDetector = class
private
Handle: Pointer;
_Config: TSherpaOnnxVadModelConfig;
public
constructor Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
destructor Destroy; override;
{ Feeds samples; the second overload takes a sub-range given by Offset and N. }
procedure AcceptWaveform(Samples: array of Single); overload;
procedure AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer); overload;
function IsEmpty: Boolean;
function IsDetected: Boolean;
procedure Pop;
procedure Clear;
function Front: TSherpaOnnxSpeechSegment;
procedure Reset;
procedure Flush;
property Config: TSherpaOnnxVadModelConfig Read _Config;
property GetHandle: Pointer Read Handle;
end;
{ Speaker segmentation settings. Both records have C-layout counterparts with
the same field names in the implementation section. }

{ Pyannote segmentation model: a single model entry. }
TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Segmentation model selection plus shared runtime options. Defaults are set
by Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxOfflineSpeakerSegmentationModelConfig = record
Pyannote: TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
NumThreads: Integer;
Debug: Boolean;
Provider: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig);
end;
{ Clustering settings used as the Clustering field of
TSherpaOnnxOfflineSpeakerDiarizationConfig.
NOTE(review): how NumClusters and Threshold interact (for example which one
takes precedence) is defined by the native library - confirm. Defaults are
set by Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxFastClusteringConfig = record
NumClusters: Integer;
Threshold: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig);
end;
{ Speaker embedding extractor settings used as the Embedding field of
TSherpaOnnxOfflineSpeakerDiarizationConfig. Field names and order match the
C-layout record SherpaOnnxSpeakerEmbeddingExtractorConfig in the
implementation section, where Debug is a cint32 instead of a Boolean.
Defaults are set by Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxSpeakerEmbeddingExtractorConfig = record
Model: AnsiString;
NumThreads: Integer;
Debug: Boolean;
Provider: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig);
end;
{ Top-level configuration accepted by
TSherpaOnnxOfflineSpeakerDiarization.Create and SetConfig. Field order
matches the C-layout record SherpaOnnxOfflineSpeakerDiarizationConfig in the
implementation section.
NOTE(review): MinDurationOn / MinDurationOff are presumably in seconds -
confirm. Defaults are set by Initialize, whose body is not shown in this part
of the file. }
TSherpaOnnxOfflineSpeakerDiarizationConfig = record
Segmentation: TSherpaOnnxOfflineSpeakerSegmentationModelConfig;
Embedding: TSherpaOnnxSpeakerEmbeddingExtractorConfig;
Clustering: TSherpaOnnxFastClusteringConfig;
MinDurationOn: Single;
MinDurationOff: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig);
end;
{ One diarization result entry: a Start / Stop pair and a Speaker number.
NOTE(review): Start and Stop are presumably times in seconds - confirm. }
TSherpaOnnxOfflineSpeakerDiarizationSegment = record
Start: Single;
Stop: Single;
Speaker: Integer;
function ToString: AnsiString;
end;
{ Return type of TSherpaOnnxOfflineSpeakerDiarization.Process. }
TSherpaOnnxOfflineSpeakerDiarizationSegmentArray = array of TSherpaOnnxOfflineSpeakerDiarizationSegment;
{ Progress callback types used by the callback overload of
TSherpaOnnxOfflineSpeakerDiarization.Process and by the external
SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg.
The cdecl function receives the number of processed chunks and the total
number of chunks.
NOTE(review): the meaning of the cint32 result is defined by the native
library - confirm.
NOTE(review): the P-type is a pointer to a procedural type, i.e. a pointer to
a function pointer in Pascal terms; confirm that callers pass the plain
address of the callback. }
PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = ^TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg;
TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = function(
NumProcessChunks: cint32;
NumTotalChunks: cint32): cint32; cdecl;
{ Object wrapper for offline speaker diarization.
Handle is an opaque pointer to the native object; SampleRate is exposed
read-only through GetSampleRate.
NOTE(review): SampleRate is presumably the rate that input samples must have
(see the external SherpaOnnxOfflineSpeakerDiarizationGetSampleRate) - confirm. }
TSherpaOnnxOfflineSpeakerDiarization = class
private
Handle: Pointer;
SampleRate: Integer;
_Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
public
constructor Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
destructor Destroy; override;
procedure SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
{ Runs diarization on Samples and returns the segments; the second overload
additionally takes a progress callback. }
function Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload;
function Process(Samples: array of Single; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload;
property GetHandle: Pointer Read Handle;
property GetSampleRate: Integer Read SampleRate;
end;
{ Speech denoiser settings. All three records have C-layout counterparts with
the same field names in the implementation section. }

{ GTCRN denoiser model: a single model entry. }
TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
{ Denoiser model selection plus shared runtime options. Defaults are set by
Initialize, whose body is not shown in this part of the file. }
TSherpaOnnxOfflineSpeechDenoiserModelConfig = record
Gtcrn: TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig;
NumThreads: Integer;
Debug: Boolean;
Provider: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeechDenoiserModelConfig);
end;
{ Top-level configuration accepted by TSherpaOnnxOfflineSpeechDenoiser.Create. }
TSherpaOnnxOfflineSpeechDenoiserConfig = record
Model: TSherpaOnnxOfflineSpeechDenoiserModelConfig;
function ToString: AnsiString;
end;
{ Result type of TSherpaOnnxOfflineSpeechDenoiser.Run: the output samples as
a managed dynamic array plus their sample rate. }
TSherpaOnnxDenoisedAudio = record
Samples: array of Single;
SampleRate: Integer;
end;
{ Object wrapper for the offline speech denoiser.
Handle is an opaque pointer to the native object; SampleRate is exposed
read-only through GetSampleRate. }
TSherpaOnnxOfflineSpeechDenoiser = class
private
Handle: Pointer;
SampleRate: Integer;
_Config: TSherpaOnnxOfflineSpeechDenoiserConfig;
public
constructor Create(Config: TSherpaOnnxOfflineSpeechDenoiserConfig);
destructor Destroy; override;
{ Denoises Samples, which were recorded at InputSampleRate, and returns the
result together with its own sample rate. }
function Run(Samples: array of Single; InputSampleRate: Integer): TSherpaOnnxDenoisedAudio;
property GetHandle: Pointer Read Handle;
property GetSampleRate: Integer Read SampleRate;
end;
{ Reads a wave file into a TSherpaOnnxWave.
Only single-channel files with 16-bit encoded samples are supported.
Samples are normalized to the range [-1, 1].
}
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
{ Writes Samples with the given SampleRate to Filename.
NOTE(review): the Boolean result presumably reports success - confirm in the
implementation, which is not part of this section of the file. }
function SherpaOnnxWriteWave(Filename: AnsiString;
Samples: array of Single; SampleRate: Integer): Boolean;
{ Version and build information. The implementations convert the PAnsiChar
returned by the native library into an AnsiString. }
function SherpaOnnxGetVersionStr(): AnsiString;
function SherpaOnnxGetGitSha1(): AnsiString;
function SherpaOnnxGetGitDate(): AnsiString;
implementation
uses
fpjson,
{ See
- https://wiki.freepascal.org/fcl-json
- https://www.freepascal.org/daily/doc/fcl/fpjson/getjson.html
}
jsonparser,
SysUtils;
const
{
SherpaOnnxLibName is the library name used by every `external` declaration
in this unit. Its value, and the link directives that accompany it, depend
on the target platform and on whether SHERPA_ONNX_USE_SHARED_LIBS is defined.

See
- https://www.freepascal.org/docs-html/prog/progap7.html
- https://downloads.freepascal.org/fpc/docs-pdf/
- https://downloads.freepascal.org/fpc/docs-pdf/CinFreePascal.pdf
}
{$if defined(WINDOWS)}
{ On Windows we always link dynamically against the DLL. See
https://forum.lazarus.freepascal.org/index.php/topic,15712.msg84781.html#msg84781
Static linking would require rebuilding the static library for Windows
with MinGW or Cygwin.
}
SherpaOnnxLibName = 'sherpa-onnx-c-api.dll';
{$elseif not defined(SHERPA_ONNX_USE_SHARED_LIBS)}
{ Static linking on Linux and macOS: every required library is listed
explicitly below. }
{$linklib sherpa-onnx-c-api}
{$linklib sherpa-onnx-core}
{$linklib kaldi-decoder-core}
{$linklib sherpa-onnx-kaldifst-core}
{$linklib sherpa-onnx-fstfar}
{$linklib sherpa-onnx-fst}
{$linklib kissfft-float}
{$linklib kaldi-native-fbank-core}
{$linklib piper_phonemize}
{$linklib espeak-ng}
{$linklib ucd}
{$linklib onnxruntime}
{$linklib ssentencepiece_core}
{$ifdef LINUX}
{$linklib m}
{$LINKLIB stdc++}
{$LINKLIB gcc_s}
{$endif}
{$ifdef DARWIN}
{$linklib c++}
{$endif}
{ Empty name: the external symbols are resolved through the libraries linked
above rather than through a named shared library. }
SherpaOnnxLibName = '';
{$else}
{ Dynamic linking on Linux and macOS. }
SherpaOnnxLibName = 'sherpa-onnx-c-api';
{$linklib sherpa-onnx-c-api}
{$endif}
type
{ C-layout wave record: a raw sample pointer, the sample rate and the number
of samples. It is the pointer/count counterpart of the public
TSherpaOnnxWave.
NOTE(review): presumably mirrors the C struct of the same name; do not
reorder the fields. }
SherpaOnnxWave = record
Samples: pcfloat;
SampleRate: cint32;
NumSamples: cint32;
end;
PSherpaOnnxWave = ^SherpaOnnxWave;
{ C-layout counterparts of the TSherpaOnnxOnline*ModelConfig records from the
interface section: the same fields in the same order, with AnsiString replaced
by PAnsiChar, Integer by cint32 and Boolean by cint32.
NOTE(review): these presumably mirror the C structs of the same names; do not
reorder or retype the fields. }
SherpaOnnxOnlineTransducerModelConfig = record
Encoder: PAnsiChar;
Decoder: PAnsiChar;
Joiner: PAnsiChar;
end;
SherpaOnnxOnlineParaformerModelConfig = record
Encoder: PAnsiChar;
Decoder: PAnsiChar;
end;
SherpaOnnxOnlineZipformer2CtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOnlineNemoCtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOnlineToneCtcModelConfig = record
Model: PAnsiChar;
end;
{ Aggregate embedded in SherpaOnnxOnlineRecognizerConfig.ModelConfig. }
SherpaOnnxOnlineModelConfig= record
Transducer: SherpaOnnxOnlineTransducerModelConfig;
Paraformer: SherpaOnnxOnlineParaformerModelConfig;
Zipformer2Ctc: SherpaOnnxOnlineZipformer2CtcModelConfig;
Tokens: PAnsiChar;
NumThreads: cint32;
Provider: PAnsiChar;
Debug: cint32;
ModelType: PAnsiChar;
ModelingUnit: PAnsiChar;
BpeVocab: PAnsiChar;
TokensBuf: PAnsiChar;
TokensBufSize: cint32;
NemoCtc: SherpaOnnxOnlineNemoCtcModelConfig;
ToneCtc: SherpaOnnxOnlineToneCtcModelConfig;
end;
{ C-layout counterparts of TSherpaOnnxFeatureConfig,
TSherpaOnnxOnlineCtcFstDecoderConfig and TSherpaOnnxHomophoneReplacerConfig:
the same fields in the same order, with AnsiString replaced by PAnsiChar and
Integer by cint32.
NOTE(review): these presumably mirror the C structs of the same names; do not
reorder the fields. }
SherpaOnnxFeatureConfig = record
SampleRate: cint32;
FeatureDim: cint32;
end;
SherpaOnnxOnlineCtcFstDecoderConfig = record
Graph: PAnsiChar;
MaxActive: cint32;
end;
SherpaOnnxHomophoneReplacerConfig = record
DictDir: PAnsiChar;
Lexicon: PAnsiChar;
RuleFsts: PAnsiChar;
end;
{ C-layout counterpart of TSherpaOnnxOnlineRecognizerConfig: the same fields
in the same order, with AnsiString replaced by PAnsiChar, Single by cfloat,
and Integer / Boolean by cint32.
NOTE(review): presumably mirrors the C struct of the same name; do not
reorder the fields. }
SherpaOnnxOnlineRecognizerConfig = record
FeatConfig: SherpaOnnxFeatureConfig;
ModelConfig: SherpaOnnxOnlineModelConfig;
DecodingMethod: PAnsiChar;
MaxActivePaths: cint32;
EnableEndpoint: cint32;
Rule1MinTrailingSilence: cfloat;
Rule2MinTrailingSilence: cfloat;
Rule3MinUtteranceLength: cfloat;
HotwordsFile: PAnsiChar;
HotwordsScore: cfloat;
CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig;
RuleFsts: PAnsiChar;
RuleFars: PAnsiChar;
BlankPenalty: cfloat;
HotwordsBuf: PAnsiChar;
HotwordsBufSize: cint32;
Hr: SherpaOnnxHomophoneReplacerConfig;
end;
PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig;
{ C-layout counterparts of the TSherpaOnnxOffline*ModelConfig and
TSherpaOnnxOfflineLMConfig records from the interface section: the same field
names, with AnsiString replaced by PAnsiChar, Single by cfloat, and
Integer / Boolean by cint32.
NOTE(review): these presumably mirror the C structs of the same names; do not
reorder or retype the fields. }
SherpaOnnxOfflineTransducerModelConfig = record
Encoder: PAnsiChar;
Decoder: PAnsiChar;
Joiner: PAnsiChar;
end;
SherpaOnnxOfflineParaformerModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineNemoEncDecCtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineDolphinModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineZipformerCtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineWenetCtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineOmnilingualAsrCtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineMedAsrCtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineWhisperModelConfig = record
Encoder: PAnsiChar;
Decoder: PAnsiChar;
Language: PAnsiChar;
Task: PAnsiChar;
TailPaddings: cint32;
end;
{ UsePnc is the cint32 form of the Boolean in the interface record. }
SherpaOnnxOfflineCanaryModelConfig = record
Encoder: PAnsiChar;
Decoder: PAnsiChar;
SrcLang: PAnsiChar;
TgtLang: PAnsiChar;
UsePnc: cint32;
end;
SherpaOnnxOfflineFireRedAsrModelConfig = record
Encoder: PAnsiChar;
Decoder: PAnsiChar;
end;
SherpaOnnxOfflineMoonshineModelConfig = record
Preprocessor: PAnsiChar;
Encoder: PAnsiChar;
UncachedDecoder: PAnsiChar;
CachedDecoder: PAnsiChar;
end;
SherpaOnnxOfflineTdnnModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineLMConfig = record
Model: PAnsiChar;
Scale: cfloat;
end;
{ UseItn is the cint32 form of the Boolean in the interface record. }
SherpaOnnxOfflineSenseVoiceModelConfig = record
Model: PAnsiChar;
Language: PAnsiChar;
UseItn: cint32;
end;
{ C-layout counterpart of TSherpaOnnxOfflineModelConfig: the same fields in
the same order, with AnsiString replaced by PAnsiChar and Integer / Boolean
by cint32. Embedded in SherpaOnnxOfflineRecognizerConfig.ModelConfig.
NOTE(review): presumably mirrors the C struct of the same name; new model
entries must be added in the position the C header dictates - confirm. }
SherpaOnnxOfflineModelConfig = record
Transducer: SherpaOnnxOfflineTransducerModelConfig;
Paraformer: SherpaOnnxOfflineParaformerModelConfig;
NeMoCtc: SherpaOnnxOfflineNemoEncDecCtcModelConfig;
Whisper: SherpaOnnxOfflineWhisperModelConfig;
Tdnn: SherpaOnnxOfflineTdnnModelConfig;
Tokens: PAnsiChar;
NumThreads: cint32;
Debug: cint32;
Provider: PAnsiChar;
ModelType: PAnsiChar;
ModelingUnit: PAnsiChar;
BpeVocab: PAnsiChar;
TeleSpeechCtc: PAnsiChar;
SenseVoice: SherpaOnnxOfflineSenseVoiceModelConfig;
Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig;
Dolphin: SherpaOnnxOfflineDolphinModelConfig;
ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig;
Canary: SherpaOnnxOfflineCanaryModelConfig;
WenetCtc: SherpaOnnxOfflineWenetCtcModelConfig;
Omnilingual: SherpaOnnxOfflineOmnilingualAsrCtcModelConfig;
MedAsr: SherpaOnnxOfflineMedAsrCtcModelConfig;
end;
{ C-layout counterpart of TSherpaOnnxOfflineRecognizerConfig: the same fields
in the same order, with AnsiString replaced by PAnsiChar, Single by cfloat
and Integer by cint32.
NOTE(review): presumably mirrors the C struct of the same name; do not
reorder the fields. }
SherpaOnnxOfflineRecognizerConfig = record
FeatConfig: SherpaOnnxFeatureConfig;
ModelConfig: SherpaOnnxOfflineModelConfig;
LMConfig: SherpaOnnxOfflineLMConfig;
DecodingMethod: PAnsiChar;
MaxActivePaths: cint32;
HotwordsFile: PAnsiChar;
HotwordsScore: cfloat;
RuleFsts: PAnsiChar;
RuleFars: PAnsiChar;
BlankPenalty: cfloat;
Hr: SherpaOnnxHomophoneReplacerConfig;
end;
PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig;
{ C-layout counterparts of the VAD configuration records from the interface
section: the same fields in the same order, with AnsiString replaced by
PAnsiChar, Single by cfloat, and Integer / Boolean by cint32.
PSherpaOnnxVadModelConfig is the parameter type of the external
SherpaOnnxCreateVoiceActivityDetector.
NOTE(review): these presumably mirror the C structs of the same names; do not
reorder the fields. }
SherpaOnnxSileroVadModelConfig = record
Model: PAnsiChar;
Threshold: cfloat;
MinSilenceDuration: cfloat;
MinSpeechDuration: cfloat;
WindowSize: cint32;
MaxSpeechDuration: cfloat;
end;
SherpaOnnxTenVadModelConfig = record
Model: PAnsiChar;
Threshold: cfloat;
MinSilenceDuration: cfloat;
MinSpeechDuration: cfloat;
WindowSize: cint32;
MaxSpeechDuration: cfloat;
end;
SherpaOnnxVadModelConfig = record
SileroVad: SherpaOnnxSileroVadModelConfig;
SampleRate: cint32;
NumThreads: cint32;
Provider: PAnsiChar;
Debug: cint32;
TenVad: SherpaOnnxTenVadModelConfig;
end;
PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig;
{ C-layout speech segment: Start, a raw sample pointer and the sample count N.
Note that the field order (Start first) differs from the public
TSherpaOnnxSpeechSegment, which lists Samples first.
NOTE(review): presumably mirrors the C struct of the same name; do not
reorder the fields. }
SherpaOnnxSpeechSegment = record
Start: cint32;
Samples: pcfloat;
N: cint32;
end;
PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
{ C-layout counterparts of the per-model TTS configuration records from the
interface section (Vits, Matcha, Kokoro, Kitten, ZipVoice): the same fields in
the same order, with AnsiString replaced by PAnsiChar and Single by cfloat.
NOTE(review): these presumably mirror the C structs of the same names; do not
reorder the fields. }
SherpaOnnxOfflineTtsVitsModelConfig = record
Model: PAnsiChar;
Lexicon: PAnsiChar;
Tokens: PAnsiChar;
DataDir: PAnsiChar;
NoiseScale: cfloat;
NoiseScaleW: cfloat;
LengthScale: cfloat;
DictDir: PAnsiChar;
end;
SherpaOnnxOfflineTtsMatchaModelConfig = record
AcousticModel: PAnsiChar;
Vocoder: PAnsiChar;
Lexicon: PAnsiChar;
Tokens: PAnsiChar;
DataDir: PAnsiChar;
NoiseScale: cfloat;
LengthScale: cfloat;
DictDir: PAnsiChar;
end;
SherpaOnnxOfflineTtsKokoroModelConfig = record
Model: PAnsiChar;
Voices: PAnsiChar;
Tokens: PAnsiChar;
DataDir: PAnsiChar;
LengthScale: cfloat;
DictDir: PAnsiChar;
Lexicon: PAnsiChar;
Lang: PAnsiChar;
end;
SherpaOnnxOfflineTtsKittenModelConfig = record
Model: PAnsiChar;
Voices: PAnsiChar;
Tokens: PAnsiChar;
DataDir: PAnsiChar;
LengthScale: cfloat;
end;
SherpaOnnxOfflineTtsZipVoiceModelConfig = record
Tokens: PAnsiChar;
Encoder: PAnsiChar;
Decoder: PAnsiChar;
Vocoder: PAnsiChar;
DataDir: PAnsiChar;
Lexicon: PAnsiChar;
FeatScale: cfloat;
Tshift: cfloat;
TargetRms: cfloat;
GuidanceScale: cfloat;
end;
{ C-layout counterparts of TSherpaOnnxOfflineTtsModelConfig and
TSherpaOnnxOfflineTtsConfig: the same fields in the same order, with
AnsiString replaced by PAnsiChar, Single by cfloat, and Integer / Boolean by
cint32. PSherpaOnnxOfflineTtsConfig is the parameter type of the external
SherpaOnnxCreateOfflineTts.
NOTE(review): these presumably mirror the C structs of the same names; do not
reorder the fields. }
SherpaOnnxOfflineTtsModelConfig = record
Vits: SherpaOnnxOfflineTtsVitsModelConfig;
NumThreads: cint32;
Debug: cint32;
Provider: PAnsiChar;
Matcha: SherpaOnnxOfflineTtsMatchaModelConfig;
Kokoro: SherpaOnnxOfflineTtsKokoroModelConfig;
Kitten: SherpaOnnxOfflineTtsKittenModelConfig;
ZipVoice: SherpaOnnxOfflineTtsZipVoiceModelConfig;
end;
SherpaOnnxOfflineTtsConfig = record
Model: SherpaOnnxOfflineTtsModelConfig;
RuleFsts: PAnsiChar;
MaxNumSentences: cint32;
RuleFars: PAnsiChar;
SilenceScale: cfloat;
end;
PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
{ C-layout result records returned by pointer from the native library.
SherpaOnnxGeneratedAudio is returned by SherpaOnnxOfflineTtsGenerate and
SherpaOnnxOfflineTtsGenerateWithCallbackWithArg; SherpaOnnxResampleOut is
returned by SherpaOnnxLinearResamplerResample. In both, Samples points to N
values.
NOTE(review): presumably the pointers are owned by the native library and
must be released with the matching destroy / free function - confirm. }
SherpaOnnxGeneratedAudio = record
Samples: pcfloat;
N: cint32;
SampleRate: cint32;
end;
PSherpaOnnxGeneratedAudio = ^SherpaOnnxGeneratedAudio;
SherpaOnnxResampleOut = record
Samples: pcfloat;
N: cint32;
end;
PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut;
{ C-layout counterparts of the speaker diarization records from the interface
section: the same fields in the same order, with AnsiString replaced by
PAnsiChar, Single by cfloat, and Integer / Boolean by cint32.
PSherpaOnnxOfflineSpeakerDiarizationConfig is the parameter type of the
external SherpaOnnxCreateOfflineSpeakerDiarization and
SherpaOnnxOfflineSpeakerDiarizationSetConfig.
NOTE(review): these presumably mirror the C structs of the same names; do not
reorder the fields. }
SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineSpeakerSegmentationModelConfig = record
Pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
NumThreads: cint32;
Debug: cint32;
Provider: PAnsiChar;
end;
SherpaOnnxFastClusteringConfig = record
NumClusters: cint32;
Threshold: cfloat;
end;
SherpaOnnxSpeakerEmbeddingExtractorConfig = record
Model: PAnsiChar;
NumThreads: cint32;
Debug: cint32;
Provider: PAnsiChar;
end;
SherpaOnnxOfflineSpeakerDiarizationConfig = record
Segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig;
Embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig;
Clustering: SherpaOnnxFastClusteringConfig;
MinDurationOn: cfloat;
MinDurationOff: cfloat;
end;
{ Element type of the array returned by the external
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime. }
SherpaOnnxOfflineSpeakerDiarizationSegment = record
Start: cfloat;
Stop: cfloat;
Speaker: cint32;
end;
PSherpaOnnxOfflineSpeakerDiarizationSegment = ^SherpaOnnxOfflineSpeakerDiarizationSegment;
PSherpaOnnxOfflineSpeakerDiarizationConfig = ^SherpaOnnxOfflineSpeakerDiarizationConfig;
{ C-layout counterparts of the speech denoiser records from the interface
section: the same fields in the same order, with AnsiString replaced by
PAnsiChar and Integer / Boolean by cint32.
PSherpaOnnxOfflineSpeechDenoiserConfig is the parameter type of the external
SherpaOnnxCreateOfflineSpeechDenoiser; PSherpaOnnxDenoisedAudio is the result
type of SherpaOnnxOfflineSpeechDenoiserRun.
NOTE(review): these presumably mirror the C structs of the same names; do not
reorder the fields. }
SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineSpeechDenoiserModelConfig = record
Gtcrn: SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig;
NumThreads: cint32;
Debug: cint32;
Provider: PAnsiChar;
end;
SherpaOnnxOfflineSpeechDenoiserConfig = record
Model: SherpaOnnxOfflineSpeechDenoiserModelConfig;
end;
PSherpaOnnxOfflineSpeechDenoiserConfig = ^SherpaOnnxOfflineSpeechDenoiserConfig;
SherpaOnnxDenoisedAudio = record
Samples: pcfloat;
N: cint32;
SampleRate: cint32;
end;
PSherpaOnnxDenoisedAudio = ^SherpaOnnxDenoisedAudio;
{ cdecl binding to the native function of the same name in SherpaOnnxLibName.
Takes the input and output sample rates in Hz, a filter cutoff in Hz and a
NumZeros value, and returns an opaque pointer.
NOTE(review): the returned pointer is presumably released with
SherpaOnnxDestroyLinearResampler; valid ranges for FilterCutoffHz and
NumZeros are defined by the native library - confirm. }
function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
SampleRateOutHz: cint32;
FilterCutoffHz: cfloat;
NumZeros: cint32): Pointer; cdecl;
external SherpaOnnxLibName;
{ cdecl bindings to the native SherpaOnnxGetVersionStr, SherpaOnnxGetGitSha1
and SherpaOnnxGetGitDate. They are imported under a ...Wrapper identifier
(using the name clause) because the unit exports Pascal functions with the
original names that return AnsiString instead of PAnsiChar. }
function SherpaOnnxGetVersionStrWrapper(): PAnsiChar; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxGetVersionStr';
function SherpaOnnxGetGitSha1Wrapper(): PAnsiChar; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxGetGitSha1';
function SherpaOnnxGetGitDateWrapper(): PAnsiChar; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxGetGitDate';
{ Returns the version string reported by the native library.
  The PAnsiChar obtained from the C side is copied into a managed AnsiString
  by the assignment to Result. }
function SherpaOnnxGetVersionStr(): AnsiString;
var
  NativeText: PAnsiChar;
begin
  NativeText := SherpaOnnxGetVersionStrWrapper();
  Result := NativeText;
end;
{ Returns the git SHA1 string reported by the native library.
  The PAnsiChar obtained from the C side is copied into a managed AnsiString
  by the assignment to Result. }
function SherpaOnnxGetGitSha1(): AnsiString;
var
  NativeText: PAnsiChar;
begin
  NativeText := SherpaOnnxGetGitSha1Wrapper();
  Result := NativeText;
end;
{ Returns the git date string reported by the native library.
  The PAnsiChar obtained from the C side is copied into a managed AnsiString
  by the assignment to Result. }
function SherpaOnnxGetGitDate(): AnsiString;
var
  NativeText: PAnsiChar;
begin
  NativeText := SherpaOnnxGetGitDateWrapper();
  Result := NativeText;
end;
{ cdecl binding to the native function of the same name in SherpaOnnxLibName.
P is the opaque resampler pointer, presumably the one returned by
SherpaOnnxCreateLinearResampler. }
procedure SherpaOnnxDestroyLinearResampler(P: Pointer); cdecl;
external SherpaOnnxLibName;
{ cdecl binding to the native function of the same name in SherpaOnnxLibName.
  P       - opaque resampler pointer
  Samples - pointer to the input samples
  N       - number of input samples
  Flush   - boolean flag passed as an integer
  Returns a pointer to a SherpaOnnxResampleOut record.
  N and Flush are declared as cint32, like every other integer parameter of
  the bindings in this unit, so that their width is fixed at 32 bits and does
  not depend on what the active compiler mode makes of Integer.
  NOTE(review): the result is presumably released with
  SherpaOnnxLinearResamplerResampleFree - confirm. }
function SherpaOnnxLinearResamplerResample(P: Pointer;
  Samples: pcfloat;
  N: cint32;
  Flush: cint32): PSherpaOnnxResampleOut; cdecl;
  external SherpaOnnxLibName;
{ cdecl bindings to the native functions of the same names in
SherpaOnnxLibName. ResampleFree takes the pointer type returned by
SherpaOnnxLinearResamplerResample; Reset takes the opaque resampler pointer. }
procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
external SherpaOnnxLibName;
{ cdecl bindings to the native speech denoiser functions in SherpaOnnxLibName.
Create takes a pointer to the C-layout configuration and returns an opaque
pointer P that the other functions accept. Run takes N samples recorded at
SampleRate and returns a pointer to a SherpaOnnxDenoisedAudio record.
NOTE(review): the Run result is presumably released with
SherpaOnnxDestroyDenoisedAudio, whose parameter is an untyped Pointer -
confirm. }
function SherpaOnnxCreateOfflineSpeechDenoiser(Config: PSherpaOnnxOfflineSpeechDenoiserConfig): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyOfflineSpeechDenoiser(P: Pointer); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOfflineSpeechDenoiserGetSampleRate(P: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOfflineSpeechDenoiserRun(P: Pointer;
Samples: pcfloat; N: cint32;SampleRate: cint32):PSherpaOnnxDenoisedAudio; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyDenoisedAudio(Audio: Pointer); cdecl;
external SherpaOnnxLibName;
{ cdecl bindings to the native speaker diarization functions in
SherpaOnnxLibName. Create takes a pointer to the C-layout configuration and
returns an opaque pointer. The two Process functions return an opaque result
pointer; the Result* functions take such a pointer and report the number of
segments and a pointer to the segment records.
NOTE(review): presumably the result pointer is released with
...DestroyResult and the segment array with ...DestroySegment - confirm. }
function SherpaOnnxCreateOfflineSpeakerDiarization(Config: PSherpaOnnxOfflineSpeakerDiarizationConfig): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyOfflineSpeakerDiarization(P: Pointer); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(P: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxOfflineSpeakerDiarizationSetConfig(P: Pointer; Config: PSherpaOnnxOfflineSpeakerDiarizationConfig); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(P: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(P: Pointer): PSherpaOnnxOfflineSpeakerDiarizationSegment; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxOfflineSpeakerDiarizationDestroySegment(P: Pointer); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOfflineSpeakerDiarizationProcess(P: Pointer; Samples: pcfloat; N: cint32): Pointer; cdecl;
external SherpaOnnxLibName;
{ Same as Process, with an additional progress callback. }
function SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(P: Pointer;
Samples: pcfloat; N: cint32; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxOfflineSpeakerDiarizationDestroyResult(P: Pointer); cdecl;
external SherpaOnnxLibName;
{ Raw cdecl bindings for offline text-to-speech, imported from the library
  named by SherpaOnnxLibName. Tts is the opaque handle returned by
  SherpaOnnxCreateOfflineTts. }
function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyOfflineTts(Tts: Pointer); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOfflineTtsSampleRate(Tts: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOfflineTtsNumSpeakers(Tts: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
{ Takes the text as a C string, a speaker id and a speed factor; returns a
  PSherpaOnnxGeneratedAudio. }
function SherpaOnnxOfflineTtsGenerate(Tts: Pointer;
Text: PAnsiChar; Sid: cint32; Speed: cfloat): PSherpaOnnxGeneratedAudio; cdecl;
external SherpaOnnxLibName;
{ Same inputs as SherpaOnnxOfflineTtsGenerate plus a callback pointer and an
  opaque Arg pointer (the callback type also declares an Arg parameter). }
function SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Tts: Pointer;
Text: PAnsiChar; Sid: cint32; Speed: cfloat;
Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
Arg: Pointer): PSherpaOnnxGeneratedAudio; cdecl;
external SherpaOnnxLibName;
{ Presumably releases audio returned by the Generate functions (inferred
  from the name). }
procedure SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio: Pointer); cdecl;
external SherpaOnnxLibName;
{ Raw cdecl bindings for the voice activity detector, imported from the
  library named by SherpaOnnxLibName. Vad is the opaque handle returned by
  SherpaOnnxCreateVoiceActivityDetector. }
function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
BufferSizeInSeconds: cfloat): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyVoiceActivityDetector(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorAcceptWaveform(Vad: Pointer;
Samples: pcfloat; N: cint32); cdecl;
external SherpaOnnxLibName;
{ Empty and Detected return cint32 values; presumably 0/1 flags (confirm
  against the C header). }
function SherpaOnnxVoiceActivityDetectorEmpty(Vad: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
function SherpaOnnxVoiceActivityDetectorDetected(Vad: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorPop(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorClear(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
{ Returns a PSherpaOnnxSpeechSegment; SherpaOnnxDestroySpeechSegment accepts
  the same pointer type. }
function SherpaOnnxVoiceActivityDetectorFront(Vad: Pointer): PSherpaOnnxSpeechSegment; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroySpeechSegment(P: PSherpaOnnxSpeechSegment); cdecl;
external SherpaOnnxLibName;
{ Raw cdecl bindings that reset / flush a voice activity detector.

  The argument is the opaque detector handle, so it is declared as an untyped
  Pointer like every other SherpaOnnxVoiceActivityDetector* binding. It was
  previously mistyped as PSherpaOnnxSpeechSegment. Both are plain pointers at
  the ABI level, and an untyped Pointer parameter accepts any pointer
  argument, so existing call sites keep compiling. }
procedure SherpaOnnxVoiceActivityDetectorReset(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorFlush(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;
{ Raw cdecl bindings for the circular sample buffer, imported from the
  library named by SherpaOnnxLibName. Buffer is the opaque handle returned
  by SherpaOnnxCreateCircularBuffer. }
function SherpaOnnxCreateCircularBuffer(Capacity: cint32): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyCircularBuffer(Buffer: Pointer) ; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxCircularBufferPush(Buffer: Pointer; Samples: pcfloat; N: cint32); cdecl;
external SherpaOnnxLibName;
{ Returns a float pointer; TSherpaOnnxCircularBuffer.Get copies N values out
  of it and then hands it to SherpaOnnxCircularBufferFree. }
function SherpaOnnxCircularBufferGet(Buffer: Pointer; StartIndex: cint32; N: cint32): pcfloat ; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxCircularBufferFree(P: pcfloat); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxCircularBufferPop(Buffer: Pointer; N: cint32); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxCircularBufferSize(Buffer: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
function SherpaOnnxCircularBufferHead(Buffer: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxCircularBufferReset(Buffer: Pointer); cdecl;
external SherpaOnnxLibName;
{ Raw cdecl bindings for the online (streaming) recognizer, imported from
  the library named by SherpaOnnxLibName. Recognizer and Stream are opaque
  handles returned by the corresponding Create functions. }
function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyOnlineRecognizer(Recognizer: Pointer); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxCreateOnlineStream(Recognizer: Pointer): Pointer; cdecl;
external SherpaOnnxLibName;
function SherpaOnnxCreateOnlineStreamWithHotwords(Recognizer: Pointer; Hotwords: PAnsiChar): Pointer; cdecl;
external SherpaOnnxLibName;
{ NOTE(review): the parameter is named Recognizer, but
  TSherpaOnnxOnlineStream.Destroy passes the stream handle here. }
procedure SherpaOnnxDestroyOnlineStream(Recognizer: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxOnlineStreamAcceptWaveform(Stream: Pointer;
SampleRate: cint32; Samples: pcfloat; N: cint32 ); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxOnlineStreamInputFinished(Stream: Pointer); cdecl;
external SherpaOnnxLibName;
{ The Pascal wrappers treat a return value of 1 from IsOnlineStreamReady and
  OnlineStreamIsEndpoint as True. }
function SherpaOnnxIsOnlineStreamReady(Recognizer: Pointer; Stream: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDecodeOnlineStream(Recognizer: Pointer; Stream: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxOnlineStreamReset(Recognizer: Pointer; Stream: Pointer); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxOnlineStreamIsEndpoint(Recognizer: Pointer; Stream: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
{ Returns the result as a C string containing JSON; GetResult passes that
  pointer to SherpaOnnxDestroyOnlineStreamResultJson after parsing. }
function SherpaOnnxGetOnlineStreamResultAsJson(Recognizer: Pointer; Stream: Pointer): PAnsiChar; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyOnlineStreamResultJson(PJson: PAnsiChar); cdecl;
external SherpaOnnxLibName;
{ Raw cdecl bindings for the offline (non-streaming) recognizer, imported
  from the library named by SherpaOnnxLibName. Recognizer and Stream are
  opaque handles returned by the corresponding Create functions. }
function SherpaOnnxCreateOfflineRecognizer(Config: PSherpaOnnxOfflineRecognizerConfig): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyOfflineRecognizer(Recognizer: Pointer); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxCreateOfflineStream(Recognizer: Pointer): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyOfflineStream(Stream: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxAcceptWaveformOffline(Stream: Pointer;
SampleRate: cint32; Samples: pcfloat; N: cint32); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDecodeOfflineStream(Recognizer: Pointer; Stream: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxOfflineRecognizerSetConfig(Recognizer: Pointer; Config: PSherpaOnnxOfflineRecognizerConfig); cdecl;
external SherpaOnnxLibName;
{ Returns the result as a C string containing JSON; GetResult passes that
  pointer to SherpaOnnxDestroyOfflineStreamResultJson after parsing. }
function SherpaOnnxGetOfflineStreamResultAsJson(Stream: Pointer): PAnsiChar; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl;
external SherpaOnnxLibName;
{ Raw cdecl bindings for wave file I/O. Each is imported under an explicit
  symbol name so that the Pascal-friendly SherpaOnnxReadWave /
  SherpaOnnxWriteWave functions can keep the plain names. }
function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxReadWave';
{ SherpaOnnxWriteWave treats a return value of 1 as success. }
function SherpaOnnxWriteWaveWrapper(Samples: pcfloat; N: cint32;
SampleRate: cint32; Filename: PAnsiChar): cint32; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxWriteWave';
procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl;
external SherpaOnnxLibName name 'SherpaOnnxFreeWave';
{ Writes Samples to the wave file Filename at the given SampleRate through
  the native SherpaOnnxWriteWave function.
  Returns True only when the native call reports 1. }
function SherpaOnnxWriteWave(Filename: AnsiString;
  Samples: array of Single; SampleRate: Integer): Boolean;
var
  Status: cint32;
begin
  Status := SherpaOnnxWriteWaveWrapper(pcfloat(Samples), Length(Samples),
    SampleRate, PAnsiChar(Filename));
  Result := (Status = 1);
end;
{ Reads the wave file Filename through the native library and copies its
  contents into a Pascal-owned TSherpaOnnxWave.
  When the native reader returns nil the result has Samples = nil and
  SampleRate = 0. }
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
var
  Wave: PSherpaOnnxWave;
  Idx: Integer;
begin
  Result.Samples := nil;
  Result.SampleRate := 0;

  Wave := SherpaOnnxReadWaveWrapper(PAnsiChar(Filename));
  if Wave <> nil then
  begin
    Result.SampleRate := Wave^.SampleRate;
    SetLength(Result.Samples, Wave^.NumSamples);
    { Copy sample by sample out of the native buffer before it is freed. }
    for Idx := 0 to Length(Result.Samples) - 1 do
      Result.Samples[Idx] := Wave^.Samples[Idx];
    SherpaOnnxFreeWaveWrapper(Wave);
  end;
end;
{ Human-readable dump of the transducer model paths. }
function TSherpaOnnxOnlineTransducerModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOnlineTransducerModelConfig(Encoder := %s, Decoder := %s, Joiner := %s)';
begin
  Result := Format(Fmt, [Encoder, Decoder, Joiner]);
end;
{ Human-readable dump of the paraformer encoder/decoder paths. }
function TSherpaOnnxOnlineParaformerModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOnlineParaformerModelConfig(Encoder := %s, Decoder := %s)';
begin
  Result := Format(Fmt, [Encoder, Decoder]);
end;
{ Human-readable dump of the Zipformer2 CTC model path. }
function TSherpaOnnxOnlineZipformer2CtcModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOnlineZipformer2CtcModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the NeMo CTC model path. }
function TSherpaOnnxOnlineNemoCtcModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOnlineNemoCtcModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the Tone CTC model path. }
function TSherpaOnnxOnlineToneCtcModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOnlineToneCtcModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the online model configuration, including the
  nested per-model configs.

  Fix: the 'Paraformer' piece was missing the space after its trailing comma,
  so the output read '...,Zipformer2Ctc := ...' while every other field is
  separated by ', '. }
function TSherpaOnnxOnlineModelConfig.ToString: AnsiString;
begin
  Result := Format('TSherpaOnnxOnlineModelConfig(Transducer := %s, ' +
    'Paraformer := %s, ' +
    'Zipformer2Ctc := %s, ' +
    'Tokens := %s, ' +
    'NumThreads := %d, ' +
    'Provider := %s, ' +
    'Debug := %s, ' +
    'ModelType := %s, ' +
    'ModelingUnit := %s, ' +
    'BpeVocab := %s, ' +
    'NemoCtc := %s, ' +
    'ToneCtc := %s)',
    [Self.Transducer.ToString, Self.Paraformer.ToString,
     Self.Zipformer2Ctc.ToString, Self.Tokens,
     Self.NumThreads, Self.Provider, Self.Debug.ToString,
     Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
     Self.NemoCtc.ToString, Self.ToneCtc.ToString
    ]);
end;
{ Human-readable dump of the feature extraction settings. }
function TSherpaOnnxFeatureConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxFeatureConfig(SampleRate := %d, FeatureDim := %d)';
begin
  Result := Format(Fmt, [SampleRate, FeatureDim]);
end;
{ Human-readable dump of the CTC FST decoder settings. }
function TSherpaOnnxOnlineCtcFstDecoderConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOnlineCtcFstDecoderConfig(Graph := %s, MaxActive := %d)';
begin
  Result := Format(Fmt, [Graph, MaxActive]);
end;
{ Human-readable dump of the homophone replacer settings. }
function TSherpaOnnxHomophoneReplacerConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxHomophoneReplacerConfig(Lexicon := %s, RuleFsts := %s)';
begin
  Result := Format(Fmt, [Lexicon, RuleFsts]);
end;
{ Human-readable dump of the whole online recognizer configuration. Nested
  configs are rendered through their own ToString methods; floating point
  fields are printed with one decimal. }
function TSherpaOnnxOnlineRecognizerConfig.ToString: AnsiString;
begin
  Result := Format(
    'TSherpaOnnxOnlineRecognizerConfig(FeatConfig := %s, ' +
    'ModelConfig := %s, ' +
    'DecodingMethod := %s, ' +
    'MaxActivePaths := %d, ' +
    'EnableEndpoint := %s, ' +
    'Rule1MinTrailingSilence := %.1f, ' +
    'Rule2MinTrailingSilence := %.1f, ' +
    'Rule3MinUtteranceLength := %.1f, ' +
    'HotwordsFile := %s, ' +
    'HotwordsScore := %.1f, ' +
    'CtcFstDecoderConfig := %s, ' +
    'RuleFsts := %s, ' +
    'RuleFars := %s, ' +
    'BlankPenalty := %.1f, ' +
    'Hr := %s' +
    ')',
    [FeatConfig.ToString,
     ModelConfig.ToString,
     DecodingMethod,
     MaxActivePaths,
     EnableEndpoint.ToString,
     Rule1MinTrailingSilence,
     Rule2MinTrailingSilence,
     Rule3MinUtteranceLength,
     HotwordsFile,
     HotwordsScore,
     CtcFstDecoderConfig.ToString,
     RuleFsts,
     RuleFars,
     BlankPenalty,
     Hr.ToString]);
end;
{ Human-readable dump of a recognition result. Tokens and timestamps are
  rendered as '[a, b, c]' lists; timestamps use two decimals. }
function TSherpaOnnxOnlineRecognizerResult.ToString: AnsiString;
var
  TokenList: AnsiString;
  TimeList: AnsiString;
  Idx: Integer;
begin
  TokenList := '[';
  for Idx := Low(Self.Tokens) to High(Self.Tokens) do
  begin
    { Separator goes before every element except the first. }
    if Idx > Low(Self.Tokens) then
      TokenList := TokenList + ', ';
    TokenList := TokenList + Self.Tokens[Idx];
  end;
  TokenList := TokenList + ']';

  TimeList := '[';
  for Idx := Low(Self.Timestamps) to High(Self.Timestamps) do
  begin
    if Idx > Low(Self.Timestamps) then
      TimeList := TimeList + ', ';
    TimeList := TimeList + Format('%.2f', [Self.Timestamps[Idx]]);
  end;
  TimeList := TimeList + ']';

  Result := Format('TSherpaOnnxOnlineRecognizerResult(Text := %s, ' +
    'Tokens := %s, ' +
    'Timestamps := %s' +
    ')',
    [Self.Text, TokenList, TimeList]);
end;
{ Creates the native online recognizer from a Pascal-side configuration.

  Every field of Config is mirrored into the C-layout record; strings are
  passed as PAnsiChar views of the AnsiStrings held by Config, so they are
  only valid while Config is alive (it is for the duration of this call).
  The Pascal configuration is kept in _Config afterwards. }
constructor TSherpaOnnxOnlineRecognizer.Create(Config: TSherpaOnnxOnlineRecognizerConfig);
var
  Native: SherpaOnnxOnlineRecognizerConfig;
begin
  Native := Default(SherpaOnnxOnlineRecognizerConfig);

  { Feature extraction }
  Native.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
  Native.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;

  { Model files }
  Native.ModelConfig.Transducer.Encoder := PAnsiChar(Config.ModelConfig.Transducer.Encoder);
  Native.ModelConfig.Transducer.Decoder := PAnsiChar(Config.ModelConfig.Transducer.Decoder);
  Native.ModelConfig.Transducer.Joiner := PAnsiChar(Config.ModelConfig.Transducer.Joiner);
  Native.ModelConfig.Paraformer.Encoder := PAnsiChar(Config.ModelConfig.Paraformer.Encoder);
  Native.ModelConfig.Paraformer.Decoder := PAnsiChar(Config.ModelConfig.Paraformer.Decoder);
  Native.ModelConfig.Zipformer2Ctc.Model := PAnsiChar(Config.ModelConfig.Zipformer2Ctc.Model);
  Native.ModelConfig.NemoCtc.Model := PAnsiChar(Config.ModelConfig.NemoCtc.Model);
  Native.ModelConfig.ToneCtc.Model := PAnsiChar(Config.ModelConfig.ToneCtc.Model);

  { Model-wide settings }
  Native.ModelConfig.Tokens := PAnsiChar(Config.ModelConfig.Tokens);
  Native.ModelConfig.NumThreads := Config.ModelConfig.NumThreads;
  Native.ModelConfig.Provider := PAnsiChar(Config.ModelConfig.Provider);
  Native.ModelConfig.Debug := Ord(Config.ModelConfig.Debug);
  Native.ModelConfig.ModelType := PAnsiChar(Config.ModelConfig.ModelType);
  Native.ModelConfig.ModelingUnit := PAnsiChar(Config.ModelConfig.ModelingUnit);
  Native.ModelConfig.BpeVocab := PAnsiChar(Config.ModelConfig.BpeVocab);

  { Decoding }
  Native.DecodingMethod := PAnsiChar(Config.DecodingMethod);
  Native.MaxActivePaths := Config.MaxActivePaths;
  Native.BlankPenalty := Config.BlankPenalty;
  Native.CtcFstDecoderConfig.Graph := PAnsiChar(Config.CtcFstDecoderConfig.Graph);
  Native.CtcFstDecoderConfig.MaxActive := Config.CtcFstDecoderConfig.MaxActive;

  { Endpointing; the Boolean is passed as its ordinal value }
  Native.EnableEndpoint := Ord(Config.EnableEndpoint);
  Native.Rule1MinTrailingSilence := Config.Rule1MinTrailingSilence;
  Native.Rule2MinTrailingSilence := Config.Rule2MinTrailingSilence;
  Native.Rule3MinUtteranceLength := Config.Rule3MinUtteranceLength;

  { Hotwords }
  Native.HotwordsFile := PAnsiChar(Config.HotwordsFile);
  Native.HotwordsScore := Config.HotwordsScore;

  { Rule FSTs/FARs and homophone replacer }
  Native.RuleFsts := PAnsiChar(Config.RuleFsts);
  Native.RuleFars := PAnsiChar(Config.RuleFars);
  Native.Hr.Lexicon := PAnsiChar(Config.Hr.Lexicon);
  Native.Hr.RuleFsts := PAnsiChar(Config.Hr.RuleFsts);

  Self.Handle := SherpaOnnxCreateOnlineRecognizer(@Native);
  Self._Config := Config;
end;
{ Releases the native recognizer and then runs the inherited destructor.

  The handle is only released when it is non-nil, so destroying an instance
  whose native object was never created does not hand nil to the native
  library. }
destructor TSherpaOnnxOnlineRecognizer.Destroy;
begin
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOnlineRecognizer(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Creates a new native stream for this recognizer and wraps it in a
  TSherpaOnnxOnlineStream. }
function TSherpaOnnxOnlineRecognizer.CreateStream: TSherpaOnnxOnlineStream;
begin
  Result := TSherpaOnnxOnlineStream.Create(
    SherpaOnnxCreateOnlineStream(Handle));
end;
{ Creates a new native stream with the given hotwords string and wraps it in
  a TSherpaOnnxOnlineStream. }
function TSherpaOnnxOnlineRecognizer.CreateStream(Hotwords: AnsiString): TSherpaOnnxOnlineStream;
var
  NativeStream: Pointer;
begin
  NativeStream := SherpaOnnxCreateOnlineStreamWithHotwords(Handle,
    PAnsiChar(Hotwords));
  Result := TSherpaOnnxOnlineStream.Create(NativeStream);
end;
{ Returns True when the native library reports 1 for
  SherpaOnnxIsOnlineStreamReady on this recognizer/stream pair. }
function TSherpaOnnxOnlineRecognizer.IsReady(Stream: TSherpaOnnxOnlineStream): Boolean;
var
  Status: cint32;
begin
  Status := SherpaOnnxIsOnlineStreamReady(Handle, Stream.Handle);
  Result := (Status = 1);
end;
{ Forwards to SherpaOnnxDecodeOnlineStream for the given stream. }
procedure TSherpaOnnxOnlineRecognizer.Decode(Stream: TSherpaOnnxOnlineStream);
begin
  SherpaOnnxDecodeOnlineStream(Handle, Stream.Handle);
end;
{ Forwards to SherpaOnnxOnlineStreamReset for the given stream. }
procedure TSherpaOnnxOnlineRecognizer.Reset(Stream: TSherpaOnnxOnlineStream);
begin
  SherpaOnnxOnlineStreamReset(Handle, Stream.Handle);
end;
{ Returns True when the native library reports 1 for
  SherpaOnnxOnlineStreamIsEndpoint on this recognizer/stream pair. }
function TSherpaOnnxOnlineRecognizer.IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean;
var
  Status: cint32;
begin
  Status := SherpaOnnxOnlineStreamIsEndpoint(Handle, Stream.Handle);
  Result := (Status = 1);
end;
{ Fetches the current recognition result of Stream.

  The native library returns the result as a JSON C string. It is parsed with
  fpjson and its 'text', 'tokens' and 'timestamps' members are copied into
  the returned record.

  Fixes over the previous version:
  - the TJSONData tree returned by GetJSON is now freed (it used to leak on
    every call);
  - the native JSON string is released even when parsing raises.

  See:
  - https://www.freepascal.org/daily/doc/fcl/fpjson/getjson.html
  - https://www.freepascal.org/daily/doc/fcl/fpjson/tjsondata.html
  - https://www.freepascal.org/daily/doc/fcl/fpjson/tjsonobject.html }
function TSherpaOnnxOnlineRecognizer.GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult;
var
  pJson: PAnsiChar;
  JsonData: TJSONData;
  JsonObject: TJSONObject;
  JsonArray: TJSONArray;
  I: Integer;
begin
  pJson := SherpaOnnxGetOnlineStreamResultAsJson(Self.Handle, Stream.Handle);
  try
    { AnsiString(pJson) copies the text, so the native buffer can be
      released as soon as parsing has finished (or failed). }
    JsonData := GetJSON(AnsiString(pJson), False);
  finally
    SherpaOnnxDestroyOnlineStreamResultJson(pJson);
  end;

  try
    JsonObject := JsonData as TJSONObject;

    Result.Text := JsonObject.Strings['text'];

    JsonArray := JsonObject.Arrays['tokens'];
    SetLength(Result.Tokens, JsonArray.Count);
    for I := 0 to JsonArray.Count - 1 do
      Result.Tokens[I] := JsonArray.Items[I].AsString;

    JsonArray := JsonObject.Arrays['timestamps'];
    SetLength(Result.Timestamps, JsonArray.Count);
    for I := 0 to JsonArray.Count - 1 do
      Result.Timestamps[I] := JsonArray.Items[I].AsFloat;
  finally
    { The parsed tree is owned by the caller of GetJSON. }
    JsonData.Free;
  end;
end;
{ Wraps an already-created native stream handle; the handle is later passed
  to SherpaOnnxDestroyOnlineStream by Destroy. }
constructor TSherpaOnnxOnlineStream.Create(P: Pointer);
begin
  Handle := P;
end;
{ Releases the native stream and then runs the inherited destructor.
  A nil handle is skipped rather than being passed to the native library. }
destructor TSherpaOnnxOnlineStream.Destroy;
begin
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOnlineStream(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Passes all of Samples, together with their SampleRate, to the native
  stream. }
procedure TSherpaOnnxOnlineStream.AcceptWaveform(Samples: array of Single; SampleRate: Integer);
var
  Count: Integer;
begin
  Count := Length(Samples);
  SherpaOnnxOnlineStreamAcceptWaveform(Handle, SampleRate,
    pcfloat(Samples), Count);
end;
{ Forwards to SherpaOnnxOnlineStreamInputFinished for this stream. }
procedure TSherpaOnnxOnlineStream.InputFinished;
begin
  SherpaOnnxOnlineStreamInputFinished(Handle);
end;
{ Human-readable dump of the offline transducer model paths. }
function TSherpaOnnxOfflineTransducerModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineTransducerModelConfig(' +
    'Encoder := %s, ' +
    'Decoder := %s, ' +
    'Joiner := %s' +
    ')';
begin
  Result := Format(Fmt, [Encoder, Decoder, Joiner]);
end;
{ Human-readable dump of the offline paraformer model path. }
function TSherpaOnnxOfflineParaformerModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineParaformerModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the NeMo EncDec CTC model path. }
function TSherpaOnnxOfflineNemoEncDecCtcModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineNemoEncDecCtcModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the Dolphin model path. }
function TSherpaOnnxOfflineDolphinModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineDolphinModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the Zipformer CTC model path. }
function TSherpaOnnxOfflineZipformerCtcModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineZipformerCtcModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the WeNet CTC model path. }
function TSherpaOnnxOfflineWenetCtcModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineWenetCtcModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the Omnilingual ASR CTC model path. }
function TSherpaOnnxOfflineOmnilingualAsrCtcModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineOmnilingualAsrCtcModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the MedAsr CTC model path. }
function TSherpaOnnxOfflineMedAsrCtcModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineMedAsrCtcModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the Whisper model settings. }
function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineWhisperModelConfig(' +
    'Encoder := %s, ' +
    'Decoder := %s, ' +
    'Language := %s, ' +
    'Task := %s, ' +
    'TailPaddings := %d' +
    ')';
begin
  Result := Format(Fmt, [Encoder, Decoder, Language, Task, TailPaddings]);
end;
{ Human-readable dump of the Canary model settings. }
function TSherpaOnnxOfflineCanaryModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineCanaryModelConfig(' +
    'Encoder := %s, ' +
    'Decoder := %s, ' +
    'SrcLang := %s, ' +
    'TgtLang := %s, ' +
    'UsePnc := %s' +
    ')';
begin
  Result := Format(Fmt, [Encoder, Decoder, SrcLang, TgtLang, UsePnc.ToString]);
end;
{ Human-readable dump of the FireRedAsr encoder/decoder paths. }
function TSherpaOnnxOfflineFireRedAsrModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineFireRedAsrModelConfig(' +
    'Encoder := %s, ' +
    'Decoder := %s)';
begin
  Result := Format(Fmt, [Encoder, Decoder]);
end;
{ Human-readable dump of the Moonshine model paths. }
function TSherpaOnnxOfflineMoonshineModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineMoonshineModelConfig(' +
    'Preprocessor := %s, ' +
    'Encoder := %s, ' +
    'UncachedDecoder := %s, ' +
    'CachedDecoder := %s)';
begin
  Result := Format(Fmt,
    [Preprocessor, Encoder, UncachedDecoder, CachedDecoder]);
end;
{ Human-readable dump of the TDNN model path. }
function TSherpaOnnxOfflineTdnnModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineTdnnModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Model]);
end;
{ Human-readable dump of the language model settings; Scale is printed with
  one decimal. }
function TSherpaOnnxOfflineLMConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineLMConfig(' +
    'Model := %s, ' +
    'Scale := %.1f' +
    ')';
begin
  Result := Format(Fmt, [Model, Scale]);
end;
{ Human-readable dump of the SenseVoice model settings. }
function TSherpaOnnxOfflineSenseVoiceModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineSenseVoiceModelConfig(' +
    'Model := %s, ' +
    'Language := %s, ' +
    'UseItn := %s' +
    ')';
begin
  Result := Format(Fmt, [Model, Language, UseItn.ToString]);
end;
{ Human-readable dump of the offline model configuration. Nested per-model
  configs are rendered through their own ToString methods, in the same order
  as the placeholders in the format string. }
function TSherpaOnnxOfflineModelConfig.ToString: AnsiString;
begin
  Result := Format(
    'TSherpaOnnxOfflineModelConfig(' +
    'Transducer := %s, ' +
    'Paraformer := %s, ' +
    'NeMoCtc := %s, ' +
    'Whisper := %s, ' +
    'Tdnn := %s, ' +
    'Tokens := %s, ' +
    'NumThreads := %d, ' +
    'Debug := %s, ' +
    'Provider := %s, ' +
    'ModelType := %s, ' +
    'ModelingUnit := %s, ' +
    'BpeVocab := %s, ' +
    'TeleSpeechCtc := %s, ' +
    'SenseVoice := %s, ' +
    'Moonshine := %s, ' +
    'FireRedAsr := %s, ' +
    'Dolphin := %s, ' +
    'ZipformerCtc := %s, ' +
    'Canary := %s, ' +
    'WenetCtc := %s, ' +
    'Omnilingual := %s, ' +
    'MedAsr := %s' +
    ')',
    [Transducer.ToString,
     Paraformer.ToString,
     NeMoCtc.ToString,
     Whisper.ToString,
     Tdnn.ToString,
     Tokens,
     NumThreads,
     Debug.ToString,
     Provider,
     ModelType,
     ModelingUnit,
     BpeVocab,
     TeleSpeechCtc,
     SenseVoice.ToString,
     Moonshine.ToString,
     FireRedAsr.ToString,
     Dolphin.ToString,
     ZipformerCtc.ToString,
     Canary.ToString,
     WenetCtc.ToString,
     Omnilingual.ToString,
     MedAsr.ToString]);
end;
{ Human-readable dump of the whole offline recognizer configuration. Nested
  configs are rendered through their own ToString methods; floating point
  fields are printed with one decimal.

  Fix: BlankPenalty used the specifier '%1.f' (width 1, precision 0), which
  printed the value without decimals. It now uses '%.1f', matching
  HotwordsScore here and BlankPenalty in the online recognizer config. }
function TSherpaOnnxOfflineRecognizerConfig.ToString: AnsiString;
begin
  Result := Format('TSherpaOnnxOfflineRecognizerConfig(' +
    'FeatConfig := %s, ' +
    'ModelConfig := %s, ' +
    'LMConfig := %s, ' +
    'DecodingMethod := %s, ' +
    'MaxActivePaths := %d, ' +
    'HotwordsFile := %s, ' +
    'HotwordsScore := %.1f, ' +
    'RuleFsts := %s, ' +
    'RuleFars := %s, ' +
    'BlankPenalty := %.1f, ' +
    'Hr := %s' +
    ')',
    [Self.FeatConfig.ToString, Self.ModelConfig.ToString,
     Self.LMConfig.ToString, Self.DecodingMethod, Self.MaxActivePaths,
     Self.HotwordsFile, Self.HotwordsScore, Self.RuleFsts, Self.RuleFars,
     Self.BlankPenalty, Self.Hr.ToString
    ]);
end;
{ Mirrors a Pascal-side offline recognizer configuration into the C-layout
  record expected by the native library.

  String fields are PAnsiChar views of the AnsiStrings in Config, not copies.
  NOTE(review): the returned record is therefore presumably only safe to use
  while the caller's configuration strings stay alive and unmodified. }
function ConvertOfflineRecognizerConfig(Config: TSherpaOnnxOfflineRecognizerConfig): SherpaOnnxOfflineRecognizerConfig;
begin
  Result := Default(SherpaOnnxOfflineRecognizerConfig);

  { Feature extraction }
  Result.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
  Result.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;

  { Per-model settings }
  Result.ModelConfig.Transducer.Encoder := PAnsiChar(Config.ModelConfig.Transducer.Encoder);
  Result.ModelConfig.Transducer.Decoder := PAnsiChar(Config.ModelConfig.Transducer.Decoder);
  Result.ModelConfig.Transducer.Joiner := PAnsiChar(Config.ModelConfig.Transducer.Joiner);

  Result.ModelConfig.Paraformer.Model := PAnsiChar(Config.ModelConfig.Paraformer.Model);
  Result.ModelConfig.NeMoCtc.Model := PAnsiChar(Config.ModelConfig.NeMoCtc.Model);

  Result.ModelConfig.Whisper.Encoder := PAnsiChar(Config.ModelConfig.Whisper.Encoder);
  Result.ModelConfig.Whisper.Decoder := PAnsiChar(Config.ModelConfig.Whisper.Decoder);
  Result.ModelConfig.Whisper.Language := PAnsiChar(Config.ModelConfig.Whisper.Language);
  Result.ModelConfig.Whisper.Task := PAnsiChar(Config.ModelConfig.Whisper.Task);
  Result.ModelConfig.Whisper.TailPaddings := Config.ModelConfig.Whisper.TailPaddings;

  Result.ModelConfig.Tdnn.Model := PAnsiChar(Config.ModelConfig.Tdnn.Model);
  Result.ModelConfig.TeleSpeechCtc := PAnsiChar(Config.ModelConfig.TeleSpeechCtc);

  Result.ModelConfig.SenseVoice.Model := PAnsiChar(Config.ModelConfig.SenseVoice.Model);
  Result.ModelConfig.SenseVoice.Language := PAnsiChar(Config.ModelConfig.SenseVoice.Language);
  Result.ModelConfig.SenseVoice.UseItn := Ord(Config.ModelConfig.SenseVoice.UseItn);

  Result.ModelConfig.Moonshine.Preprocessor := PAnsiChar(Config.ModelConfig.Moonshine.Preprocessor);
  Result.ModelConfig.Moonshine.Encoder := PAnsiChar(Config.ModelConfig.Moonshine.Encoder);
  Result.ModelConfig.Moonshine.UncachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.UncachedDecoder);
  Result.ModelConfig.Moonshine.CachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.CachedDecoder);

  Result.ModelConfig.FireRedAsr.Encoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Encoder);
  Result.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder);

  Result.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model);
  Result.ModelConfig.ZipformerCtc.Model := PAnsiChar(Config.ModelConfig.ZipformerCtc.Model);

  Result.ModelConfig.Canary.Encoder := PAnsiChar(Config.ModelConfig.Canary.Encoder);
  Result.ModelConfig.Canary.Decoder := PAnsiChar(Config.ModelConfig.Canary.Decoder);
  Result.ModelConfig.Canary.SrcLang := PAnsiChar(Config.ModelConfig.Canary.SrcLang);
  Result.ModelConfig.Canary.TgtLang := PAnsiChar(Config.ModelConfig.Canary.TgtLang);
  Result.ModelConfig.Canary.UsePnc := Ord(Config.ModelConfig.Canary.UsePnc);

  Result.ModelConfig.WenetCtc.Model := PAnsiChar(Config.ModelConfig.WenetCtc.Model);
  Result.ModelConfig.Omnilingual.Model := PAnsiChar(Config.ModelConfig.Omnilingual.Model);
  Result.ModelConfig.MedAsr.Model := PAnsiChar(Config.ModelConfig.MedAsr.Model);

  { Model-wide settings; Booleans are passed as their ordinal value }
  Result.ModelConfig.Tokens := PAnsiChar(Config.ModelConfig.Tokens);
  Result.ModelConfig.NumThreads := Config.ModelConfig.NumThreads;
  Result.ModelConfig.Debug := Ord(Config.ModelConfig.Debug);
  Result.ModelConfig.Provider := PAnsiChar(Config.ModelConfig.Provider);
  Result.ModelConfig.ModelType := PAnsiChar(Config.ModelConfig.ModelType);
  Result.ModelConfig.ModelingUnit := PAnsiChar(Config.ModelConfig.ModelingUnit);
  Result.ModelConfig.BpeVocab := PAnsiChar(Config.ModelConfig.BpeVocab);

  { Language model }
  Result.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
  Result.LMConfig.Scale := Config.LMConfig.Scale;

  { Decoding, hotwords, rule FSTs/FARs and homophone replacer }
  Result.DecodingMethod := PAnsiChar(Config.DecodingMethod);
  Result.MaxActivePaths := Config.MaxActivePaths;
  Result.HotwordsFile := PAnsiChar(Config.HotwordsFile);
  Result.HotwordsScore := Config.HotwordsScore;
  Result.RuleFsts := PAnsiChar(Config.RuleFsts);
  Result.RuleFars := PAnsiChar(Config.RuleFars);
  Result.BlankPenalty := Config.BlankPenalty;
  Result.Hr.Lexicon := PAnsiChar(Config.Hr.Lexicon);
  Result.Hr.RuleFsts := PAnsiChar(Config.Hr.RuleFsts);
end;
{ Creates the native offline recognizer from Config and keeps the Pascal
  configuration in _Config. }
constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecognizerConfig);
var
  Native: SherpaOnnxOfflineRecognizerConfig;
begin
  Native := ConvertOfflineRecognizerConfig(Config);
  Handle := SherpaOnnxCreateOfflineRecognizer(@Native);
  _Config := Config;
end;
{ Pushes a new configuration to the native recognizer.
  The stored _Config is intentionally left untouched. }
procedure TSherpaOnnxOfflineRecognizer.SetConfig(Config: TSherpaOnnxOfflineRecognizerConfig);
var
  Native: SherpaOnnxOfflineRecognizerConfig;
begin
  Native := ConvertOfflineRecognizerConfig(Config);
  SherpaOnnxOfflineRecognizerSetConfig(Handle, @Native);
end;
{ Releases the native recognizer and then runs the inherited destructor.
  A nil handle is skipped rather than being passed to the native library. }
destructor TSherpaOnnxOfflineRecognizer.Destroy;
begin
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOfflineRecognizer(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Creates a new native stream for this recognizer and wraps it in a
  TSherpaOnnxOfflineStream. }
function TSherpaOnnxOfflineRecognizer.CreateStream: TSherpaOnnxOfflineStream;
begin
  Result := TSherpaOnnxOfflineStream.Create(
    SherpaOnnxCreateOfflineStream(Handle));
end;
{ Forwards to SherpaOnnxDecodeOfflineStream for the given stream. }
procedure TSherpaOnnxOfflineRecognizer.Decode(Stream: TSherpaOnnxOfflineStream);
begin
  SherpaOnnxDecodeOfflineStream(Handle, Stream.Handle);
end;
{ Fetches the recognition result of Stream.

  The native library returns the result as a JSON C string. It is parsed with
  fpjson and its 'text', 'tokens' and 'timestamps' members are copied into
  the returned record.

  Fixes over the previous version:
  - the TJSONData tree returned by GetJSON is now freed (it used to leak on
    every call);
  - the native JSON string is released even when parsing raises. }
function TSherpaOnnxOfflineRecognizer.GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult;
var
  pJson: PAnsiChar;
  JsonData: TJSONData;
  JsonObject: TJSONObject;
  JsonArray: TJSONArray;
  I: Integer;
begin
  pJson := SherpaOnnxGetOfflineStreamResultAsJson(Stream.Handle);
  try
    { AnsiString(pJson) copies the text, so the native buffer can be
      released as soon as parsing has finished (or failed). }
    JsonData := GetJSON(AnsiString(pJson), False);
  finally
    SherpaOnnxDestroyOfflineStreamResultJson(pJson);
  end;

  try
    JsonObject := JsonData as TJSONObject;

    Result.Text := JsonObject.Strings['text'];

    JsonArray := JsonObject.Arrays['tokens'];
    SetLength(Result.Tokens, JsonArray.Count);
    for I := 0 to JsonArray.Count - 1 do
      Result.Tokens[I] := JsonArray.Items[I].AsString;

    JsonArray := JsonObject.Arrays['timestamps'];
    SetLength(Result.Timestamps, JsonArray.Count);
    for I := 0 to JsonArray.Count - 1 do
      Result.Timestamps[I] := JsonArray.Items[I].AsFloat;
  finally
    { The parsed tree is owned by the caller of GetJSON. }
    JsonData.Free;
  end;
end;
{ Wraps an already-created native stream handle; the handle is later passed
  to SherpaOnnxDestroyOfflineStream by Destroy. }
constructor TSherpaOnnxOfflineStream.Create(P: Pointer);
begin
  Handle := P;
end;
{ Releases the native stream and then runs the inherited destructor.
  A nil handle is skipped rather than being passed to the native library. }
destructor TSherpaOnnxOfflineStream.Destroy;
begin
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOfflineStream(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Passes all of Samples, together with their SampleRate, to the native
  stream. }
procedure TSherpaOnnxOfflineStream.AcceptWaveform(Samples: array of Single; SampleRate: Integer);
var
  Count: Integer;
begin
  Count := Length(Samples);
  SherpaOnnxAcceptWaveformOffline(Handle, SampleRate, pcfloat(Samples), Count);
end;
{ Human-readable dump of a recognition result. Tokens and timestamps are
  rendered as '[a, b, c]' lists; timestamps use two decimals. }
function TSherpaOnnxOfflineRecognizerResult.ToString: AnsiString;
var
  TokenList: AnsiString;
  TimeList: AnsiString;
  Idx: Integer;
begin
  TokenList := '[';
  for Idx := Low(Self.Tokens) to High(Self.Tokens) do
  begin
    { Separator goes before every element except the first. }
    if Idx > Low(Self.Tokens) then
      TokenList := TokenList + ', ';
    TokenList := TokenList + Self.Tokens[Idx];
  end;
  TokenList := TokenList + ']';

  TimeList := '[';
  for Idx := Low(Self.Timestamps) to High(Self.Timestamps) do
  begin
    if Idx > Low(Self.Timestamps) then
      TimeList := TimeList + ', ';
    TimeList := TimeList + Format('%.2f', [Self.Timestamps[Idx]]);
  end;
  TimeList := TimeList + ']';

  Result := Format('TSherpaOnnxOfflineRecognizerResult(Text := %s, ' +
    'Tokens := %s, ' +
    'Timestamps := %s' +
    ')',
    [Self.Text, TokenList, TimeList]);
end;
{ Human-readable dump of the Silero VAD settings; floating point fields are
  printed with two decimals. }
function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxSileroVadModelConfig(' +
    'Model := %s, ' +
    'Threshold := %.2f, ' +
    'MinSilenceDuration := %.2f, ' +
    'MinSpeechDuration := %.2f, ' +
    'WindowSize := %d, ' +
    'MaxSpeechDuration := %.2f' +
    ')';
begin
  Result := Format(Fmt,
    [Model, Threshold, MinSilenceDuration, MinSpeechDuration,
     WindowSize, MaxSpeechDuration]);
end;
{ Human-readable dump of the Ten VAD settings; floating point fields are
  printed with two decimals. }
function TSherpaOnnxTenVadModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxTenVadModelConfig(' +
    'Model := %s, ' +
    'Threshold := %.2f, ' +
    'MinSilenceDuration := %.2f, ' +
    'MinSpeechDuration := %.2f, ' +
    'WindowSize := %d, ' +
    'MaxSpeechDuration := %.2f' +
    ')';
begin
  Result := Format(Fmt,
    [Model, Threshold, MinSilenceDuration, MinSpeechDuration,
     WindowSize, MaxSpeechDuration]);
end;
{ Management operator: fills a freshly created Silero VAD config with its
  default values. }
class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
begin
  Dest.WindowSize := 512;
  Dest.Threshold := 0.5;
  Dest.MinSpeechDuration := 0.25;
  Dest.MinSilenceDuration := 0.5;
  Dest.MaxSpeechDuration := 5.0;
end;
{ Management operator: fills a freshly created Ten VAD config with its
  default values. }
class operator TSherpaOnnxTenVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig);
begin
  Dest.WindowSize := 256;
  Dest.Threshold := 0.5;
  Dest.MinSpeechDuration := 0.25;
  Dest.MinSilenceDuration := 0.5;
  Dest.MaxSpeechDuration := 5.0;
end;
{ Human-readable dump of the VAD configuration, including both nested
  detector configs. }
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxVadModelConfig(' +
    'SileroVad := %s, ' +
    'SampleRate := %d, ' +
    'NumThreads := %d, ' +
    'Provider := %s, ' +
    'Debug := %s, ' +
    'TenVad := %s' +
    ')';
begin
  Result := Format(Fmt,
    [SileroVad.ToString, SampleRate, NumThreads, Provider,
     Debug.ToString, TenVad.ToString]);
end;
{ Management operator: fills a freshly created VAD config with its default
  values. }
class operator TSherpaOnnxVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
  Dest.SampleRate := 16000;
  Dest.Debug := False;
end;
{ Management operator: fills a freshly created feature config with its
  default values. }
class operator TSherpaOnnxFeatureConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
begin
  Dest.FeatureDim := 80;
  Dest.SampleRate := 16000;
end;
{ Management operator: a freshly created CTC FST decoder config starts with
  MaxActive = 3000. }
class operator TSherpaOnnxOnlineCtcFstDecoderConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
begin
  Dest.MaxActive := 3000;
end;
{ Management operator: fills a freshly created online recognizer config with
  its default values. }
class operator TSherpaOnnxOnlineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
begin
  { Decoding }
  Dest.DecodingMethod := 'greedy_search';
  Dest.BlankPenalty := 0;
  Dest.HotwordsScore := 1.5;

  { Endpointing }
  Dest.EnableEndpoint := False;
  Dest.Rule1MinTrailingSilence := 2.4;
  Dest.Rule2MinTrailingSilence := 1.2;
  Dest.Rule3MinUtteranceLength := 20;
end;
{ Management operator: fills a freshly created online model config with its
  default values. }
class operator TSherpaOnnxOnlineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
  Dest.Debug := False;
end;
{ Management operator: fills a freshly created Whisper config with its
  default values. }
class operator TSherpaOnnxOfflineWhisperModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
begin
  Dest.TailPaddings := -1;
  Dest.Task := 'transcribe';
end;
{ Management operator: fills a freshly created Canary config with its
  default values. }
class operator TSherpaOnnxOfflineCanaryModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineCanaryModelConfig);
begin
  Dest.UsePnc := True;
  Dest.SrcLang := 'en';
  Dest.TgtLang := 'en';
end;
{ Management operator: a freshly created LM config starts with Scale = 1.0. }
class operator TSherpaOnnxOfflineLMConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
begin
  Dest.Scale := 1.0;
end;
{ Management operator: a freshly created SenseVoice config starts with
  UseItn = True. }
class operator TSherpaOnnxOfflineSenseVoiceModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
begin
  Dest.UseItn := True;
end;
{ Management operator: a new TSherpaOnnxOfflineModelConfig starts with one
  thread, the 'cpu' provider and debug disabled. }
class operator TSherpaOnnxOfflineModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.Debug := False;
  Dest.NumThreads := 1;
end;
{ Management operator: installs the default decoding settings on a new
  TSherpaOnnxOfflineRecognizerConfig. }
class operator TSherpaOnnxOfflineRecognizerConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
begin
  Dest.BlankPenalty := 0;
  Dest.HotwordsScore := 1.5;
  Dest.MaxActivePaths := 4;
  Dest.DecodingMethod := 'greedy_search';
end;
{ Creates the native circular buffer with the requested Capacity and keeps
  its handle for the lifetime of this object. }
constructor TSherpaOnnxCircularBuffer.Create(Capacity: Integer);
begin
  Handle := SherpaOnnxCreateCircularBuffer(Capacity);
end;
{ Releases the native circular buffer (if one was created) and then runs
  the inherited destructor. }
destructor TSherpaOnnxCircularBuffer.Destroy;
begin
  { Handle can still be nil, e.g. when the destructor runs for an object
    whose native creation did not succeed. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyCircularBuffer(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Appends every element of the open array Samples to the native buffer. }
procedure TSherpaOnnxCircularBuffer.Push(Samples: array of Single);
begin
  SherpaOnnxCircularBufferPush(
    Handle,
    pcfloat(Samples),
    Length(Samples));
end;
{ Appends N samples starting at the raw pointer Samples to the native buffer.
  The pointer and count are forwarded unchanged. }
procedure TSherpaOnnxCircularBuffer.Push(Samples: pcfloat; N: Integer);
begin
  SherpaOnnxCircularBufferPush(Handle, Samples, N);
end;
{ Returns a Pascal-owned copy of N samples starting at StartIndex.
  The native array returned by the C API is freed before returning. }
function TSherpaOnnxCircularBuffer.Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
var
  Native: pcfloat;
  Idx: Integer;
begin
  Result := nil;
  Native := SherpaOnnxCircularBufferGet(Handle, StartIndex, N);

  { Copy element by element into the managed dynamic array. }
  SetLength(Result, N);
  for Idx := 0 to Length(Result) - 1 do
    Result[Idx] := Native[Idx];

  SherpaOnnxCircularBufferFree(Native);
end;
{ Forwards a request to pop N samples to the native circular buffer. }
procedure TSherpaOnnxCircularBuffer.Pop(N: Integer);
begin
  SherpaOnnxCircularBufferPop(Handle, N);
end;
{ Forwards a reset request to the native circular buffer. }
procedure TSherpaOnnxCircularBuffer.Reset;
begin
  SherpaOnnxCircularBufferReset(Handle);
end;
{ Returns the size reported by the native circular buffer. }
function TSherpaOnnxCircularBuffer.Size: Integer;
begin
  Result := SherpaOnnxCircularBufferSize(Handle);
end;
{ Returns the head index reported by the native circular buffer. }
function TSherpaOnnxCircularBuffer.Head: Integer;
begin
  Result := SherpaOnnxCircularBufferHead(Handle);
end;
{ Builds the C-level VAD configuration from the Pascal Config record and
  creates the native detector with the given buffer size (in seconds).
  A copy of Config is stored in _Config. }
constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
var
  NativeCfg: SherpaOnnxVadModelConfig;
begin
  Self._Config := Config;
  NativeCfg := Default(SherpaOnnxVadModelConfig);

  { Settings shared by both VAD models }
  NativeCfg.SampleRate := Config.SampleRate;
  NativeCfg.NumThreads := Config.NumThreads;
  NativeCfg.Provider := PAnsiChar(Config.Provider);
  NativeCfg.Debug := Ord(Config.Debug);

  { Silero VAD }
  NativeCfg.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
  NativeCfg.SileroVad.Threshold := Config.SileroVad.Threshold;
  NativeCfg.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
  NativeCfg.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
  NativeCfg.SileroVad.WindowSize := Config.SileroVad.WindowSize;
  NativeCfg.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;

  { Ten VAD }
  NativeCfg.TenVad.Model := PAnsiChar(Config.TenVad.Model);
  NativeCfg.TenVad.Threshold := Config.TenVad.Threshold;
  NativeCfg.TenVad.MinSilenceDuration := Config.TenVad.MinSilenceDuration;
  NativeCfg.TenVad.MinSpeechDuration := Config.TenVad.MinSpeechDuration;
  NativeCfg.TenVad.WindowSize := Config.TenVad.WindowSize;
  NativeCfg.TenVad.MaxSpeechDuration := Config.TenVad.MaxSpeechDuration;

  Self.Handle := SherpaOnnxCreateVoiceActivityDetector(@NativeCfg, BufferSizeInSeconds);
end;
{ Releases the native voice activity detector (if one was created) and
  then runs the inherited destructor. }
destructor TSherpaOnnxVoiceActivityDetector.Destroy;
begin
  { Handle can still be nil, e.g. when native creation did not succeed. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyVoiceActivityDetector(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Feeds every element of the open array Samples to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single);
begin
  SherpaOnnxVoiceActivityDetectorAcceptWaveform(
    Handle,
    pcfloat(Samples),
    Length(Samples));
end;
{ Feeds N samples of Samples, starting at index Offset, to the native
  detector. When the requested window does not lie inside the array a
  message is written to standard output and nothing is fed. }
procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer);
begin
  { Negative Offset/N would read outside the array, so reject them too.
    Comparing N against the remaining length (instead of computing
    Offset + N) avoids integer overflow for very large arguments. }
  if (Offset < 0) or (N < 0) or (N > Length(Samples) - Offset) then
  begin
    WriteLn(Format('Invalid arguments!. Array length: %d, Offset: %d, N: %d',
      [Length(Samples), Offset, N]
    ));
    Exit;
  end;
  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle,
    pcfloat(Samples) + Offset, N);
end;
{ True when the native "empty" query returns 1. }
function TSherpaOnnxVoiceActivityDetector.IsEmpty: Boolean;
begin
  Result := (SherpaOnnxVoiceActivityDetectorEmpty(Handle) = 1);
end;
{ True when the native "detected" query returns 1. }
function TSherpaOnnxVoiceActivityDetector.IsDetected: Boolean;
begin
  Result := (SherpaOnnxVoiceActivityDetectorDetected(Handle) = 1);
end;
{ Forwards a pop request to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.Pop;
begin
  SherpaOnnxVoiceActivityDetectorPop(Handle);
end;
{ Forwards a clear request to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.Clear;
begin
  SherpaOnnxVoiceActivityDetectorClear(Handle);
end;
{ Returns a Pascal-owned copy (Start plus Samples) of the segment handed
  back by the native "front" call. The native segment is destroyed before
  returning. }
function TSherpaOnnxVoiceActivityDetector.Front: TSherpaOnnxSpeechSegment;
var
  Segment: PSherpaOnnxSpeechSegment;
  Idx: Integer;
begin
  Segment := SherpaOnnxVoiceActivityDetectorFront(Handle);

  Result.Samples := nil;
  SetLength(Result.Samples, Segment^.N);
  Result.Start := Segment^.Start;

  for Idx := 0 to Length(Result.Samples) - 1 do
    Result.Samples[Idx] := Segment^.Samples[Idx];

  SherpaOnnxDestroySpeechSegment(Segment);
end;
{ Forwards a reset request to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.Reset;
begin
  SherpaOnnxVoiceActivityDetectorReset(Handle);
end;
{ Forwards a flush request to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.Flush;
begin
  SherpaOnnxVoiceActivityDetectorFlush(Handle);
end;
{ Renders the VITS model settings as a single human-readable line.
  Floating-point fields are printed with two decimals. }
function TSherpaOnnxOfflineTtsVitsModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsVitsModelConfig(Model := %s, Lexicon := %s, ' +
    'Tokens := %s, DataDir := %s, NoiseScale := %.2f, ' +
    'NoiseScaleW := %.2f, LengthScale := %.2f)';
begin
  Result := Format(Fmt, [
    Self.Model,
    Self.Lexicon,
    Self.Tokens,
    Self.DataDir,
    Self.NoiseScale,
    Self.NoiseScaleW,
    Self.LengthScale]);
end;
{ Management operator: default noise and length scales for a new
  TSherpaOnnxOfflineTtsVitsModelConfig. }
class operator TSherpaOnnxOfflineTtsVitsModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
begin
  Dest.LengthScale := 1.0;
  Dest.NoiseScaleW := 0.8;
  Dest.NoiseScale := 0.667;
end;
{ Renders the Matcha model settings as a single human-readable line.
  Floating-point fields are printed with two decimals. }
function TSherpaOnnxOfflineTtsMatchaModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsMatchaModelConfig(AcousticModel := %s, ' +
    'Vocoder := %s, Lexicon := %s, Tokens := %s, DataDir := %s, ' +
    'NoiseScale := %.2f, LengthScale := %.2f)';
begin
  Result := Format(Fmt, [
    Self.AcousticModel,
    Self.Vocoder,
    Self.Lexicon,
    Self.Tokens,
    Self.DataDir,
    Self.NoiseScale,
    Self.LengthScale]);
end;
{ Management operator: default noise and length scales for a new
  TSherpaOnnxOfflineTtsMatchaModelConfig. }
class operator TSherpaOnnxOfflineTtsMatchaModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
begin
  Dest.LengthScale := 1.0;
  Dest.NoiseScale := 0.667;
end;
{ Renders the Kokoro model settings as a single human-readable line.
  LengthScale is printed with two decimals. }
function TSherpaOnnxOfflineTtsKokoroModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsKokoroModelConfig(Model := %s, Voices := %s, ' +
    'Tokens := %s, DataDir := %s, LengthScale := %.2f, ' +
    'Lexicon := %s, Lang := %s)';
begin
  Result := Format(Fmt, [
    Self.Model,
    Self.Voices,
    Self.Tokens,
    Self.DataDir,
    Self.LengthScale,
    Self.Lexicon,
    Self.Lang]);
end;
{ Management operator: defaults LengthScale to 1.0 for a new
  TSherpaOnnxOfflineTtsKokoroModelConfig. }
class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
begin
  Dest.LengthScale := 1.0;
end;
{ Renders the Kitten model settings as a single human-readable line.
  LengthScale is printed with two decimals. }
function TSherpaOnnxOfflineTtsKittenModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsKittenModelConfig(Model := %s, Voices := %s, ' +
    'Tokens := %s, DataDir := %s, LengthScale := %.2f)';
begin
  Result := Format(Fmt, [
    Self.Model,
    Self.Voices,
    Self.Tokens,
    Self.DataDir,
    Self.LengthScale]);
end;
{ Management operator: defaults LengthScale to 1.0 for a new
  TSherpaOnnxOfflineTtsKittenModelConfig. }
class operator TSherpaOnnxOfflineTtsKittenModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKittenModelConfig);
begin
  Dest.LengthScale := 1.0;
end;
{ Renders the ZipVoice model settings as a single human-readable line.
  Floating-point fields are printed with two decimals. }
function TSherpaOnnxOfflineTtsZipVoiceModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsZipVoiceModelConfig(Tokens := %s, ' +
    'Encoder := %s, Decoder := %s, Vocoder := %s, DataDir := %s, ' +
    'Lexicon := %s, FeatScale := %.2f, Tshift := %.2f, ' +
    'TargetRms := %.2f, GuidanceScale := %.2f)';
begin
  Result := Format(Fmt, [
    Self.Tokens,
    Self.Encoder,
    Self.Decoder,
    Self.Vocoder,
    Self.DataDir,
    Self.Lexicon,
    Self.FeatScale,
    Self.Tshift,
    Self.TargetRms,
    Self.GuidanceScale]);
end;
{ Management operator: default numeric settings for a new
  TSherpaOnnxOfflineTtsZipVoiceModelConfig. }
class operator TSherpaOnnxOfflineTtsZipVoiceModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsZipVoiceModelConfig);
begin
  Dest.GuidanceScale := 1.0;
  Dest.TargetRms := 0.1;
  Dest.Tshift := 0.5;
  Dest.FeatScale := 0.1;
end;
{ Renders the TTS model configuration, including every nested per-model
  record (via its own ToString), as a single human-readable line. }
function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsModelConfig(Vits := %s, NumThreads := %d, ' +
    'Debug := %s, Provider := %s, Matcha := %s, Kokoro := %s, ' +
    'Kitten := %s, ZipVoice := %s)';
begin
  Result := Format(Fmt, [
    Self.Vits.ToString,
    Self.NumThreads,
    Self.Debug.ToString,
    Self.Provider,
    Self.Matcha.ToString,
    Self.Kokoro.ToString,
    Self.Kitten.ToString,
    Self.ZipVoice.ToString]);
end;
{ Management operator: a new TSherpaOnnxOfflineTtsModelConfig starts with
  one thread, the 'cpu' provider and debug disabled. }
class operator TSherpaOnnxOfflineTtsModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.Debug := False;
  Dest.NumThreads := 1;
end;
{ Renders the top-level TTS configuration (nested model config included)
  as a single human-readable line. }
function TSherpaOnnxOfflineTtsConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsConfig(Model := %s, RuleFsts := %s, ' +
    'MaxNumSentences := %d, RuleFars := %s, SilenceScale := %f)';
begin
  Result := Format(Fmt, [
    Self.Model.ToString,
    Self.RuleFsts,
    Self.MaxNumSentences,
    Self.RuleFars,
    Self.SilenceScale]);
end;
{ Management operator: defaults MaxNumSentences to 1 and SilenceScale to 0.2
  for a new TSherpaOnnxOfflineTtsConfig. }
class operator TSherpaOnnxOfflineTtsConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
begin
  Dest.SilenceScale := 0.2;
  Dest.MaxNumSentences := 1;
end;
{ Builds the C-level TTS configuration from the Pascal Config record and
  creates the native TTS engine. A copy of Config is stored in _Config.
  SampleRate and NumSpeakers are read from the native engine; both are
  left at 0 when the engine could not be created (Handle = nil). }
constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
var
  C: SherpaOnnxOfflineTtsConfig;
begin
  C := Default(SherpaOnnxOfflineTtsConfig);
  Self._Config := Config;

  { VITS }
  C.Model.Vits.Model := PAnsiChar(Config.Model.Vits.Model);
  C.Model.Vits.Lexicon := PAnsiChar(Config.Model.Vits.Lexicon);
  C.Model.Vits.Tokens := PAnsiChar(Config.Model.Vits.Tokens);
  C.Model.Vits.DataDir := PAnsiChar(Config.Model.Vits.DataDir);
  C.Model.Vits.NoiseScale := Config.Model.Vits.NoiseScale;
  C.Model.Vits.NoiseScaleW := Config.Model.Vits.NoiseScaleW;
  C.Model.Vits.LengthScale := Config.Model.Vits.LengthScale;

  { Matcha }
  C.Model.Matcha.AcousticModel := PAnsiChar(Config.Model.Matcha.AcousticModel);
  C.Model.Matcha.Vocoder := PAnsiChar(Config.Model.Matcha.Vocoder);
  C.Model.Matcha.Lexicon := PAnsiChar(Config.Model.Matcha.Lexicon);
  C.Model.Matcha.Tokens := PAnsiChar(Config.Model.Matcha.Tokens);
  C.Model.Matcha.DataDir := PAnsiChar(Config.Model.Matcha.DataDir);
  C.Model.Matcha.NoiseScale := Config.Model.Matcha.NoiseScale;
  C.Model.Matcha.LengthScale := Config.Model.Matcha.LengthScale;

  { Kokoro }
  C.Model.Kokoro.Model := PAnsiChar(Config.Model.Kokoro.Model);
  C.Model.Kokoro.Voices := PAnsiChar(Config.Model.Kokoro.Voices);
  C.Model.Kokoro.Tokens := PAnsiChar(Config.Model.Kokoro.Tokens);
  C.Model.Kokoro.DataDir := PAnsiChar(Config.Model.Kokoro.DataDir);
  C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale;
  C.Model.Kokoro.Lexicon := PAnsiChar(Config.Model.Kokoro.Lexicon);
  C.Model.Kokoro.Lang := PAnsiChar(Config.Model.Kokoro.Lang);

  { Kitten }
  C.Model.Kitten.Model := PAnsiChar(Config.Model.Kitten.Model);
  C.Model.Kitten.Voices := PAnsiChar(Config.Model.Kitten.Voices);
  C.Model.Kitten.Tokens := PAnsiChar(Config.Model.Kitten.Tokens);
  C.Model.Kitten.DataDir := PAnsiChar(Config.Model.Kitten.DataDir);
  C.Model.Kitten.LengthScale := Config.Model.Kitten.LengthScale;

  { ZipVoice }
  C.Model.ZipVoice.Tokens := PAnsiChar(Config.Model.ZipVoice.Tokens);
  C.Model.ZipVoice.Encoder := PAnsiChar(Config.Model.ZipVoice.Encoder);
  C.Model.ZipVoice.Decoder := PAnsiChar(Config.Model.ZipVoice.Decoder);
  C.Model.ZipVoice.Vocoder := PAnsiChar(Config.Model.ZipVoice.Vocoder);
  C.Model.ZipVoice.DataDir := PAnsiChar(Config.Model.ZipVoice.DataDir);
  C.Model.ZipVoice.Lexicon := PAnsiChar(Config.Model.ZipVoice.Lexicon);
  C.Model.ZipVoice.FeatScale := Config.Model.ZipVoice.FeatScale;
  C.Model.ZipVoice.Tshift := Config.Model.ZipVoice.Tshift;
  C.Model.ZipVoice.TargetRms := Config.Model.ZipVoice.TargetRms;
  C.Model.ZipVoice.GuidanceScale := Config.Model.ZipVoice.GuidanceScale;

  { Settings shared by all models }
  C.Model.NumThreads := Config.Model.NumThreads;
  C.Model.Provider := PAnsiChar(Config.Model.Provider);
  C.Model.Debug := Ord(Config.Model.Debug);

  { Top-level TTS settings }
  C.RuleFsts := PAnsiChar(Config.RuleFsts);
  C.MaxNumSentences := Config.MaxNumSentences;
  C.RuleFars := PAnsiChar(Config.RuleFars);
  C.SilenceScale := Config.SilenceScale;

  Self.Handle := SherpaOnnxCreateOfflineTts(@C);

  { Only query the engine when it exists; the other constructors in this
    unit that read a sample rate guard the handle the same way. }
  Self.SampleRate := 0;
  Self.NumSpeakers := 0;
  if Self.Handle <> nil then
  begin
    Self.SampleRate := SherpaOnnxOfflineTtsSampleRate(Self.Handle);
    Self.NumSpeakers := SherpaOnnxOfflineTtsNumSpeakers(Self.Handle);
  end;
end;
{ Releases the native TTS engine (if one was created) and then runs the
  inherited destructor. }
destructor TSherpaOnnxOfflineTts.Destroy;
begin
  { Handle can still be nil, e.g. when native creation did not succeed. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOfflineTts(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Synthesizes Text with the given speaker id and speed and returns a
  Pascal-owned copy of the generated samples together with their sample
  rate. When the native call hands back no audio (nil), the
  default-initialized result is returned. }
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single): TSherpaOnnxGeneratedAudio;
var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxGeneratedAudio);
  Audio := SherpaOnnxOfflineTtsGenerate(Self.Handle, PAnsiChar(Text), SpeakerId, Speed);

  { Never dereference a nil result. }
  if Audio = nil then
    Exit;

  { try/finally guarantees the native audio is released even if the copy
    below raises (for example on an allocation failure). }
  try
    SetLength(Result.Samples, Audio^.N);
    Result.SampleRate := Audio^.SampleRate;
    for I := Low(Result.Samples) to High(Result.Samples) do
    begin
      Result.Samples[I] := Audio^.Samples[I];
    end;
  finally
    SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
  end;
end;
{ Same as the three-argument Generate, but additionally forwards Callback
  and Arg to the native "generate with callback" entry point. Returns a
  Pascal-owned copy of the generated samples and their sample rate, or the
  default-initialized result when the native call hands back nil. }
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single;
  Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
  Arg: Pointer
  ): TSherpaOnnxGeneratedAudio;
var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxGeneratedAudio);
  Audio := SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Self.Handle, PAnsiChar(Text),
    SpeakerId, Speed, Callback, Arg);

  { Never dereference a nil result. }
  if Audio = nil then
    Exit;

  { try/finally guarantees the native audio is released even if the copy
    below raises (for example on an allocation failure). }
  try
    SetLength(Result.Samples, Audio^.N);
    Result.SampleRate := Audio^.SampleRate;
    for I := Low(Result.Samples) to High(Result.Samples) do
    begin
      Result.Samples[I] := Audio^.Samples[I];
    end;
  finally
    SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
  end;
end;
{ Creates a native linear resampler converting SampleRateIn to
  SampleRateOut. The low-pass cutoff is 0.99 * 0.5 * the lower of the two
  rates, and the filter width passed to the native side is 6. }
constructor TSherpaOnnxLinearResampler.Create(SampleRateIn: Integer; SampleRateOut: Integer);
var
  LowerRate: Single;
  Cutoff: Single;
  FilterWidth: Integer = 6;
begin
  Self.InputSampleRate := SampleRateIn;
  Self.OutputSampleRate := SampleRateOut;

  { Pick the smaller of the two sample rates. }
  if SampleRateOut < SampleRateIn then
    LowerRate := SampleRateOut
  else
    LowerRate := SampleRateIn;

  Cutoff := 0.99 * 0.5 * LowerRate;

  Self.Handle := SherpaOnnxCreateLinearResampler(
    SampleRateIn, SampleRateOut, Cutoff, FilterWidth);
end;
{ Releases the native resampler (if one was created) and then runs the
  inherited destructor. }
destructor TSherpaOnnxLinearResampler.Destroy;
begin
  { Handle can still be nil, e.g. when native creation did not succeed. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyLinearResampler(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Resamples N samples starting at the raw pointer Samples and returns a
  Pascal-owned copy of the output. Flush is passed to the native side as
  0 or 1. The native output object is freed before returning. }
function TSherpaOnnxLinearResampler.Resample(Samples: pcfloat;
  N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray;
var
  Output: PSherpaOnnxResampleOut;
  Idx: Integer;
begin
  Result := nil;
  Output := SherpaOnnxLinearResamplerResample(Handle, Samples, N, Ord(Flush));

  SetLength(Result, Output^.N);
  for Idx := 0 to Length(Result) - 1 do
    Result[Idx] := Output^.Samples[Idx];

  SherpaOnnxLinearResamplerResampleFree(Output);
end;
{ Open-array convenience overload: resamples every element of Samples by
  delegating to the pointer-based Resample. }
function TSherpaOnnxLinearResampler.Resample(Samples: array of Single; Flush: Boolean): TSherpaOnnxSamplesArray;
begin
  Result := Resample(
    pcfloat(Samples),
    Length(Samples),
    Flush);
end;
{ Forwards a reset request to the native resampler. }
procedure TSherpaOnnxLinearResampler.Reset;
begin
  SherpaOnnxLinearResamplerReset(Handle);
end;
{ Renders the Pyannote segmentation model path as a single line. }
function TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Self.Model]);
end;
{ Renders the speaker segmentation model configuration (nested Pyannote
  config included) as a single human-readable line.
  The printed type name now matches this record; it previously reported
  the nested Pyannote record's name. }
function TSherpaOnnxOfflineSpeakerSegmentationModelConfig.ToString: AnsiString;
begin
  Result := Format('TSherpaOnnxOfflineSpeakerSegmentationModelConfig(' +
    'Pyannote := %s, ' +
    'NumThreads := %d, ' +
    'Debug := %s, ' +
    'Provider := %s)',
    [Self.Pyannote.ToString, Self.NumThreads,
     Self.Debug.ToString, Self.Provider]);
end;
{ Management operator: a new TSherpaOnnxOfflineSpeakerSegmentationModelConfig
  starts with one thread, the 'cpu' provider and debug disabled. }
class operator TSherpaOnnxOfflineSpeakerSegmentationModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.Debug := False;
  Dest.NumThreads := 1;
end;
{ Renders the clustering settings as a single line; Threshold is printed
  with three decimals. }
function TSherpaOnnxFastClusteringConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxFastClusteringConfig(NumClusters := %d, ' +
    'Threshold := %.3f)';
begin
  Result := Format(Fmt, [Self.NumClusters, Self.Threshold]);
end;
{ Management operator: defaults NumClusters to -1 and Threshold to 0.5 for
  a new TSherpaOnnxFastClusteringConfig. }
class operator TSherpaOnnxFastClusteringConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig);
begin
  Dest.Threshold := 0.5;
  Dest.NumClusters := -1;
end;
{ Renders the speaker embedding extractor settings as a single
  human-readable line. }
function TSherpaOnnxSpeakerEmbeddingExtractorConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxSpeakerEmbeddingExtractorConfig(Model := %s, ' +
    'NumThreads := %d, Debug := %s, Provider := %s)';
begin
  Result := Format(Fmt, [
    Self.Model,
    Self.NumThreads,
    Self.Debug.ToString,
    Self.Provider]);
end;
{ Management operator: a new TSherpaOnnxSpeakerEmbeddingExtractorConfig
  starts with one thread, the 'cpu' provider and debug disabled. }
class operator TSherpaOnnxSpeakerEmbeddingExtractorConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig);
begin
  Dest.Provider := 'cpu';
  Dest.Debug := False;
  Dest.NumThreads := 1;
end;
{ Renders the diarization configuration (nested segmentation, embedding
  and clustering configs included) as a single human-readable line.
  Durations are printed with three decimals. }
function TSherpaOnnxOfflineSpeakerDiarizationConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineSpeakerDiarizationConfig(Segmentation := %s, ' +
    'Embedding := %s, Clustering := %s, MinDurationOn := %.3f, ' +
    'MinDurationOff := %.3f)';
begin
  Result := Format(Fmt, [
    Self.Segmentation.ToString,
    Self.Embedding.ToString,
    Self.Clustering.ToString,
    Self.MinDurationOn,
    Self.MinDurationOff]);
end;
{ Management operator: defaults MinDurationOn to 0.2 and MinDurationOff to
  0.5 for a new TSherpaOnnxOfflineSpeakerDiarizationConfig. }
class operator TSherpaOnnxOfflineSpeakerDiarizationConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig);
begin
  Dest.MinDurationOff := 0.5;
  Dest.MinDurationOn := 0.2;
end;
{ Renders one diarization segment (start, stop, speaker) as a single line;
  Start and Stop are printed with three decimals. }
function TSherpaOnnxOfflineSpeakerDiarizationSegment.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineSpeakerDiarizationSegment(Start := %.3f, ' +
    'Stop := %.3f, Speaker := %d)';
begin
  Result := Format(Fmt, [Self.Start, Self.Stop, Self.Speaker]);
end;
{ Builds the C-level diarization configuration from Config and creates the
  native diarization object. A copy of Config is stored in _Config.
  SampleRate is read from the native object, or left at 0 when creation
  returned nil. }
constructor TSherpaOnnxOfflineSpeakerDiarization.Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
var
  NativeCfg: SherpaOnnxOfflineSpeakerDiarizationConfig;
begin
  Self._Config := Config;
  Self.SampleRate := 0;

  NativeCfg := Default(SherpaOnnxOfflineSpeakerDiarizationConfig);

  { Segmentation model }
  NativeCfg.Segmentation.Pyannote.Model := PAnsiChar(Config.Segmentation.Pyannote.Model);
  NativeCfg.Segmentation.NumThreads := Config.Segmentation.NumThreads;
  NativeCfg.Segmentation.Debug := Ord(Config.Segmentation.Debug);
  NativeCfg.Segmentation.Provider := PAnsiChar(Config.Segmentation.Provider);

  { Speaker embedding model }
  NativeCfg.Embedding.Model := PAnsiChar(Config.Embedding.Model);
  NativeCfg.Embedding.NumThreads := Config.Embedding.NumThreads;
  NativeCfg.Embedding.Debug := Ord(Config.Embedding.Debug);
  NativeCfg.Embedding.Provider := PAnsiChar(Config.Embedding.Provider);

  { Clustering and duration thresholds }
  NativeCfg.Clustering.NumClusters := Config.Clustering.NumClusters;
  NativeCfg.Clustering.Threshold := Config.Clustering.Threshold;
  NativeCfg.MinDurationOn := Config.MinDurationOn;
  NativeCfg.MinDurationOff := Config.MinDurationOff;

  Self.Handle := SherpaOnnxCreateOfflineSpeakerDiarization(@NativeCfg);
  if Self.Handle <> nil then
    Self.SampleRate := SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(Self.Handle);
end;
{ Releases the native diarization object (if one was created) and then
  runs the inherited destructor. }
destructor TSherpaOnnxOfflineSpeakerDiarization.Destroy;
begin
  { The constructor leaves Handle nil when native creation fails, so skip
    the native call in that case. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOfflineSpeakerDiarization(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Pushes new clustering settings to the native diarization object.
  Only Clustering.NumClusters and Clustering.Threshold are copied from
  Config; every other field of the C record keeps its Default() value. }
procedure TSherpaOnnxOfflineSpeakerDiarization.SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
var
  NativeCfg: SherpaOnnxOfflineSpeakerDiarizationConfig;
begin
  NativeCfg := Default(SherpaOnnxOfflineSpeakerDiarizationConfig);
  NativeCfg.Clustering.Threshold := Config.Clustering.Threshold;
  NativeCfg.Clustering.NumClusters := Config.Clustering.NumClusters;
  SherpaOnnxOfflineSpeakerDiarizationSetConfig(Handle, @NativeCfg);
end;
{ Runs diarization over Samples and returns the segments sorted by start
  time as a Pascal-owned array. Returns nil when the native call yields
  no result. The native segment list and result are freed before
  returning. }
function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
var
  NativeResult: Pointer;
  SegmentCount: Integer;
  Sorted: PSherpaOnnxOfflineSpeakerDiarizationSegment;
  Idx: Integer;
begin
  Result := nil;

  NativeResult := SherpaOnnxOfflineSpeakerDiarizationProcess(
    Handle, pcfloat(Samples), Length(Samples));

  if NativeResult <> nil then
  begin
    SegmentCount := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(NativeResult);
    Sorted := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(NativeResult);

    { Copy each native segment into the managed result array. }
    SetLength(Result, SegmentCount);
    for Idx := 0 to Length(Result) - 1 do
    begin
      Result[Idx].Start := Sorted[Idx].Start;
      Result[Idx].Stop := Sorted[Idx].Stop;
      Result[Idx].Speaker := Sorted[Idx].Speaker;
    end;

    SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Sorted);
    SherpaOnnxOfflineSpeakerDiarizationDestroyResult(NativeResult);
  end;
end;
{ Same as the single-argument Process, but forwards callback to the native
  "process with callback" entry point. Returns the segments sorted by
  start time, or nil when the native call yields no result. }
function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single;
  callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
var
  NativeResult: Pointer;
  SegmentCount: Integer;
  Sorted: PSherpaOnnxOfflineSpeakerDiarizationSegment;
  Idx: Integer;
begin
  Result := nil;

  NativeResult := SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(
    Handle, pcfloat(Samples), Length(Samples), callback);

  if NativeResult <> nil then
  begin
    SegmentCount := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(NativeResult);
    Sorted := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(NativeResult);

    { Copy each native segment into the managed result array. }
    SetLength(Result, SegmentCount);
    for Idx := 0 to Length(Result) - 1 do
    begin
      Result[Idx].Start := Sorted[Idx].Start;
      Result[Idx].Stop := Sorted[Idx].Stop;
      Result[Idx].Speaker := Sorted[Idx].Speaker;
    end;

    SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Sorted);
    SherpaOnnxOfflineSpeakerDiarizationDestroyResult(NativeResult);
  end;
end;
{ Renders the GTCRN denoiser model path as a single line. }
function TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig(Model := %s)';
begin
  Result := Format(Fmt, [Self.Model]);
end;
{ Renders the denoiser model configuration (nested GTCRN config included)
  as a single human-readable line. }
function TSherpaOnnxOfflineSpeechDenoiserModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineSpeechDenoiserModelConfig(Gtcrn := %s, ' +
    'NumThreads := %d, Debug := %s, Provider := %s)';
begin
  Result := Format(Fmt, [
    Self.Gtcrn.ToString,
    Self.NumThreads,
    Self.Debug.ToString,
    Self.Provider]);
end;
{ Management operator: a new TSherpaOnnxOfflineSpeechDenoiserModelConfig
  starts with one thread, the 'cpu' provider and debug disabled. }
class operator TSherpaOnnxOfflineSpeechDenoiserModelConfig.Initialize(
  {$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeechDenoiserModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.Debug := False;
  Dest.NumThreads := 1;
end;
{ Renders the denoiser configuration (its nested model config) as a
  single line. }
function TSherpaOnnxOfflineSpeechDenoiserConfig.ToString: AnsiString;
const
  Fmt = 'TSherpaOnnxOfflineSpeechDenoiserConfig(Model := %s)';
begin
  Result := Format(Fmt, [Self.Model.ToString]);
end;
{ Builds the C-level denoiser configuration from Config and creates the
  native denoiser. A copy of Config is stored in _Config. SampleRate is
  read from the native object, or left at 0 when creation returned nil. }
constructor TSherpaOnnxOfflineSpeechDenoiser.Create(Config: TSherpaOnnxOfflineSpeechDenoiserConfig);
var
  NativeCfg: SherpaOnnxOfflineSpeechDenoiserConfig;
begin
  Self._Config := Config;
  Self.SampleRate := 0;

  NativeCfg := Default(SherpaOnnxOfflineSpeechDenoiserConfig);
  NativeCfg.Model.Gtcrn.Model := PAnsiChar(Config.Model.Gtcrn.Model);
  NativeCfg.Model.Provider := PAnsiChar(Config.Model.Provider);
  NativeCfg.Model.NumThreads := Config.Model.NumThreads;
  NativeCfg.Model.Debug := Ord(Config.Model.Debug);

  Self.Handle := SherpaOnnxCreateOfflineSpeechDenoiser(@NativeCfg);
  if Self.Handle <> nil then
    Self.SampleRate := SherpaOnnxOfflineSpeechDenoiserGetSampleRate(Self.Handle);
end;
{ Releases the native denoiser (if one was created) and then runs the
  inherited destructor. }
destructor TSherpaOnnxOfflineSpeechDenoiser.Destroy;
begin
  { The constructor leaves Handle nil when native creation fails, so skip
    the native call in that case. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOfflineSpeechDenoiser(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Denoises Samples (recorded at InputSampleRate) and returns a Pascal-owned
  copy of the denoised samples together with their sample rate. When the
  native call hands back nil, the default-initialized result is returned. }
function TSherpaOnnxOfflineSpeechDenoiser.Run(Samples: array of Single; InputSampleRate: Integer): TSherpaOnnxDenoisedAudio;
var
  Audio: PSherpaOnnxDenoisedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxDenoisedAudio);
  Audio := SherpaOnnxOfflineSpeechDenoiserRun(Self.Handle, pcfloat(Samples), Length(Samples), InputSampleRate);

  { Never dereference a nil result. }
  if Audio = nil then
    Exit;

  { try/finally guarantees the native audio is released even if the copy
    below raises (for example on an allocation failure). }
  try
    SetLength(Result.Samples, Audio^.N);
    Result.SampleRate := Audio^.SampleRate;
    for I := Low(Result.Samples) to High(Result.Samples) do
    begin
      Result.Samples[I] := Audio^.Samples[I];
    end;
  finally
    SherpaOnnxDestroyDenoisedAudio(Audio);
  end;
end;
end.