/// <summary> /// Constructor /// </summary> /// <param name="samplingRate"></param> /// <param name="featureCount"></param> /// <param name="filterbank"></param> /// <param name="frameDuration"></param> /// <param name="hopDuration"></param> /// <param name="preEmphasis"></param> /// <param name="nonLinearity"></param> /// <param name="spectrumType"></param> /// <param name="window"></param> /// <param name="logFloor"></param> public FilterbankExtractor(int samplingRate, float[][] filterbank, double frameDuration = 0.0256 /*sec*/, double hopDuration = 0.010 /*sec*/, double preEmphasis = 0, NonLinearityType nonLinearity = NonLinearityType.None, SpectrumType spectrumType = SpectrumType.Power, WindowTypes window = WindowTypes.Hamming, float logFloor = float.Epsilon) : base(samplingRate, frameDuration, hopDuration, preEmphasis) { FilterBank = filterbank; FeatureCount = filterbank.Length; _blockSize = 2 * (filterbank[0].Length - 1); Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size"); _fft = new RealFft(_blockSize); _window = window; _windowSamples = Window.OfType(_window, FrameSize); // setup spectrum post-processing: ======================================================= _logFloor = logFloor; _nonLinearityType = nonLinearity; switch (nonLinearity) { case NonLinearityType.Log10: _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _bandSpectrum, _logFloor); break; case NonLinearityType.LogE: _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _bandSpectrum, _logFloor); break; case NonLinearityType.ToDecibel: _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _bandSpectrum, _logFloor); break; case NonLinearityType.CubicRoot: _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _bandSpectrum, 0.33); break; default: _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum); break; } _spectrumType = spectrumType; switch (_spectrumType) { case SpectrumType.Magnitude: _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false); break; case SpectrumType.Power: _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false); break; case SpectrumType.MagnitudeNormalized: _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true); break; case SpectrumType.PowerNormalized: _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true); break; } // reserve memory for reusable blocks _spectrum = new float[_blockSize / 2 + 1]; _bandSpectrum = new float[filterbank.Length]; }
/// <summary> /// Standard method for computing mfcc features: /// 0) [Optional] pre-emphasis /// /// Decompose signal into overlapping (hopSize) frames of length fftSize. In each frame do: /// /// 1) Apply window (if rectangular window was specified then just do nothing) /// 2) Obtain power spectrum X /// 3) Apply mel filters and log() the result: Y = Log10(X * H) /// 4) Do dct-II: mfcc = Dct(Y) /// 5) [Optional] liftering of mfcc /// /// </summary> /// <param name="samples">Samples for analysis</param> /// <param name="startSample">The number (position) of the first sample for processing</param> /// <param name="endSample">The number (position) of last sample for processing</param> /// <returns>List of mfcc vectors</returns> public override List <FeatureVector> ComputeFrom(float[] samples, int startSample, int endSample) { Guard.AgainstInvalidRange(startSample, endSample, "starting pos", "ending pos"); var hopSize = HopSize; var frameSize = FrameSize; var featureVectors = new List <FeatureVector>(); var prevSample = startSample > 0 ? samples[startSample - 1] : 0.0f; var i = startSample; while (i + frameSize < endSample) { // prepare next block for processing _zeroblock.FastCopyTo(_block, _fftSize); samples.FastCopyTo(_block, _windowSamples.Length, i); // 0) pre-emphasis (if needed) if (_preEmphasis > 0.0) { for (var k = 0; k < frameSize; k++) { var y = _block[k] - prevSample * _preEmphasis; prevSample = _block[k]; _block[k] = y; } prevSample = samples[i + hopSize - 1]; } // 1) apply window if (_window != WindowTypes.Rectangular) { _block.ApplyWindow(_windowSamples); } // 2) calculate power spectrum _fft.PowerSpectrum(_block, _spectrum); // 3) apply mel filterbank and take log() of the result FilterBanks.ApplyAndLog(FilterBank, _spectrum, _logMelSpectrum); // 4) dct-II var mfccs = new float[FeatureCount]; _dct.Direct(_logMelSpectrum, mfccs); // 5) (optional) liftering if (_lifterCoeffs != null) { mfccs.ApplyWindow(_lifterCoeffs); } // add mfcc vector to output sequence featureVectors.Add(new FeatureVector { Features = mfccs, TimePosition = (double)i / SamplingRate }); i += hopSize; } return(featureVectors); }
/// <summary> /// Constructs extractor from configuration <paramref name="options"/>. /// </summary> public MfccExtractor(MfccOptions options) : base(options) { FeatureCount = options.FeatureCount; var filterbankSize = options.FilterBankSize; if (options.FilterBank is null) { _blockSize = options.FftSize > FrameSize ? options.FftSize : MathUtils.NextPowerOfTwo(FrameSize); var melBands = FilterBanks.MelBands(filterbankSize, SamplingRate, options.LowFrequency, options.HighFrequency); FilterBank = FilterBanks.Triangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel); // HTK/Kaldi-style } else { FilterBank = options.FilterBank; filterbankSize = FilterBank.Length; _blockSize = 2 * (FilterBank[0].Length - 1); Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size"); } _fft = new RealFft(_blockSize); _lifterSize = options.LifterSize; _lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null; _includeEnergy = options.IncludeEnergy; _logEnergyFloor = options.LogEnergyFloor; // setup DCT: ============================================================================ _dctType = options.DctType; switch (_dctType[0]) { case '1': _dct = new Dct1(filterbankSize); break; case '3': _dct = new Dct3(filterbankSize); break; case '4': _dct = new Dct4(filterbankSize); break; default: _dct = new Dct2(filterbankSize); break; } if (_dctType.EndsWith("N", StringComparison.OrdinalIgnoreCase)) { _applyDct = mfccs => _dct.DirectNorm(_melSpectrum, mfccs); } else { _applyDct = mfccs => _dct.Direct(_melSpectrum, mfccs); } // setup spectrum post-processing: ======================================================= _logFloor = options.LogFloor; _nonLinearityType = options.NonLinearity; switch (_nonLinearityType) { case NonLinearityType.Log10: _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _melSpectrum, _logFloor); break; case NonLinearityType.LogE: _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _melSpectrum, _logFloor); break; case NonLinearityType.ToDecibel: _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _melSpectrum, _logFloor); break; case NonLinearityType.CubicRoot: _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _melSpectrum, 0.33); break; default: _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _melSpectrum); break; } _spectrumType = options.SpectrumType; switch (_spectrumType) { case SpectrumType.Magnitude: _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false); break; case SpectrumType.MagnitudeNormalized: _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true); break; case SpectrumType.PowerNormalized: _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true); break; default: _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false); break; } // reserve memory for reusable blocks _spectrum = new float[_blockSize / 2 + 1]; _melSpectrum = new float[filterbankSize]; }
/// <summary> /// Constructor /// </summary> /// <param name="options">Filterbank options</param> public FilterbankExtractor(FilterbankOptions options) : base(options) { var filterbankSize = options.FilterBankSize; if (options.FilterBank == null) { _blockSize = options.FftSize > FrameSize ? options.FftSize : MathUtils.NextPowerOfTwo(FrameSize); var melBands = FilterBanks.MelBands(filterbankSize, SamplingRate, options.LowFrequency, options.HighFrequency, false); FilterBank = FilterBanks.Rectangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel); } else { FilterBank = options.FilterBank; filterbankSize = FilterBank.Length; _blockSize = 2 * (FilterBank[0].Length - 1); Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size"); } FeatureCount = filterbankSize; _fft = new RealFft(_blockSize); // setup spectrum post-processing: ======================================================= _logFloor = options.LogFloor; _nonLinearityType = options.NonLinearity; switch (_nonLinearityType) { case NonLinearityType.Log10: _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _bandSpectrum, _logFloor); break; case NonLinearityType.LogE: _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _bandSpectrum, _logFloor); break; case NonLinearityType.ToDecibel: _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _bandSpectrum, _logFloor); break; case NonLinearityType.CubicRoot: _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _bandSpectrum, 0.33); break; default: _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum); break; } _spectrumType = options.SpectrumType; switch (_spectrumType) { case SpectrumType.Magnitude: _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false); break; case SpectrumType.MagnitudeNormalized: _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true); break; case SpectrumType.PowerNormalized: _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true); break; default: _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false); break; } // reserve memory for reusable blocks _spectrum = new float[_blockSize / 2 + 1]; _bandSpectrum = new float[filterbankSize]; }
/// <summary> /// Constructor /// </summary> /// <param name="samplingRate"></param> /// <param name="featureCount"></param> /// <param name="frameDuration"></param> /// <param name="hopDuration"></param> /// <param name="filterbankSize"></param> /// <param name="lowFreq"></param> /// <param name="highFreq"></param> /// <param name="fftSize"></param> /// <param name="filterbank"></param> /// <param name="lifterSize"></param> /// <param name="preEmphasis"></param> /// <param name="includeEnergy"></param> /// <param name="dctType">"1", "1N", "2", "2N", "3", "3N", "4", "4N"</param> /// <param name="nonLinearity"></param> /// <param name="spectrumType"></param> /// <param name="window"></param> /// <param name="logFloor"></param> public MfccExtractor(int samplingRate, int featureCount, double frameDuration = 0.0256 /*sec*/, double hopDuration = 0.010 /*sec*/, int filterbankSize = 24, double lowFreq = 0, double highFreq = 0, int fftSize = 0, float[][] filterbank = null, int lifterSize = 0, double preEmphasis = 0, bool includeEnergy = false, string dctType = "2N", NonLinearityType nonLinearity = NonLinearityType.Log10, SpectrumType spectrumType = SpectrumType.Power, WindowTypes window = WindowTypes.Hamming, float logFloor = float.Epsilon) : base(samplingRate, frameDuration, hopDuration, preEmphasis) { FeatureCount = featureCount; _lowFreq = lowFreq; _highFreq = highFreq; if (filterbank == null) { _blockSize = fftSize > FrameSize ? fftSize : MathUtils.NextPowerOfTwo(FrameSize); var melBands = FilterBanks.MelBands(filterbankSize, _blockSize, SamplingRate, _lowFreq, _highFreq); FilterBank = FilterBanks.Triangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel); // HTK/Kaldi-style } else { FilterBank = filterbank; filterbankSize = filterbank.Length; _blockSize = 2 * (filterbank[0].Length - 1); Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size"); } _fft = new RealFft(_blockSize); _window = window; _windowSamples = Window.OfType(_window, FrameSize); _lifterSize = lifterSize; _lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null; _includeEnergy = includeEnergy; // setup DCT: ============================================================================ _dctType = dctType; switch (dctType[0]) { case '1': _dct = new Dct1(filterbankSize); break; case '2': _dct = new Dct2(filterbankSize); break; case '3': _dct = new Dct3(filterbankSize); break; case '4': _dct = new Dct4(filterbankSize); break; default: throw new ArgumentException("Only DCT-1, 2, 3 and 4 are supported!"); } if (dctType.Length > 1 && char.ToUpper(dctType[1]) == 'N') { _applyDct = mfccs => _dct.DirectNorm(_melSpectrum, mfccs); } else { _applyDct = mfccs => _dct.Direct(_melSpectrum, mfccs); } // setup spectrum post-processing: ======================================================= _logFloor = logFloor; _nonLinearityType = nonLinearity; switch (nonLinearity) { case NonLinearityType.Log10: _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _melSpectrum, _logFloor); break; case NonLinearityType.LogE: _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _melSpectrum, _logFloor); break; case NonLinearityType.ToDecibel: _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _melSpectrum, _logFloor); break; case NonLinearityType.CubicRoot: _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _melSpectrum, 0.33); break; default: _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _melSpectrum); break; } _spectrumType = spectrumType; switch (_spectrumType) { case SpectrumType.Magnitude: _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false); break; case SpectrumType.Power: _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false); break; case SpectrumType.MagnitudeNormalized: _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true); break; case SpectrumType.PowerNormalized: _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true); break; } // reserve memory for reusable blocks _spectrum = new float[_blockSize / 2 + 1]; _melSpectrum = new float[filterbankSize]; }
/// <summary> /// Standard method for computing mfcc features: /// 0) [Optional] pre-emphasis /// /// Decompose signal into overlapping (hopSize) frames of length fftSize. In each frame do: /// /// 1) Apply window (if rectangular window was specified then just do nothing) /// 2) Obtain power spectrum X /// 3) Apply mel filters and log() the result: Y = Log10(X * H) /// 4) Do dct-II: mfcc = Dct(Y) /// 5) [Optional] liftering of mfcc /// /// </summary> /// <param name="signal">Signal for analysis</param> /// <param name="startSample">The number (position) of the first sample for processing</param> /// <param name="endSample">The number (position) of last sample for processing</param> /// <returns>List of mfcc vectors</returns> public override List <FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample) { // ====================================== PREPARE ======================================= var hopSize = (int)(signal.SamplingRate * HopSize); var frameSize = (int)(signal.SamplingRate * FrameSize); var windowSamples = Window.OfType(_window, frameSize); var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize); _melFilterBank = FilterBanks.Triangular(fftSize, signal.SamplingRate, FilterBanks.MelBands(_filterbankSize, fftSize, signal.SamplingRate, _lowFreq, _highFreq)); var lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null; var fft = new Fft(fftSize); var dct = new Dct2(_filterbankSize, FeatureCount); // reserve memory for reusable blocks var spectrum = new float[fftSize / 2 + 1]; var logMelSpectrum = new float[_filterbankSize]; var block = new float[fftSize]; // buffer for currently processed signal block at each step var zeroblock = new float[fftSize]; // just a buffer of zeros for quick memset // ================================= MAIN PROCESSING ================================== var featureVectors = new List <FeatureVector>(); var prevSample = startSample > 0 ? signal[startSample - 1] : 0.0f; var i = startSample; while (i + frameSize < endSample) { // prepare next block for processing zeroblock.FastCopyTo(block, zeroblock.Length); signal.Samples.FastCopyTo(block, windowSamples.Length, i); // 0) pre-emphasis (if needed) if (_preEmphasis > 0.0) { for (var k = 0; k < frameSize; k++) { var y = block[k] - prevSample * _preEmphasis; prevSample = block[k]; block[k] = y; } prevSample = signal[i + hopSize - 1]; } // 1) apply window if (_window != WindowTypes.Rectangular) { block.ApplyWindow(windowSamples); } // 2) calculate power spectrum fft.PowerSpectrum(block, spectrum); // 3) apply mel filterbank and take log() of the result FilterBanks.ApplyAndLog(_melFilterBank, spectrum, logMelSpectrum); // 4) dct-II var mfccs = new float[FeatureCount]; dct.Direct(logMelSpectrum, mfccs); // 5) (optional) liftering if (lifterCoeffs != null) { mfccs.ApplyWindow(lifterCoeffs); } // add mfcc vector to output sequence featureVectors.Add(new FeatureVector { Features = mfccs, TimePosition = (double)i / signal.SamplingRate }); i += hopSize; } return(featureVectors); }