/// <summary>
/// <para>Computes one S(implified)PNCC feature vector from one frame, following [Kim and Stern, 2016].</para>
/// <para>
/// Processing chain:
/// <list type="number">
///     <item>Windowing (performed by the base extractor before this call)</item>
///     <item>Power spectrum</item>
///     <item>Gammatone filterbank (squared responses)</item>
///     <item>Mean power normalization</item>
///     <item>Nonlinearity (power law, or Log10 when the power is zero)</item>
///     <item>Normalized DCT-II</item>
///     <item>Optionally, the first coefficient is replaced with the log-energy of the frame</item>
/// </list>
/// </para>
/// </summary>
/// <param name="block">Frame of samples to analyze</param>
/// <param name="features">Receives the SPNCC feature vector computed from <paramref name="block"/></param>
public override void ProcessFrame(float[] block, float[] features)
{
    const float targetPower = 1e10f;

    // spectral analysis: power spectrum (third argument 'false', as in every call of this kind here),
    // then the gammatone filterbank
    _fft.PowerSpectrum(block, _spectrum, false);
    FilterBanks.Apply(FilterBank, _spectrum, _filteredSpectrum);

    // running estimate of the total band power (updated on every frame)
    var totalPower = 0.0f;
    foreach (var bandPower in _filteredSpectrum)
    {
        totalPower += bandPower;
    }

    _mean = LambdaMu * _mean + (1 - LambdaMu) * totalPower;

    // scale all bands so that the running mean maps onto the target power
    var bandCount = _filteredSpectrum.Length;
    for (var band = 0; band < bandCount; band++)
    {
        _filteredSpectrum[band] *= targetPower / _mean;
    }

    // nonlinearity: Log10 if no power is configured, power law otherwise
    if (_power == 0)
    {
        for (var band = 0; band < bandCount; band++)
        {
            _filteredSpectrum[band] = (float)Math.Log10(_filteredSpectrum[band] + float.Epsilon);
        }
    }
    else
    {
        for (var band = 0; band < bandCount; band++)
        {
            _filteredSpectrum[band] = (float)Math.Pow(_filteredSpectrum[band], 1.0 / _power);
        }
    }

    // decorrelate with normalized DCT-II
    _dct.DirectNorm(_filteredSpectrum, features);

    if (!_includeEnergy)
    {
        return;
    }

    // floor the energy before taking the logarithm
    features[0] = (float)Math.Log(Math.Max(block.Sum(x => x * x), _logEnergyFloor));
}
/// <summary>
/// Computes one PNCC feature vector from one frame, following [Kim & Stern, 2016].
///
/// Steps performed on the frame:
///
///     1) Zero-pad the block beyond FrameSize and apply the window
///     2) Obtain power spectrum
///     3) Apply gammatone filterbank
///     4) Medium-time processing (temporal integration, asymmetric noise suppression,
///        temporal masking, spectral smoothing, mean power normalization)
///     5) Apply nonlinearity (power law, or natural logarithm when the power is zero)
///     6) Do dct-II (normalized)
///
/// While the internal step counter is below 2*M, a vector of zeros is returned.
/// </summary>
/// <param name="block">Block of samples; it is modified in place (zero-padding and windowing)</param>
/// <returns>Newly allocated PNCC vector of length FeatureCount (all zeros during the first 2*M steps)</returns>
public override float[] ProcessFrame(float[] block)
{
    const float MeanPower = 1e10f;
    const float Epsilon = 2.22e-16f;

    // fill zeros to fftSize if frameSize < fftSize
    for (var k = FrameSize; k < block.Length; block[k++] = 0)
    {
        ;
    }

    // 1) apply window
    block.ApplyWindow(_windowSamples);

    // 2) calculate power spectrum
    _fft.PowerSpectrum(block, _spectrum);

    // 3) apply gammatone filterbank
    FilterBanks.Apply(FilterBank, _spectrum, _gammatoneSpectrum);

    // =============================================================
    // 4) medium-time processing blocks:

    // 4.1) temporal integration (zero-phase moving average filter)
    _ringBuffer.Add(_gammatoneSpectrum);
    var spectrumQ = _ringBuffer.AverageSpectrum;

    // 4.2) asymmetric noise suppression

    // NOTE(review): _step is compared here but never incremented inside this method;
    // presumably it is advanced elsewhere in the class - confirm, otherwise
    // only the zero vector at the bottom is ever returned.

    // on the first processed step the noise floor estimate is seeded with 90% of the averaged spectrum
    if (_step == 2 * M)
    {
        for (var j = 0; j < _spectrumQOut.Length; j++)
        {
            _spectrumQOut[j] = spectrumQ[j] * 0.9f;
        }
    }

    if (_step >= 2 * M)
    {
        // asymmetric recursive update: LambdaA when the input is above the estimate, LambdaB otherwise
        for (var j = 0; j < _spectrumQOut.Length; j++)
        {
            if (spectrumQ[j] > _spectrumQOut[j])
            {
                _spectrumQOut[j] = LambdaA * _spectrumQOut[j] + (1 - LambdaA) * spectrumQ[j];
            }
            else
            {
                _spectrumQOut[j] = LambdaB * _spectrumQOut[j] + (1 - LambdaB) * spectrumQ[j];
            }
        }

        for (var j = 0; j < _filteredSpectrumQ.Length; j++)
        {
            // half-wave rectified difference between the input and the estimated floor
            _filteredSpectrumQ[j] = Math.Max(spectrumQ[j] - _spectrumQOut[j], 0.0f);

            // seed both running estimates on the first processed step
            if (_step == 2 * M)
            {
                _avgSpectrumQ1[j] = 0.9f * _filteredSpectrumQ[j];
                _avgSpectrumQ2[j] = _filteredSpectrumQ[j];
            }

            // same asymmetric update, now applied to the rectified signal
            if (_filteredSpectrumQ[j] > _avgSpectrumQ1[j])
            {
                _avgSpectrumQ1[j] = LambdaA * _avgSpectrumQ1[j] + (1 - LambdaA) * _filteredSpectrumQ[j];
            }
            else
            {
                _avgSpectrumQ1[j] = LambdaB * _avgSpectrumQ1[j] + (1 - LambdaB) * _filteredSpectrumQ[j];
            }

            // 4.3) temporal masking

            // remember the value before masking; it is used to update the peak tracker below
            var threshold = _filteredSpectrumQ[j];

            // decay the peak tracker
            _avgSpectrumQ2[j] *= LambdaT;

            if (spectrumQ[j] < C * _spectrumQOut[j])
            {
                _filteredSpectrumQ[j] = _avgSpectrumQ1[j];
            }
            else
            {
                if (_filteredSpectrumQ[j] <= _avgSpectrumQ2[j])
                {
                    _filteredSpectrumQ[j] = MuT * _avgSpectrumQ2[j];
                }
            }

            _avgSpectrumQ2[j] = Math.Max(_avgSpectrumQ2[j], threshold);

            // never drop below the running floor of the rectified signal
            _filteredSpectrumQ[j] = Math.Max(_filteredSpectrumQ[j], _avgSpectrumQ1[j]);
        }

        // 4.4) spectral smoothing

        // per-band ratio of processed to averaged power (denominator floored by Epsilon)
        for (var j = 0; j < _spectrumS.Length; j++)
        {
            _spectrumS[j] = _filteredSpectrumQ[j] / Math.Max(spectrumQ[j], Epsilon);
        }

        // average the ratio over bands [j - N, j + N], clipped to the filterbank range
        for (var j = 0; j < _smoothedSpectrumS.Length; j++)
        {
            _smoothedSpectrumS[j] = 0.0f;

            var total = 0;
            for (var k = Math.Max(j - N, 0); k < Math.Min(j + N + 1, FilterBank.Length); k++, total++)
            {
                _smoothedSpectrumS[j] += _spectrumS[k];
            }
            _smoothedSpectrumS[j] /= total;
        }

        // 4.5) mean power normalization

        // the smoothed ratio is applied to the central spectrum of the ring buffer
        var centralSpectrum = _ringBuffer.CentralSpectrum;

        var sumPower = 0.0f;
        for (var j = 0; j < _smoothedSpectrum.Length; j++)
        {
            _smoothedSpectrum[j] = _smoothedSpectrumS[j] * centralSpectrum[j];
            sumPower += _smoothedSpectrum[j];
        }

        // running mean of the total power (state kept between calls)
        _mean = LambdaMu * _mean + (1 - LambdaMu) * sumPower;

        for (var j = 0; j < _smoothedSpectrum.Length; j++)
        {
            _smoothedSpectrum[j] *= MeanPower / _mean;
        }

        // =============================================================

        // 5) nonlinearity (power ^ (1 / _power), or natural Log when _power is zero)
        if (_power != 0)
        {
            for (var j = 0; j < _smoothedSpectrum.Length; j++)
            {
                _smoothedSpectrum[j] = (float)Math.Pow(_smoothedSpectrum[j], 1.0 / _power);
            }
        }
        else
        {
            for (var j = 0; j < _smoothedSpectrum.Length; j++)
            {
                _smoothedSpectrum[j] = (float)Math.Log(_smoothedSpectrum[j] + Epsilon);
            }
        }

        // 6) dct-II (Norm = normalized)
        var pnccs = new float[FeatureCount];
        _dct.DirectNorm(_smoothedSpectrum, pnccs);

        // keep the step counter from overflowing: wrap it back to a value
        // that is still greater than 2*M, so the seeding branches are not re-entered
        if (_step == int.MaxValue - 1)
        {
            _step = 2 * M + 1;
        }

        return(pnccs);
    }

    // first 2*M vectors are zeros
    return(new float[FeatureCount]);
}
/// <summary>
/// <para>Computes one PLP-RASTA feature vector from one frame.</para>
/// <para>
/// Processing chain:
/// <list type="number">
///     <item>Windowing (performed by the base extractor before this call)</item>
///     <item>Power spectrum (not normalized)</item>
///     <item>Filterbank (bark bands by default)</item>
///     <item>[Optional] RASTA filtering of each band in the log domain</item>
///     <item>Equal loudness curve and cubic-root compression</item>
///     <item>Autocorrelation via IDFT, then LPC (Levinson-Durbin)</item>
///     <item>LPC to cepstrum conversion</item>
///     <item>[Optional] liftering</item>
///     <item>[Optional] first coefficient replaced with the log-energy of the frame</item>
/// </list>
/// </para>
/// </summary>
/// <param name="block">Frame of samples to analyze</param>
/// <param name="features">Receives the PLP feature vector computed from <paramref name="block"/></param>
public override void ProcessFrame(float[] block, float[] features)
{
    // power spectrum without normalization, followed by the filterbank
    _fft.PowerSpectrum(block, _spectrum, false);
    FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum);

    var bandCount = _bandSpectrum.Length;

    // optional RASTA: each band has its own filter that runs on log-power values
    if (_rasta > 0)
    {
        for (var band = 0; band < bandCount; band++)
        {
            var logPower = (float)Math.Log(_bandSpectrum[band] + float.Epsilon);
            var filtered = _rastaFilters[band].Process(logPower);

            _bandSpectrum[band] = (float)Math.Exp(filtered);
        }
    }

    // equal loudness weighting and cubic-root compression (band power is floored at 1.0)
    for (var band = 0; band < bandCount; band++)
    {
        var weighted = Math.Max(_bandSpectrum[band], 1.0) * _equalLoudnessCurve[band];

        _bandSpectrum[band] = (float)Math.Pow(weighted, 0.33);
    }

    // autocorrelation from the post-processed spectrum via IDFT;
    // each table row has two more entries than there are bands,
    // because the first and the last band are duplicated at the edges
    var n = _idftTable[0].Length;
    var norm = 2 * (n - 1);

    for (var lag = 0; lag < _idftTable.Length; lag++)
    {
        var row = _idftTable[lag];

        // the two duplicated edge terms go first...
        var acc = row[0] * _bandSpectrum[0] +
                  row[n - 1] * _bandSpectrum[n - 3];

        // ...then the inner terms
        for (var j = 1; j < n - 1; j++)
        {
            acc += row[j] * _bandSpectrum[j - 1];
        }

        _cc[lag] = acc / norm;
    }

    // LPC: start from zeroed coefficients
    Array.Clear(_lpc, 0, _lpc.Length);

    var err = Lpc.LevinsonDurbin(_cc, _lpc, _lpcOrder);

    // LPC -> cepstrum
    Lpc.ToCepstrum(_lpc, err, features);

    // optional liftering
    if (_lifterCoeffs != null)
    {
        features.ApplyWindow(_lifterCoeffs);
    }

    // optional log-energy in place of the first coefficient (energy is floored first)
    if (_includeEnergy)
    {
        features[0] = (float)Math.Log(Math.Max(block.Sum(x => x * x), _logEnergyFloor));
    }
}
/// <summary>
/// Computes chroma feature vector in one frame:
/// the power spectrum of the block is computed and the filterbank is applied to it.
/// </summary>
/// <param name="block">Block of data</param>
/// <param name="features">Features (one chroma feature vector) computed in the block; written directly by the filterbank</param>
public override void ProcessFrame(float[] block, float[] features)
{
    // power spectrum goes to the reusable _spectrum buffer
    _fft.PowerSpectrum(block, _spectrum, false);

    // filterbank output is stored straight into the feature vector
    FilterBanks.Apply(_filterBank, _spectrum, features);
}
/// <summary>
/// Method for computing modulation spectra.
/// Each vector representing one modulation spectrum is a flattened version of 2D spectrum
/// (power spectra of all band envelopes, one after another).
/// </summary>
/// <remarks>
/// If a featuregram was supplied, its columns are used as envelopes and
/// <paramref name="samples"/> are not analyzed (the range is still validated).
/// </remarks>
/// <param name="samples">Samples for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of flattened modulation spectra</returns>
public override List<float[]> ComputeFrom(float[] samples, int startSample, int endSample)
{
    Guard.AgainstInvalidRange(startSample, endSample, "starting pos", "ending pos");

    var frameSize = FrameSize;
    var hopSize = HopSize;

    var featureVectors = new List<float[]>();

    var en = 0;             // number of envelope samples (i.e. number of analyzed frames)
    var i = startSample;

    if (_featuregram == null)
    {
        _envelopes = new float[_filterbank.Length][];

        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[samples.Length / hopSize];
        }

        var prevSample = startSample > 0 ? samples[startSample - 1] : 0.0f;

        var lastSample = endSample - Math.Max(frameSize, hopSize);

        // ===================== compute local FFTs (do STFT) =======================

        for (i = startSample; i < lastSample; i += hopSize)
        {
            // copy frameSize samples
            samples.FastCopyTo(_block, frameSize, i);

            // fill zeros to fftSize if frameSize < fftSize
            for (var k = frameSize; k < _block.Length; k++)
            {
                _block[k] = 0;
            }

            // 0) pre-emphasis (if needed)
            if (_preEmphasis > 1e-10f)
            {
                for (var k = 0; k < frameSize; k++)
                {
                    var y = _block[k] - prevSample * _preEmphasis;
                    prevSample = _block[k];
                    _block[k] = y;
                }

                // the next frame starts at (i + hopSize), so its predecessor is this sample
                prevSample = samples[i + hopSize - 1];
            }

            // 1) apply window
            if (_window != WindowTypes.Rectangular)
            {
                _block.ApplyWindow(_windowSamples);
            }

            // 2) calculate power spectrum
            _fft.PowerSpectrum(_block, _spectrum);

            // 3) apply filterbank...
            FilterBanks.Apply(_filterbank, _spectrum, _filteredSpectrum);

            // ...and save results for future calculations
            for (var n = 0; n < _envelopes.Length; n++)
            {
                _envelopes[n][en] = _filteredSpectrum[n];
            }

            en++;
        }
    }
    else
    {
        // envelopes are the columns of the featuregram
        en = _featuregram.Length;

        _envelopes = new float[_featuregram[0].Length][];

        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[en];

            for (i = 0; i < en; i++)
            {
                _envelopes[n][i] = _featuregram[i][n];
            }
        }
    }

    // =========================== modulation analysis =======================

    var envelopeLength = en;

    // long-term AVG-normalization
    foreach (var envelope in _envelopes)
    {
        // mean absolute value of the envelope;
        // (the sign test used to be done on the loop index, so negative values
        //  - possible in a featuregram - were never negated)
        var avg = 0.0f;

        for (var k = 0; k < envelopeLength; k++)
        {
            avg += Math.Abs(envelope[k]);
        }

        avg /= envelopeLength;

        if (avg >= 1e-10f)   // this happens more frequently
        {
            for (var k = 0; k < envelopeLength; k++)
            {
                envelope[k] /= avg;
            }
        }
    }

    i = 0;

    while (i < envelopeLength)
    {
        var vector = new float[_envelopes.Length * (_modulationFftSize / 2 + 1)];
        var offset = 0;

        foreach (var envelope in _envelopes)
        {
            // copy modFftSize samples (or envelopeLength - i in the end)
            var len = Math.Min(_modulationFftSize, envelopeLength - i);

            envelope.FastCopyTo(_modBlock, len, i);

            // fill zeros to modFftSize if len < modFftSize
            for (var k = len; k < _modBlock.Length; k++)
            {
                _modBlock[k] = 0;
            }

            _modulationFft.PowerSpectrum(_modBlock, _modSpectrum);

            _modSpectrum.FastCopyTo(vector, _modSpectrum.Length, 0, offset);

            offset += _modSpectrum.Length;
        }

        featureVectors.Add(vector);

        i += _modulationHopSize;
    }

    return featureVectors;
}
/// <summary>
/// Constructs a filterbank-based extractor from explicit filterbank weights.
/// The FFT size is derived from the filterbank: 2 * (filter length - 1).
/// </summary>
/// <param name="samplingRate">Sampling rate (passed to the base extractor)</param>
/// <param name="filterbank">Filterbank weights (one array per band); must contain at least one filter</param>
/// <param name="frameDuration">Frame duration, in seconds</param>
/// <param name="hopDuration">Hop duration, in seconds</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (passed to the base extractor)</param>
/// <param name="nonLinearity">Nonlinearity applied to the band spectrum (Log10, LogE, ToDecibel, CubicRoot or none)</param>
/// <param name="spectrumType">Type of spectrum fed to the filterbank; unrecognized values fall back to the power spectrum</param>
/// <param name="window">Window type</param>
/// <param name="logFloor">Floor value passed to the logarithmic nonlinearities</param>
/// <exception cref="ArgumentNullException"><paramref name="filterbank"/> is null</exception>
/// <exception cref="ArgumentException"><paramref name="filterbank"/> contains no filters</exception>
public FilterbankExtractor(int samplingRate, float[][] filterbank, double frameDuration = 0.0256 /*sec*/, double hopDuration = 0.010 /*sec*/, double preEmphasis = 0, NonLinearityType nonLinearity = NonLinearityType.None, SpectrumType spectrumType = SpectrumType.Power, WindowTypes window = WindowTypes.Hamming, float logFloor = float.Epsilon)
    : base(samplingRate, frameDuration, hopDuration, preEmphasis)
{
    // everything below is derived from the filterbank, so fail early and clearly
    if (filterbank == null)
    {
        throw new ArgumentNullException(nameof(filterbank));
    }

    if (filterbank.Length == 0)
    {
        throw new ArgumentException("Filterbank must contain at least one filter!", nameof(filterbank));
    }

    FilterBank = filterbank;
    FeatureCount = filterbank.Length;

    // each filter covers (fftSize / 2 + 1) spectral bins
    _blockSize = 2 * (filterbank[0].Length - 1);

    Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");

    _fft = new RealFft(_blockSize);

    _window = window;
    _windowSamples = Window.OfType(_window, FrameSize);

    // setup spectrum post-processing: =======================================================

    _logFloor = logFloor;
    _nonLinearityType = nonLinearity;

    switch (nonLinearity)
    {
        case NonLinearityType.Log10:
            _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _bandSpectrum, _logFloor);
            break;
        case NonLinearityType.LogE:
            _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _bandSpectrum, _logFloor);
            break;
        case NonLinearityType.ToDecibel:
            _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _bandSpectrum, _logFloor);
            break;
        case NonLinearityType.CubicRoot:
            _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _bandSpectrum, 0.33);
            break;
        default:
            _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum);
            break;
    }

    _spectrumType = spectrumType;

    switch (_spectrumType)
    {
        case SpectrumType.Magnitude:
            _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false);
            break;
        case SpectrumType.MagnitudeNormalized:
            _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true);
            break;
        case SpectrumType.PowerNormalized:
            _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true);
            break;
        // power spectrum is also the fallback (same as in the options-based constructor),
        // so that _getSpectrum is never left unassigned
        default:
            _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false);
            break;
    }

    // reserve memory for reusable blocks
    // (the lambdas above read these fields at call time, so allocating here is fine)

    _spectrum = new float[_blockSize / 2 + 1];
    _bandSpectrum = new float[filterbank.Length];
}
/// <summary>
/// PNCC algorithm according to [Kim & Stern, 2016]:
///     0) [Optional] pre-emphasis
///
/// Decompose signal into overlapping (hopSize) frames of length fftSize. In each frame do:
///
///     1) Apply window (if rectangular window was specified then just do nothing)
///     2) Obtain power spectrum
///     3) Apply gammatone filters (squared)
///     4) Medium-time processing (asymmetric noise suppression, temporal masking, spectral smoothing)
///     5) Apply nonlinearity
///     6) Do dct-II (normalized)
///
/// The first 2*M feature vectors are filled with zeros.
/// </summary>
/// <param name="samples">Samples for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of pncc vectors</returns>
public override List<FeatureVector> ComputeFrom(float[] samples, int startSample, int endSample)
{
    Guard.AgainstInvalidRange(startSample, endSample, "starting pos", "ending pos");

    var hopSize = HopSize;
    var frameSize = FrameSize;

    const float meanPower = 1e10f;
    var mean = 4e07f;

    var d = _power != 0 ? 1.0 / _power : 0.0;

    var prevSample = startSample > 0 ? samples[startSample - 1] : 0.0f;

    var lastSample = endSample - Math.Max(frameSize, hopSize);

    var featureVectors = new List<FeatureVector>();

    // timePos is the position of the frame in samples; i is the index of the frame
    for (int timePos = startSample, i = 0; timePos < lastSample; timePos += hopSize, i++)
    {
        // prepare next block for processing

        _zeroblock.FastCopyTo(_block, _fftSize);
        samples.FastCopyTo(_block, frameSize, timePos);

        // 0) pre-emphasis (if needed)

        if (_preEmphasis > 0.0)
        {
            for (var k = 0; k < frameSize; k++)
            {
                var y = _block[k] - prevSample * _preEmphasis;
                prevSample = _block[k];
                _block[k] = y;
            }

            // the next frame starts at (timePos + hopSize), so its predecessor is this sample;
            // the position must be taken in samples (timePos), not in frames (i)
            prevSample = samples[timePos + hopSize - 1];
        }

        // 1) apply window

        if (_window != WindowTypes.Rectangular)
        {
            _block.ApplyWindow(_windowSamples);
        }

        // 2) calculate power spectrum

        _fft.PowerSpectrum(_block, _spectrum);

        // 3) apply gammatone filterbank

        FilterBanks.Apply(FilterBank, _spectrum, _gammatoneSpectrum);

        // =============================================================
        // 4) medium-time processing blocks:

        // 4.1) temporal integration (zero-phase moving average filter)

        _ringBuffer.Add(_gammatoneSpectrum);

        var spectrumQ = _ringBuffer.AverageSpectrum;

        // 4.2) asymmetric noise suppression

        // seed the noise floor estimate on the first processed frame
        if (i == 2 * M)
        {
            for (var j = 0; j < _spectrumQOut.Length; j++)
            {
                _spectrumQOut[j] = spectrumQ[j] * 0.9f;
            }
        }

        if (i >= 2 * M)
        {
            for (var j = 0; j < _spectrumQOut.Length; j++)
            {
                if (spectrumQ[j] > _spectrumQOut[j])
                {
                    _spectrumQOut[j] = LambdaA * _spectrumQOut[j] + (1 - LambdaA) * spectrumQ[j];
                }
                else
                {
                    _spectrumQOut[j] = LambdaB * _spectrumQOut[j] + (1 - LambdaB) * spectrumQ[j];
                }
            }

            for (var j = 0; j < _filteredSpectrumQ.Length; j++)
            {
                _filteredSpectrumQ[j] = Math.Max(spectrumQ[j] - _spectrumQOut[j], 0.0f);

                // seed both running estimates on the first processed frame
                if (i == 2 * M)
                {
                    _avgSpectrumQ1[j] = 0.9f * _filteredSpectrumQ[j];
                    _avgSpectrumQ2[j] = _filteredSpectrumQ[j];
                }

                if (_filteredSpectrumQ[j] > _avgSpectrumQ1[j])
                {
                    _avgSpectrumQ1[j] = LambdaA * _avgSpectrumQ1[j] + (1 - LambdaA) * _filteredSpectrumQ[j];
                }
                else
                {
                    _avgSpectrumQ1[j] = LambdaB * _avgSpectrumQ1[j] + (1 - LambdaB) * _filteredSpectrumQ[j];
                }

                // 4.3) temporal masking

                // value before masking is used to update the peak tracker below
                var threshold = _filteredSpectrumQ[j];

                _avgSpectrumQ2[j] *= LambdaT;

                if (spectrumQ[j] < C * _spectrumQOut[j])
                {
                    _filteredSpectrumQ[j] = _avgSpectrumQ1[j];
                }
                else
                {
                    if (_filteredSpectrumQ[j] <= _avgSpectrumQ2[j])
                    {
                        _filteredSpectrumQ[j] = MuT * _avgSpectrumQ2[j];
                    }
                }

                _avgSpectrumQ2[j] = Math.Max(_avgSpectrumQ2[j], threshold);

                _filteredSpectrumQ[j] = Math.Max(_filteredSpectrumQ[j], _avgSpectrumQ1[j]);
            }

            // 4.4) spectral smoothing

            for (var j = 0; j < _spectrumS.Length; j++)
            {
                _spectrumS[j] = _filteredSpectrumQ[j] / Math.Max(spectrumQ[j], float.Epsilon);
            }

            // average over bands [j - N, j + N], clipped to the filterbank range
            for (var j = 0; j < _smoothedSpectrumS.Length; j++)
            {
                _smoothedSpectrumS[j] = 0.0f;

                var total = 0;
                for (var k = Math.Max(j - N, 0); k < Math.Min(j + N + 1, _filterbankSize); k++, total++)
                {
                    _smoothedSpectrumS[j] += _spectrumS[k];
                }
                _smoothedSpectrumS[j] /= total;
            }

            // 4.5) mean power normalization

            var centralSpectrum = _ringBuffer.CentralSpectrum;

            var sumPower = 0.0f;
            for (var j = 0; j < _smoothedSpectrum.Length; j++)
            {
                _smoothedSpectrum[j] = _smoothedSpectrumS[j] * centralSpectrum[j];
                sumPower += _smoothedSpectrum[j];
            }

            mean = LambdaMu * mean + (1 - LambdaMu) * sumPower;

            for (var j = 0; j < _smoothedSpectrum.Length; j++)
            {
                _smoothedSpectrum[j] *= meanPower / mean;
            }

            // =============================================================

            // 5) nonlinearity (power ^ d or Log10)

            if (_power != 0)
            {
                for (var j = 0; j < _smoothedSpectrum.Length; j++)
                {
                    _smoothedSpectrum[j] = (float)Math.Pow(_smoothedSpectrum[j], d);
                }
            }
            else
            {
                for (var j = 0; j < _smoothedSpectrum.Length; j++)
                {
                    _smoothedSpectrum[j] = (float)Math.Log10(_smoothedSpectrum[j] + float.Epsilon);
                }
            }

            // 6) dct-II (normalized)

            var pnccs = new float[FeatureCount];
            _dct.DirectN(_smoothedSpectrum, pnccs);

            // add pncc vector to output sequence

            featureVectors.Add(new FeatureVector
            {
                Features = pnccs,
                TimePosition = (double)timePos / SamplingRate
            });
        }

        // first 2*M vectors are empty
        else
        {
            featureVectors.Add(new FeatureVector
            {
                Features = new float[FeatureCount],
                TimePosition = (double)timePos / SamplingRate
            });
        }
    }

    return featureVectors;
}
/// <summary>
/// S(implified)PNCC algorithm according to [Kim & Stern, 2016]:
///     0) [Optional] pre-emphasis
///
/// Decompose signal into overlapping (hopSize) frames of length fftSize. In each frame do:
///
///     1) Apply window (if rectangular window was specified then just do nothing)
///     2) Obtain power spectrum
///     3) Apply gammatone filters (squared)
///     4) Mean power normalization
///     5) Apply nonlinearity
///     6) Do dct-II (normalized)
///
/// </summary>
/// <remarks>
/// The gammatone filterbank is rebuilt on every call (and stored in the extractor),
/// since it depends on the sampling rate of the signal.
/// </remarks>
/// <param name="signal">Signal for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of spncc vectors</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ====================================== PREPARE =======================================

    // HopSize and FrameSize are multiplied by the sampling rate to get sizes in samples
    var hopSize = (int)(signal.SamplingRate * HopSize);
    var frameSize = (int)(signal.SamplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    // effective FFT size: the configured one is used only if the frame fits into it
    var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize);

    // the filterbank must be designed for the effective FFT size
    // (otherwise its bins don't match the bins of the spectrum computed below)
    _gammatoneFilterBank = FilterBanks.Erb(_filterbankSize, fftSize, signal.SamplingRate, _lowFreq, _highFreq);

    // use power spectrum:

    foreach (var filter in _gammatoneFilterBank)
    {
        for (var j = 0; j < filter.Length; j++)
        {
            filter[j] *= filter[j];
        }
    }

    var fft = new Fft(fftSize);
    var dct = new Dct2(_filterbankSize, FeatureCount);

    var gammatoneSpectrum = new float[_filterbankSize];

    const float meanPower = 1e10f;
    var mean = 4e07f;

    var d = _power != 0 ? 1.0 / _power : 0.0;

    var block = new float[fftSize];             // buffer for a signal block at each step
    var zeroblock = new float[fftSize];         // buffer of zeros for quick memset
    var spectrum = new float[fftSize / 2 + 1];

    // ================================= MAIN PROCESSING ==================================

    var featureVectors = new List<FeatureVector>();

    var prevSample = startSample > 0 ? signal[startSample - 1] : 0.0f;

    var i = startSample;
    while (i + frameSize < endSample)
    {
        // prepare next block for processing

        zeroblock.FastCopyTo(block, zeroblock.Length);
        signal.Samples.FastCopyTo(block, frameSize, i);

        // 0) pre-emphasis (if needed)

        if (_preEmphasis > 0.0)
        {
            for (var k = 0; k < frameSize; k++)
            {
                var y = block[k] - prevSample * _preEmphasis;
                prevSample = block[k];
                block[k] = y;
            }

            // the next frame starts at (i + hopSize), so its predecessor is this sample
            prevSample = signal[i + hopSize - 1];
        }

        // 1) apply window

        if (_window != WindowTypes.Rectangular)
        {
            block.ApplyWindow(windowSamples);
        }

        // 2) calculate power spectrum

        fft.PowerSpectrum(block, spectrum);

        // 3) apply gammatone filterbank

        FilterBanks.Apply(_gammatoneFilterBank, spectrum, gammatoneSpectrum);

        // 4) mean power normalization:

        var sumPower = 0.0f;
        for (var j = 0; j < gammatoneSpectrum.Length; j++)
        {
            sumPower += gammatoneSpectrum[j];
        }

        mean = LambdaMu * mean + (1 - LambdaMu) * sumPower;

        for (var j = 0; j < gammatoneSpectrum.Length; j++)
        {
            gammatoneSpectrum[j] *= meanPower / mean;
        }

        // 5) nonlinearity (power ^ d or Log10)

        if (_power != 0)
        {
            for (var j = 0; j < gammatoneSpectrum.Length; j++)
            {
                gammatoneSpectrum[j] = (float)Math.Pow(gammatoneSpectrum[j], d);
            }
        }
        else
        {
            for (var j = 0; j < gammatoneSpectrum.Length; j++)
            {
                gammatoneSpectrum[j] = (float)Math.Log10(gammatoneSpectrum[j] + float.Epsilon);
            }
        }

        // 6) dct-II (normalized)

        var spnccs = new float[FeatureCount];
        dct.DirectN(gammatoneSpectrum, spnccs);

        // add spncc vector to output sequence

        featureVectors.Add(new FeatureVector
        {
            Features = spnccs,
            TimePosition = (double)i / signal.SamplingRate
        });

        i += hopSize;
    }

    return featureVectors;
}
/// <summary>
/// Constructs MFCC extractor.
/// If no filterbank is given, a triangular mel filterbank (HTK/Kaldi-style) is created;
/// otherwise the given filterbank defines both the number of bands and the FFT size.
/// </summary>
/// <param name="samplingRate">Sampling rate (passed to the base extractor)</param>
/// <param name="featureCount">Number of MFCC coefficients</param>
/// <param name="frameDuration">Frame duration, in seconds</param>
/// <param name="hopDuration">Hop duration, in seconds</param>
/// <param name="filterbankSize">Number of mel bands (ignored if <paramref name="filterbank"/> is given)</param>
/// <param name="lowFreq">Lower frequency of the mel filterbank</param>
/// <param name="highFreq">Upper frequency of the mel filterbank</param>
/// <param name="fftSize">FFT size; used only if it exceeds the frame size (otherwise the next power of two of the frame size is taken); ignored if <paramref name="filterbank"/> is given</param>
/// <param name="filterbank">Custom filterbank weights (null for the default mel filterbank)</param>
/// <param name="lifterSize">Liftering parameter (liftering is applied only if it is positive)</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (passed to the base extractor)</param>
/// <param name="includeEnergy">Whether log-energy should be included in the feature vector</param>
/// <param name="dctType">"1", "1N", "2", "2N", "3", "3N", "4", "4N" (suffix "N" selects the normalized DCT)</param>
/// <param name="nonLinearity">Nonlinearity applied to the mel spectrum (Log10, LogE, ToDecibel, CubicRoot or none)</param>
/// <param name="spectrumType">Type of spectrum fed to the filterbank; unrecognized values fall back to the power spectrum</param>
/// <param name="window">Window type</param>
/// <param name="logFloor">Floor value passed to the logarithmic nonlinearities</param>
/// <exception cref="ArgumentException"><paramref name="dctType"/> is null, empty, or does not start with '1', '2', '3' or '4'</exception>
public MfccExtractor(int samplingRate, int featureCount, double frameDuration = 0.0256 /*sec*/, double hopDuration = 0.010 /*sec*/, int filterbankSize = 24, double lowFreq = 0, double highFreq = 0, int fftSize = 0, float[][] filterbank = null, int lifterSize = 0, double preEmphasis = 0, bool includeEnergy = false, string dctType = "2N", NonLinearityType nonLinearity = NonLinearityType.Log10, SpectrumType spectrumType = SpectrumType.Power, WindowTypes window = WindowTypes.Hamming, float logFloor = float.Epsilon)
    : base(samplingRate, frameDuration, hopDuration, preEmphasis)
{
    FeatureCount = featureCount;

    _lowFreq = lowFreq;
    _highFreq = highFreq;

    if (filterbank == null)
    {
        _blockSize = fftSize > FrameSize ? fftSize : MathUtils.NextPowerOfTwo(FrameSize);

        var melBands = FilterBanks.MelBands(filterbankSize, _blockSize, SamplingRate, _lowFreq, _highFreq);
        FilterBank = FilterBanks.Triangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel);   // HTK/Kaldi-style
    }
    else
    {
        FilterBank = filterbank;
        filterbankSize = filterbank.Length;

        // each filter covers (fftSize / 2 + 1) spectral bins
        _blockSize = 2 * (filterbank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");
    }

    _fft = new RealFft(_blockSize);

    _window = window;
    _windowSamples = Window.OfType(_window, FrameSize);

    _lifterSize = lifterSize;
    _lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null;

    _includeEnergy = includeEnergy;

    // setup DCT: ============================================================================

    // a missing DCT type is reported the same way as an unsupported one
    // (instead of failing on dctType[0] with an unrelated exception)
    if (string.IsNullOrEmpty(dctType))
    {
        throw new ArgumentException("Only DCT-1, 2, 3 and 4 are supported!");
    }

    _dctType = dctType;

    switch (dctType[0])
    {
        case '1':
            _dct = new Dct1(filterbankSize);
            break;
        case '2':
            _dct = new Dct2(filterbankSize);
            break;
        case '3':
            _dct = new Dct3(filterbankSize);
            break;
        case '4':
            _dct = new Dct4(filterbankSize);
            break;
        default:
            throw new ArgumentException("Only DCT-1, 2, 3 and 4 are supported!");
    }

    // the suffix is a fixed identifier, so it is compared culture-independently
    if (dctType.Length > 1 && char.ToUpperInvariant(dctType[1]) == 'N')
    {
        _applyDct = mfccs => _dct.DirectNorm(_melSpectrum, mfccs);
    }
    else
    {
        _applyDct = mfccs => _dct.Direct(_melSpectrum, mfccs);
    }

    // setup spectrum post-processing: =======================================================

    _logFloor = logFloor;
    _nonLinearityType = nonLinearity;

    switch (nonLinearity)
    {
        case NonLinearityType.Log10:
            _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _melSpectrum, _logFloor);
            break;
        case NonLinearityType.LogE:
            _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _melSpectrum, _logFloor);
            break;
        case NonLinearityType.ToDecibel:
            _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _melSpectrum, _logFloor);
            break;
        case NonLinearityType.CubicRoot:
            _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _melSpectrum, 0.33);
            break;
        default:
            _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _melSpectrum);
            break;
    }

    _spectrumType = spectrumType;

    switch (_spectrumType)
    {
        case SpectrumType.Magnitude:
            _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false);
            break;
        case SpectrumType.MagnitudeNormalized:
            _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true);
            break;
        case SpectrumType.PowerNormalized:
            _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true);
            break;
        // power spectrum is also the fallback, so that _getSpectrum is never left unassigned
        default:
            _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false);
            break;
    }

    // reserve memory for reusable blocks
    // (the lambdas above read these fields at call time, so allocating here is fine)

    _spectrum = new float[_blockSize / 2 + 1];
    _melSpectrum = new float[filterbankSize];
}
/// <summary>
/// Constructs a filterbank-based extractor from options.
/// If the options carry no filterbank, a rectangular filterbank over mel bands is created;
/// otherwise the given filterbank defines both the number of features and the FFT size.
/// </summary>
/// <param name="options">Filterbank options</param>
public FilterbankExtractor(FilterbankOptions options) : base(options)
{
    var bandCount = options.FilterBankSize;

    if (options.FilterBank != null)
    {
        // custom filterbank: sizes are derived from its weights
        FilterBank = options.FilterBank;
        bandCount = FilterBank.Length;

        _blockSize = 2 * (FilterBank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");
    }
    else
    {
        // default filterbank: FFT size from options is used only if it exceeds the frame size
        if (options.FftSize > FrameSize)
        {
            _blockSize = options.FftSize;
        }
        else
        {
            _blockSize = MathUtils.NextPowerOfTwo(FrameSize);
        }

        var melBands = FilterBanks.MelBands(bandCount, SamplingRate, options.LowFrequency, options.HighFrequency, false);

        FilterBank = FilterBanks.Rectangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel);
    }

    FeatureCount = bandCount;

    _fft = new RealFft(_blockSize);

    // reusable buffers (the delegates below access them through the fields)
    _spectrum = new float[_blockSize / 2 + 1];
    _bandSpectrum = new float[bandCount];

    // spectrum post-processing: filterbank followed by the selected nonlinearity
    _logFloor = options.LogFloor;
    _nonLinearityType = options.NonLinearity;

    if (_nonLinearityType == NonLinearityType.Log10)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.LogE)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.ToDecibel)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.CubicRoot)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _bandSpectrum, 0.33);
    }
    else
    {
        _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum);
    }

    // spectrum computation: magnitude or power, normalized or not (plain power by default)
    _spectrumType = options.SpectrumType;

    if (_spectrumType == SpectrumType.Magnitude)
    {
        _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false);
    }
    else if (_spectrumType == SpectrumType.MagnitudeNormalized)
    {
        _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true);
    }
    else if (_spectrumType == SpectrumType.PowerNormalized)
    {
        _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true);
    }
    else
    {
        _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false);
    }
}
/// <summary>
/// Method for computing modulation spectra.
/// Each vector representing one modulation spectrum is a flattened version of 2D spectrum
/// (power spectra of all band envelopes, one after another).
/// </summary>
/// <remarks>
/// If a featuregram was supplied, its columns are used as envelopes and the samples of
/// <paramref name="signal"/> are not analyzed (only its sampling rate is used).
/// Otherwise envelopes are obtained from STFT and a filterbank
/// (a default triangular mel filterbank is created and cached if none was given).
/// </remarks>
/// <param name="signal">Signal under analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of flattened modulation spectra</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ====================================== PREPARE =======================================

    // HopSize and FrameSize are multiplied by the sampling rate to get sizes in samples
    var hopSize = (int)(signal.SamplingRate * HopSize);
    var frameSize = (int)(signal.SamplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    // effective FFT size: the configured one is used only if the frame fits into it
    var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize);

    var fft = new Fft(fftSize);
    var modulationFft = new Fft(_modulationFftSize);

    if (_featuregram == null && _filterbank == null)
    {
        // the default filterbank must be designed for the effective FFT size
        // (otherwise its bins don't match the bins of the spectrum computed below)
        _filterbank = FilterBanks.Triangular(fftSize, signal.SamplingRate,
                        FilterBanks.MelBands(12, fftSize, signal.SamplingRate, 100, 3200));
    }

    // number of envelopes: featuregram takes precedence over filterbank
    // (same rule as in the main processing below)
    var bandCount = _featuregram == null ? _filterbank.Length : _featuregram[0].Length;

    _featureCount = bandCount * (_modulationFftSize / 2 + 1);

    var modulationSamplingRate = (float)signal.SamplingRate / hopSize;
    var resolution = modulationSamplingRate / _modulationFftSize;

    _featureDescriptions = new string[_featureCount];

    var idx = 0;
    for (var fi = 0; fi < bandCount; fi++)
    {
        for (var fj = 0; fj <= _modulationFftSize / 2; fj++)
        {
            _featureDescriptions[idx++] = string.Format("band_{0}_mf_{1:F2}_Hz", fi + 1, fj * resolution);
        }
    }

    // NOTE: pre-emphasis is done frame by frame in the loop below;
    // it used to be applied to the entire signal as well, i.e. twice

    // ================================= MAIN PROCESSING ==================================

    var featureVectors = new List<FeatureVector>();

    var en = 0;             // number of envelope samples (i.e. number of analyzed frames)
    var i = startSample;

    if (_featuregram == null)
    {
        _envelopes = new float[_filterbank.Length][];

        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[signal.Length / hopSize];
        }

        var prevSample = startSample > 0 ? signal[startSample - 1] : 0.0f;

        // ===================== compute local FFTs (do STFT) =======================

        var spectrum = new float[fftSize / 2 + 1];
        var filteredSpectrum = new float[_filterbank.Length];

        var block = new float[fftSize];           // buffer for currently processed signal block at each step
        var zeroblock = new float[fftSize];       // buffer of zeros for quick memset

        while (i + frameSize < endSample)
        {
            zeroblock.FastCopyTo(block, zeroblock.Length);
            signal.Samples.FastCopyTo(block, frameSize, i);

            // 0) pre-emphasis (if needed)

            if (_preEmphasis > 0.0)
            {
                for (var k = 0; k < frameSize; k++)
                {
                    var y = block[k] - prevSample * _preEmphasis;
                    prevSample = block[k];
                    block[k] = y;
                }

                // the next frame starts at (i + hopSize), so its predecessor is this sample
                prevSample = signal[i + hopSize - 1];
            }

            // 1) apply window

            if (_window != WindowTypes.Rectangular)
            {
                block.ApplyWindow(windowSamples);
            }

            // 2) calculate power spectrum

            fft.PowerSpectrum(block, spectrum);

            // 3) apply filterbank...

            FilterBanks.Apply(_filterbank, spectrum, filteredSpectrum);

            // ...and save results for future calculations

            for (var n = 0; n < _envelopes.Length; n++)
            {
                _envelopes[n][en] = filteredSpectrum[n];
            }

            en++;

            i += hopSize;
        }
    }
    else
    {
        // envelopes are the columns of the featuregram
        en = _featuregram.Length;

        _envelopes = new float[_featuregram[0].Length][];

        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[en];

            for (i = 0; i < en; i++)
            {
                _envelopes[n][i] = _featuregram[i][n];
            }
        }
    }

    // =========================== modulation analysis =======================

    var envelopeLength = en;

    // long-term AVG-normalization

    foreach (var envelope in _envelopes)
    {
        // mean absolute value of the envelope;
        // (the sign test used to be done on the loop index, so negative values
        //  - possible in a featuregram - were never negated)
        var avg = 0.0f;

        for (var k = 0; k < envelopeLength; k++)
        {
            avg += Math.Abs(envelope[k]);
        }

        avg /= envelopeLength;

        if (avg >= 1e-10)   // this happens more frequently
        {
            for (var k = 0; k < envelopeLength; k++)
            {
                envelope[k] /= avg;
            }
        }
    }

    var modBlock = new float[_modulationFftSize];
    var zeroModblock = new float[_modulationFftSize];
    var modSpectrum = new float[_modulationFftSize / 2 + 1];

    i = 0;

    while (i < envelopeLength)
    {
        var vector = new float[_envelopes.Length * (_modulationFftSize / 2 + 1)];
        var offset = 0;

        foreach (var envelope in _envelopes)
        {
            // copy modFftSize samples (or envelopeLength - i in the end), the rest are zeros
            zeroModblock.FastCopyTo(modBlock, _modulationFftSize);
            envelope.FastCopyTo(modBlock, Math.Min(_modulationFftSize, envelopeLength - i), i);

            modulationFft.PowerSpectrum(modBlock, modSpectrum);

            modSpectrum.FastCopyTo(vector, modSpectrum.Length, 0, offset);

            offset += modSpectrum.Length;
        }

        featureVectors.Add(new FeatureVector
        {
            Features = vector,
            TimePosition = (double)i * hopSize / signal.SamplingRate
        });

        i += _modulationHopSize;
    }

    return featureVectors;
}
/// <summary>
/// Computes one S(implified)PNCC feature vector from a single frame, following [Kim and Stern, 2016].
/// The frame goes through these stages:
///
/// 1) Zero the part of the block that lies beyond FrameSize and apply the window
/// 2) Obtain power spectrum
/// 3) Apply gammatone filters (squared)
/// 4) Mean power normalization (updates the running mean kept between calls)
/// 5) Apply nonlinearity (root of order _power, or Log10 when _power is 0)
/// 6) Do dct-II (normalized)
///
/// </summary>
/// <param name="block">Samples of the frame; the array is modified in place</param>
/// <returns>Newly allocated array of FeatureCount coefficients</returns>
public override float[] ProcessFrame(float[] block)
{
    const float meanPower = 1e10f;

    // everything past the frame is cleared so the block is zero-padded
    for (var pos = FrameSize; pos < block.Length; pos++)
    {
        block[pos] = 0;
    }

    // 1) apply window
    block.ApplyWindow(_windowSamples);

    // 2) calculate power spectrum
    _fft.PowerSpectrum(block, _spectrum, false);

    // 3) apply gammatone filterbank
    FilterBanks.Apply(FilterBank, _spectrum, _filteredSpectrum);

    // 4) mean power normalization:
    //    total power of the frame feeds the running mean,
    //    then every band is rescaled against that mean
    var framePower = 0.0f;
    foreach (var bandPower in _filteredSpectrum)
    {
        framePower += bandPower;
    }

    _mean = LambdaMu * _mean + (1 - LambdaMu) * framePower;

    var gain = meanPower / _mean;
    for (var band = 0; band < _filteredSpectrum.Length; band++)
    {
        _filteredSpectrum[band] *= gain;
    }

    // 5) nonlinearity (pow ^ d or Log10)
    if (_power == 0)
    {
        for (var band = 0; band < _filteredSpectrum.Length; band++)
        {
            _filteredSpectrum[band] = (float)Math.Log10(_filteredSpectrum[band] + float.Epsilon);
        }
    }
    else
    {
        var exponent = 1.0 / _power;
        for (var band = 0; band < _filteredSpectrum.Length; band++)
        {
            _filteredSpectrum[band] = (float)Math.Pow(_filteredSpectrum[band], exponent);
        }
    }

    // 6) dct-II (normalized)
    var coefficients = new float[FeatureCount];
    _dct.DirectNorm(_filteredSpectrum, coefficients);

    return coefficients;
}
/// <summary>
/// S(implified)PNCC algorithm according to [Kim and Stern, 2016].
///
/// The samples are cut into frames of FrameSize samples taken every HopSize samples.
/// A frame starting at position p is processed while p + FrameSize is less than endSample.
/// Each frame goes through:
///
/// 0) [Optional] pre-emphasis (only if the pre-emphasis coefficient is positive)
/// 1) Apply window (skipped for the rectangular window)
/// 2) Obtain power spectrum
/// 3) Apply gammatone filters (squared)
/// 4) Mean power normalization
/// 5) Apply nonlinearity (root of order _power, or Log10 when _power is 0)
/// 6) Do dct-II (normalized)
///
/// </summary>
/// <param name="samples">Samples for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The position that bounds processing (see the frame condition above)</param>
/// <returns>List of SPNCC vectors, each stamped with the start time of its frame</returns>
public override List<FeatureVector> ComputeFrom(float[] samples, int startSample, int endSample)
{
    Guard.AgainstInvalidRange(startSample, endSample, "starting pos", "ending pos");

    const float meanPower = 1e10f;

    var frameLength = FrameSize;
    var hop = HopSize;

    // running estimate of the mean frame power, local to this call
    var runningMean = 4e07f;

    // exponent of the power nonlinearity (unused when _power is 0)
    var exponent = _power != 0 ? 1.0 / _power : 0.0;

    var result = new List<FeatureVector>();

    // the sample that precedes the current one; needed by the pre-emphasis difference
    var previous = startSample > 0 ? samples[startSample - 1] : 0.0f;

    for (var pos = startSample; pos + frameLength < endSample; pos += hop)
    {
        // prepare next block for processing: wipe the buffer, then copy the frame in
        _zeroblock.FastCopyTo(_block, _zeroblock.Length);
        samples.FastCopyTo(_block, frameLength, pos);

        // 0) pre-emphasis (if needed)
        if (_preEmphasis > 0.0)
        {
            for (var k = 0; k < frameLength; k++)
            {
                var current = _block[k];
                _block[k] = current - previous * _preEmphasis;
                previous = current;
            }

            // the next frame starts one hop later, so remember the sample right before it
            previous = samples[pos + hop - 1];
        }

        // 1) apply window
        if (_window != WindowTypes.Rectangular)
        {
            _block.ApplyWindow(_windowSamples);
        }

        // 2) calculate power spectrum
        _fft.PowerSpectrum(_block, _spectrum);

        // 3) apply gammatone filterbank
        FilterBanks.Apply(FilterBank, _spectrum, _filteredSpectrum);

        // 4) mean power normalization:
        var framePower = 0.0f;
        foreach (var bandPower in _filteredSpectrum)
        {
            framePower += bandPower;
        }

        runningMean = LambdaMu * runningMean + (1 - LambdaMu) * framePower;

        var gain = meanPower / runningMean;
        for (var band = 0; band < _filteredSpectrum.Length; band++)
        {
            _filteredSpectrum[band] *= gain;
        }

        // 5) nonlinearity (power ^ d or Log10)
        if (_power == 0)
        {
            for (var band = 0; band < _filteredSpectrum.Length; band++)
            {
                _filteredSpectrum[band] = (float)Math.Log10(_filteredSpectrum[band] + float.Epsilon);
            }
        }
        else
        {
            for (var band = 0; band < _filteredSpectrum.Length; band++)
            {
                _filteredSpectrum[band] = (float)Math.Pow(_filteredSpectrum[band], exponent);
            }
        }

        // 6) dct-II (normalized)
        var coefficients = new float[FeatureCount];
        _dct.DirectN(_filteredSpectrum, coefficients);

        // add the vector to output sequence
        var vector = new FeatureVector
        {
            Features = coefficients,
            TimePosition = (double)pos / SamplingRate
        };
        result.Add(vector);
    }

    return result;
}
/// <summary>
/// PNCC algorithm according to [Kim and Stern, 2016]:
/// 0) [Optional] pre-emphasis (applied to the whole signal before framing)
///
/// Decompose signal into overlapping (hopSize) frames of length fftSize. In each frame do:
///
/// 1) Apply window (if rectangular window was specified then just do nothing)
/// 2) Obtain power spectrum
/// 3) Apply gammatone filters (squared)
/// 4) Medium-time processing (asymmetric noise suppression, temporal masking, spectral smoothing)
/// 5) Apply nonlinearity
/// 6) Do dct-II (normalized)
///
/// The first 2*M frames only fill the ring buffer of spectra; feature vectors are
/// produced starting from the frame with index 2*M.
///
/// </summary>
/// <param name="signal">Signal for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">Frames are taken while (frame start + frame size) is less than this position</param>
/// <returns>List of pncc vectors</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ====================================== PREPARE =======================================

    var hopSize = (int)(signal.SamplingRate * HopSize);
    var frameSize = (int)(signal.SamplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    // the configured FFT size is used only if the frame fits into it
    var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize);

    // The filterbank is built for the effective FFT size (fftSize), not the configured one:
    // the spectrum below has fftSize / 2 + 1 bins, and when the frame is longer than _fftSize
    // a filterbank laid out for _fftSize would not line up with that spectrum.
    _gammatoneFilterBank = FilterBanks.Erb(_filterbankSize, fftSize, signal.SamplingRate, _lowFreq, _highFreq);

    // use power spectrum: square each filter weight in place
    foreach (var filter in _gammatoneFilterBank)
    {
        for (var j = 0; j < filter.Length; j++)
        {
            var ps = filter[j] * filter[j];
            filter[j] = ps;
        }
    }

    var fft = new Fft(fftSize);
    var dct = new Dct2(_filterbankSize, FeatureCount);

    var gammatoneSpectrum = new float[_filterbankSize];
    var spectrumQOut = new float[_filterbankSize];
    var filteredSpectrumQ = new float[_filterbankSize];
    var spectrumS = new float[_filterbankSize];
    var smoothedSpectrumS = new float[_filterbankSize];
    var avgSpectrumQ1 = new float[_filterbankSize];
    var avgSpectrumQ2 = new float[_filterbankSize];
    var smoothedSpectrum = new float[_filterbankSize];

    const float meanPower = 1e10f;

    // running estimate of the mean frame power, local to this call
    var mean = 4e07f;

    // exponent of the power nonlinearity (unused when _power is 0)
    var d = _power != 0 ? 1.0 / _power : 0.0;

    var block = new float[fftSize];       // buffer for currently processed signal block at each step
    var zeroblock = new float[fftSize];   // buffer of zeros for quick memset

    _ringBuffer = new SpectraRingBuffer(2 * M + 1, _filterbankSize);

    var spectrum = new float[fftSize / 2 + 1];

    // 0) pre-emphasis (if needed)
    if (_preEmphasis > 0.0)
    {
        var preemphasisFilter = new PreEmphasisFilter(_preEmphasis);
        signal = preemphasisFilter.ApplyTo(signal);
    }

    // ================================= MAIN PROCESSING ==================================

    var featureVectors = new List<FeatureVector>();

    var i = 0;                    // index of the current frame
    var timePos = startSample;    // position of the first sample of the current frame

    while (timePos + frameSize < endSample)
    {
        // prepare next block for processing
        zeroblock.FastCopyTo(block, zeroblock.Length);
        signal.Samples.FastCopyTo(block, frameSize, timePos);

        // 1) apply window
        if (_window != WindowTypes.Rectangular)
        {
            block.ApplyWindow(windowSamples);
        }

        // 2) calculate power spectrum
        fft.PowerSpectrum(block, spectrum);

        // 3) apply gammatone filterbank
        FilterBanks.Apply(_gammatoneFilterBank, spectrum, gammatoneSpectrum);

        // =============================================================
        // 4) medium-time processing blocks:

        // 4.1) temporal integration (zero-phase moving average filter)
        _ringBuffer.Add(gammatoneSpectrum);

        var spectrumQ = _ringBuffer.AverageSpectrum;

        // 4.2) asymmetric noise suppression

        // on the first frame that is processed, seed the noise floor estimate
        if (i == 2 * M)
        {
            for (var j = 0; j < spectrumQOut.Length; j++)
            {
                spectrumQOut[j] = spectrumQ[j] * 0.9f;
            }
        }

        if (i >= 2 * M)
        {
            for (var j = 0; j < spectrumQOut.Length; j++)
            {
                if (spectrumQ[j] > spectrumQOut[j])
                {
                    spectrumQOut[j] = LambdaA * spectrumQOut[j] + (1 - LambdaA) * spectrumQ[j];
                }
                else
                {
                    spectrumQOut[j] = LambdaB * spectrumQOut[j] + (1 - LambdaB) * spectrumQ[j];
                }
            }

            for (var j = 0; j < filteredSpectrumQ.Length; j++)
            {
                filteredSpectrumQ[j] = Math.Max(spectrumQ[j] - spectrumQOut[j], 0.0f);

                // on the first processed frame, seed both running envelopes
                if (i == 2 * M)
                {
                    avgSpectrumQ1[j] = 0.9f * filteredSpectrumQ[j];
                    avgSpectrumQ2[j] = filteredSpectrumQ[j];
                }

                if (filteredSpectrumQ[j] > avgSpectrumQ1[j])
                {
                    avgSpectrumQ1[j] = LambdaA * avgSpectrumQ1[j] + (1 - LambdaA) * filteredSpectrumQ[j];
                }
                else
                {
                    avgSpectrumQ1[j] = LambdaB * avgSpectrumQ1[j] + (1 - LambdaB) * filteredSpectrumQ[j];
                }

                // 4.3) temporal masking

                // keep the unmasked value: it updates the peak envelope after masking
                var threshold = filteredSpectrumQ[j];

                avgSpectrumQ2[j] *= LambdaT;

                if (spectrumQ[j] < C * spectrumQOut[j])
                {
                    filteredSpectrumQ[j] = avgSpectrumQ1[j];
                }
                else
                {
                    if (filteredSpectrumQ[j] <= avgSpectrumQ2[j])
                    {
                        filteredSpectrumQ[j] = MuT * avgSpectrumQ2[j];
                    }
                }

                avgSpectrumQ2[j] = Math.Max(avgSpectrumQ2[j], threshold);

                filteredSpectrumQ[j] = Math.Max(filteredSpectrumQ[j], avgSpectrumQ1[j]);
            }

            // 4.4) spectral smoothing

            for (var j = 0; j < spectrumS.Length; j++)
            {
                spectrumS[j] = filteredSpectrumQ[j] / Math.Max(spectrumQ[j], float.Epsilon);
            }

            // average each band with up to N neighbours on either side
            for (var j = 0; j < smoothedSpectrumS.Length; j++)
            {
                smoothedSpectrumS[j] = 0.0f;

                var total = 0;
                for (var k = Math.Max(j - N, 0); k < Math.Min(j + N + 1, _filterbankSize); k++, total++)
                {
                    smoothedSpectrumS[j] += spectrumS[k];
                }

                smoothedSpectrumS[j] /= total;
            }

            // 4.5) mean power normalization

            var centralSpectrum = _ringBuffer.CentralSpectrum;

            var sumPower = 0.0f;
            for (var j = 0; j < smoothedSpectrum.Length; j++)
            {
                smoothedSpectrum[j] = smoothedSpectrumS[j] * centralSpectrum[j];
                sumPower += smoothedSpectrum[j];
            }

            mean = LambdaMu * mean + (1 - LambdaMu) * sumPower;

            for (var j = 0; j < smoothedSpectrum.Length; j++)
            {
                smoothedSpectrum[j] *= meanPower / mean;
            }

            // =============================================================

            // 5) nonlinearity (power ^ d or Log10)
            if (_power != 0)
            {
                for (var j = 0; j < smoothedSpectrum.Length; j++)
                {
                    smoothedSpectrum[j] = (float)Math.Pow(smoothedSpectrum[j], d);
                }
            }
            else
            {
                for (var j = 0; j < smoothedSpectrum.Length; j++)
                {
                    smoothedSpectrum[j] = (float)Math.Log10(smoothedSpectrum[j] + float.Epsilon);
                }
            }

            // 6) dct-II (normalized)
            var pnccs = new float[FeatureCount];
            dct.DirectN(smoothedSpectrum, pnccs);

            // add pncc vector to output sequence
            featureVectors.Add(new FeatureVector
            {
                Features = pnccs,
                TimePosition = (double)timePos / signal.SamplingRate
            });
        }

        i++;
        timePos += hopSize;
    }

    return featureVectors;
}
/// <summary>
/// Constructs extractor from configuration <paramref name="options"/>.
/// <para>
/// If the options carry no filterbank, a triangular mel filterbank is built and the block (FFT) size
/// is taken from the options, or from the next power of two of the frame size when the configured
/// FFT size is not larger than the frame. If a filterbank is supplied, the block size is derived
/// from the length of its first filter, and the frame size is checked against it.
/// </para>
/// <para>
/// The DCT, the spectrum post-processing and the spectrum type are selected once here
/// and stored as delegates.
/// </para>
/// </summary>
/// <param name="options">MFCC configuration options</param>
public MfccExtractor(MfccOptions options) : base(options)
{
    FeatureCount = options.FeatureCount;

    var bandCount = options.FilterBankSize;
    var customBank = options.FilterBank;

    if (customBank is null)
    {
        if (options.FftSize > FrameSize)
        {
            _blockSize = options.FftSize;
        }
        else
        {
            _blockSize = MathUtils.NextPowerOfTwo(FrameSize);
        }

        var melBands = FilterBanks.MelBands(bandCount, SamplingRate, options.LowFrequency, options.HighFrequency);

        // HTK/Kaldi-style
        FilterBank = FilterBanks.Triangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel);
    }
    else
    {
        FilterBank = customBank;
        bandCount = FilterBank.Length;

        // each filter covers blockSize / 2 + 1 spectral bins
        _blockSize = 2 * (FilterBank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");
    }

    _fft = new RealFft(_blockSize);

    // liftering is active only for a positive lifter size
    _lifterSize = options.LifterSize;
    if (_lifterSize > 0)
    {
        _lifterCoeffs = Window.Liftering(FeatureCount, _lifterSize);
    }
    else
    {
        _lifterCoeffs = null;
    }

    _includeEnergy = options.IncludeEnergy;
    _logEnergyFloor = options.LogEnergyFloor;

    // setup DCT: ============================================================================
    // the first character of the type string picks the DCT kind ('2' and anything unknown -> DCT-II),
    // a trailing "N" (case-insensitive) picks the normalized transform

    _dctType = options.DctType;

    var dctKind = _dctType[0];

    if (dctKind == '1')
    {
        _dct = new Dct1(bandCount);
    }
    else if (dctKind == '3')
    {
        _dct = new Dct3(bandCount);
    }
    else if (dctKind == '4')
    {
        _dct = new Dct4(bandCount);
    }
    else
    {
        _dct = new Dct2(bandCount);
    }

    var normalizedDct = _dctType.EndsWith("N", StringComparison.OrdinalIgnoreCase);

    if (normalizedDct)
    {
        _applyDct = mfccs => _dct.DirectNorm(_melSpectrum, mfccs);
    }
    else
    {
        _applyDct = mfccs => _dct.Direct(_melSpectrum, mfccs);
    }

    // setup spectrum post-processing: =======================================================

    _logFloor = options.LogFloor;
    _nonLinearityType = options.NonLinearity;

    if (_nonLinearityType == NonLinearityType.Log10)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.LogE)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.ToDecibel)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.CubicRoot)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _melSpectrum, 0.33);
    }
    else
    {
        // no nonlinearity: just apply the filterbank
        _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _melSpectrum);
    }

    // setup spectrum computation: magnitude or power (the fallback), normalized or not

    _spectrumType = options.SpectrumType;

    var useMagnitude = _spectrumType == SpectrumType.Magnitude
                    || _spectrumType == SpectrumType.MagnitudeNormalized;

    var normalize = _spectrumType == SpectrumType.MagnitudeNormalized
                 || _spectrumType == SpectrumType.PowerNormalized;

    if (useMagnitude)
    {
        _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, normalize);
    }
    else
    {
        _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, normalize);
    }

    // reserve memory for reusable blocks
    _spectrum = new float[_blockSize / 2 + 1];
    _melSpectrum = new float[bandCount];
}
/// <summary>
/// <para>Computes PNCC vector in one frame according to [Kim and Stern, 2016].</para>
/// <para>
/// General algorithm:
/// <list type="number">
///     <item>Apply window (done by the base extractor before this call)</item>
///     <item>Obtain power spectrum</item>
///     <item>Apply gammatone filters (squared)</item>
///     <item>Medium-time processing (asymmetric noise suppression, temporal masking, spectral smoothing)</item>
///     <item>Apply nonlinearity</item>
///     <item>Do DCT-II (normalized)</item>
/// </list>
/// </para>
/// <para>
/// The method keeps state between calls (step counter, ring buffer of spectra, running envelopes and mean).
/// While the step counter is below 2*M the method returns without writing to <paramref name="features"/>.
/// </para>
/// </summary>
/// <param name="block">Block of data</param>
/// <param name="features">Features (one PNCC feature vector) computed in the block</param>
public override void ProcessFrame(float[] block, float[] features)
{
    const float MeanPower = 1e10f;
    const float Epsilon = 2.22e-16f;

    _step++;

    // 0) base extractor applies window

    // 1) calculate power spectrum
    _fft.PowerSpectrum(block, _spectrum, false);

    // 2) apply gammatone filterbank
    FilterBanks.Apply(FilterBank, _spectrum, _gammatoneSpectrum);

    // =============================================================
    // 3) medium-time processing blocks:

    // 3.1) temporal integration (zero-phase moving average filter)
    _ringBuffer.Add(_gammatoneSpectrum);

    var spectrumQ = _ringBuffer.AverageSpectrum;

    // first 2*M vectors are zeros: nothing is written to the output yet
    if (_step < 2 * M)
    {
        return;
    }

    // true exactly once: on the step where the running estimates get their initial values
    var seedEstimates = _step == 2 * M;

    // 3.2) asymmetric noise suppression
    for (var ch = 0; ch < _spectrumQOut.Length; ch++)
    {
        if (seedEstimates)
        {
            _spectrumQOut[ch] = spectrumQ[ch] * 0.9f;
        }

        // rising and falling values are tracked with different forgetting factors
        _spectrumQOut[ch] = spectrumQ[ch] > _spectrumQOut[ch]
            ? LambdaA * _spectrumQOut[ch] + (1 - LambdaA) * spectrumQ[ch]
            : LambdaB * _spectrumQOut[ch] + (1 - LambdaB) * spectrumQ[ch];
    }

    for (var ch = 0; ch < _filteredSpectrumQ.Length; ch++)
    {
        // part of the averaged spectrum that rises above the tracked floor (never negative)
        var excess = Math.Max(spectrumQ[ch] - _spectrumQOut[ch], 0.0f);
        _filteredSpectrumQ[ch] = excess;

        if (seedEstimates)
        {
            _avgSpectrumQ1[ch] = 0.9f * excess;
            _avgSpectrumQ2[ch] = excess;
        }

        _avgSpectrumQ1[ch] = excess > _avgSpectrumQ1[ch]
            ? LambdaA * _avgSpectrumQ1[ch] + (1 - LambdaA) * excess
            : LambdaB * _avgSpectrumQ1[ch] + (1 - LambdaB) * excess;

        // 3.3) temporal masking
        // (the unmasked value 'excess' serves as the threshold for the peak envelope below)

        _avgSpectrumQ2[ch] *= LambdaT;

        if (spectrumQ[ch] < C * _spectrumQOut[ch])
        {
            _filteredSpectrumQ[ch] = _avgSpectrumQ1[ch];
        }
        else if (excess <= _avgSpectrumQ2[ch])
        {
            _filteredSpectrumQ[ch] = MuT * _avgSpectrumQ2[ch];
        }

        _avgSpectrumQ2[ch] = Math.Max(_avgSpectrumQ2[ch], excess);

        _filteredSpectrumQ[ch] = Math.Max(_filteredSpectrumQ[ch], _avgSpectrumQ1[ch]);
    }

    // 3.4) spectral smoothing

    for (var ch = 0; ch < _spectrumS.Length; ch++)
    {
        _spectrumS[ch] = _filteredSpectrumQ[ch] / Math.Max(spectrumQ[ch], Epsilon);
    }

    // average each band with up to N neighbours on either side
    for (var ch = 0; ch < _smoothedSpectrumS.Length; ch++)
    {
        var first = Math.Max(ch - N, 0);
        var end = Math.Min(ch + N + 1, FilterBank.Length);

        var acc = 0.0f;
        var count = 0;
        for (var k = first; k < end; k++)
        {
            acc += _spectrumS[k];
            count++;
        }

        _smoothedSpectrumS[ch] = acc / count;
    }

    // 3.5) mean power normalization

    var centralSpectrum = _ringBuffer.CentralSpectrum;

    var framePower = 0.0f;
    for (var ch = 0; ch < _smoothedSpectrum.Length; ch++)
    {
        var weighted = _smoothedSpectrumS[ch] * centralSpectrum[ch];
        _smoothedSpectrum[ch] = weighted;
        framePower += weighted;
    }

    _mean = LambdaMu * _mean + (1 - LambdaMu) * framePower;

    for (var ch = 0; ch < _smoothedSpectrum.Length; ch++)
    {
        // divide first, then scale: same order of operations as two separate steps
        _smoothedSpectrum[ch] = _smoothedSpectrum[ch] / _mean * MeanPower;
    }

    // =============================================================

    // 4) nonlinearity (power ^ d or Log)
    if (_power == 0)
    {
        for (var ch = 0; ch < _smoothedSpectrum.Length; ch++)
        {
            _smoothedSpectrum[ch] = (float)Math.Log(_smoothedSpectrum[ch] + Epsilon);
        }
    }
    else
    {
        var exponent = 1.0 / _power;
        for (var ch = 0; ch < _smoothedSpectrum.Length; ch++)
        {
            _smoothedSpectrum[ch] = (float)Math.Pow(_smoothedSpectrum[ch], exponent);
        }
    }

    // 5) DCT-II (Norm = normalized)
    _dct.DirectNorm(_smoothedSpectrum, features);

    // 6) (optional) replace first coeff with log(energy)
    if (_includeEnergy)
    {
        var energy = block.Sum(s => s * s);
        features[0] = (float)Math.Log(Math.Max(energy, _logEnergyFloor));
    }

    // keep the counter from overflowing; any value above 2*M behaves the same
    if (_step == int.MaxValue - 1)
    {
        _step = 2 * M + 1;
    }
}
/// <summary>
/// Standard method for computing PLP features.
/// In each frame do:
///
/// 1) Apply window
/// 2) Obtain power spectrum
/// 3) Apply filterbank of bark bands (or mel bands)
/// 4) [Optional] filter each component of the processed spectrum with a RASTA filter
/// 5) Apply equal loudness curve
/// 6) Take cubic root
/// 7) Do LPC
/// 8) Convert LPC to cepstrum
/// 9) [Optional] lifter cepstrum
///
/// </summary>
/// <param name="block">Samples for analysis; the array is modified in place</param>
/// <returns>PLP vector (newly allocated array of FeatureCount coefficients)</returns>
public override float[] ProcessFrame(float[] block)
{
    // fill zeros to fftSize if frameSize < fftSize (blockSize)
    for (var pos = FrameSize; pos < block.Length; pos++)
    {
        block[pos] = 0;
    }

    // 1) apply window
    block.ApplyWindow(_windowSamples);

    // 2) calculate power spectrum (without normalization)
    _fft.PowerSpectrum(block, _spectrum, false);

    // 3) apply filterbank on the result (bark frequencies by default)
    FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum);

    var bandCount = _bandSpectrum.Length;

    // 4) RASTA filtering in log-domain [optional]:
    //    each band has its own filter; the band goes to the log domain, through the filter, and back
    if (_rasta > 0)
    {
        for (var band = 0; band < bandCount; band++)
        {
            var logPower = (float)Math.Log(_bandSpectrum[band] + float.Epsilon);
            float filtered = _rastaFilters[band].Process(logPower);
            _bandSpectrum[band] = (float)Math.Exp(filtered);
        }
    }

    // 5) and 6) apply equal loudness curve and take cubic root
    //           (band values are floored at 1.0 before weighting)
    for (var band = 0; band < bandCount; band++)
    {
        _bandSpectrum[band] = (float)Math.Pow(Math.Max(_bandSpectrum[band], 1.0) * _equalLoudnessCurve[band], 0.33);
    }

    // 7) LPC from power spectrum:

    var points = _idftTable[0].Length;

    // get autocorrelation samples from post-processed power spectrum (via IDFT):
    for (var lag = 0; lag < _idftTable.Length; lag++)
    {
        var row = _idftTable[lag];

        // add values at two duplicated edges right away
        var sum = row[0] * _bandSpectrum[0] + row[points - 1] * _bandSpectrum[points - 3];

        // inner points: table column p pairs with band p - 1
        for (var p = 1; p < points - 1; p++)
        {
            sum += row[p] * _bandSpectrum[p - 1];
        }

        _cc[lag] = sum / (2 * (points - 1));
    }

    // LPC: start from a zeroed coefficient buffer
    for (var k = 0; k < _lpc.Length; k++)
    {
        _lpc[k] = 0;
    }

    var err = Lpc.LevinsonDurbin(_cc, _lpc, _lpcOrder);

    // 8) compute LPCC coefficients from LPC
    var cepstrum = new float[FeatureCount];
    Lpc.ToCepstrum(_lpc, err, cepstrum);

    // 9) (optional) liftering
    if (_lifterCoeffs != null)
    {
        cepstrum.ApplyWindow(_lifterCoeffs);
    }

    return cepstrum;
}