/// <summary>
/// Standard method for computing MFCC features:
///     0) [Optional] pre-emphasis
///
/// The signal is decomposed into frames of frameSize samples taken every hopSize samples;
/// each frame is zero-padded to fftSize. In each frame do:
///
///     1) Apply window (nothing is done for a rectangular window)
///     2) Obtain power spectrum X
///     3) Apply mel filters and take the logarithm of the result
///     4) Do DCT-II of the log-mel spectrum
///     5) [Optional] liftering of the coefficients
///
/// Side effect: the mel filter bank field is rebuilt on every call.
/// </summary>
/// <param name="signal">Signal for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of mfcc vectors</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ====================================== PREPARE =======================================

    // HopSize / FrameSize are multiplied by the sampling rate to obtain sample counts
    var hopSize = (int)(signal.SamplingRate * HopSize);
    var frameSize = (int)(signal.SamplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    // keep the configured FFT size unless the frame does not fit into it
    var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize);

    _melFilterBank = FilterBanks.Triangular(fftSize, signal.SamplingRate,
                         FilterBanks.MelBands(_filterbankSize, fftSize, signal.SamplingRate, _lowFreq, _highFreq));

    var lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null;

    var fft = new Fft(fftSize);
    var dct = new Dct2(_filterbankSize, FeatureCount);

    // reserve memory for reusable blocks
    var spectrum = new float[fftSize / 2 + 1];
    var logMelSpectrum = new float[_filterbankSize];
    var block = new float[fftSize];       // buffer for currently processed signal block at each step
    var zeroblock = new float[fftSize];   // just a buffer of zeros for quick memset

    // ================================= MAIN PROCESSING ==================================

    var featureVectors = new List<FeatureVector>();

    // sample preceding the first frame (needed by the pre-emphasis difference)
    var prevSample = startSample > 0 ? signal[startSample - 1] : 0.0f;

    var i = startSample;
    while (i + frameSize < endSample)
    {
        // prepare next block for processing
        zeroblock.FastCopyTo(block, zeroblock.Length);
        signal.Samples.FastCopyTo(block, windowSamples.Length, i);

        // 0) pre-emphasis (if needed)
        if (_preEmphasis > 0.0)
        {
            for (var k = 0; k < frameSize; k++)
            {
                var y = block[k] - prevSample * _preEmphasis;
                prevSample = block[k];
                block[k] = y;
            }

            // The next frame starts at i + hopSize, so its predecessor is at i + hopSize - 1.
            // When hopSize exceeds frameSize this position may lie beyond the signal;
            // in that case there is no next frame and the value is not needed.
            var nextPrevPos = i + hopSize - 1;
            if (nextPrevPos < signal.Samples.Length)
            {
                prevSample = signal[nextPrevPos];
            }
        }

        // 1) apply window
        if (_window != WindowTypes.Rectangular)
        {
            block.ApplyWindow(windowSamples);
        }

        // 2) calculate power spectrum
        fft.PowerSpectrum(block, spectrum);

        // 3) apply mel filterbank and take log() of the result
        FilterBanks.ApplyAndLog(_melFilterBank, spectrum, logMelSpectrum);

        // 4) dct-II (a fresh array per frame, since it is stored in the result)
        var mfccs = new float[FeatureCount];
        dct.Direct(logMelSpectrum, mfccs);

        // 5) (optional) liftering
        if (lifterCoeffs != null)
        {
            mfccs.ApplyWindow(lifterCoeffs);
        }

        // add mfcc vector to output sequence
        featureVectors.Add(new FeatureVector
        {
            Features = mfccs,
            TimePosition = (double)i / signal.SamplingRate
        });

        i += hopSize;
    }

    return featureVectors;
}
/// <summary>
/// S(implified)PNCC algorithm according to [Kim and Stern, 2016].
/// For the given frame:
///
///     1) Apply window
///     2) Obtain power spectrum
///     3) Apply the filter bank
///     4) Mean power normalization (updates the running mean kept between calls)
///     5) Apply nonlinearity (power law, or Log10 when the power is zero)
///     6) Do normalized DCT-II
///
/// </summary>
/// <param name="block">Samples for analysis; the buffer is modified in place</param>
/// <returns>Array of FeatureCount coefficients</returns>
public override float[] ProcessFrame(float[] block)
{
    const float meanPower = 1e10f;

    // everything past the frame is zeroed (padding up to the block length)
    var pos = FrameSize;
    while (pos < block.Length)
    {
        block[pos] = 0;
        pos++;
    }

    // 1) windowing
    block.ApplyWindow(_windowSamples);

    // 2) power spectrum
    _fft.PowerSpectrum(block, _spectrum, false);

    // 3) filter bank
    FilterBanks.Apply(FilterBank, _spectrum, _filteredSpectrum);

    var bandCount = _filteredSpectrum.Length;

    // 4) mean power normalization: total band power drives the running mean
    var sumPower = 0.0f;
    foreach (var bandPower in _filteredSpectrum)
    {
        sumPower += bandPower;
    }

    _mean = LambdaMu * _mean + (1 - LambdaMu) * sumPower;

    for (var band = 0; band < bandCount; band++)
    {
        _filteredSpectrum[band] *= meanPower / _mean;
    }

    // 5) nonlinearity
    if (_power == 0)
    {
        // logarithmic compression
        for (var band = 0; band < bandCount; band++)
        {
            _filteredSpectrum[band] = (float)Math.Log10(_filteredSpectrum[band] + float.Epsilon);
        }
    }
    else
    {
        // power-law compression
        for (var band = 0; band < bandCount; band++)
        {
            _filteredSpectrum[band] = (float)Math.Pow(_filteredSpectrum[band], 1.0 / _power);
        }
    }

    // 6) normalized DCT-II into a freshly allocated result
    var spnccs = new float[FeatureCount];
    _dct.DirectNorm(_filteredSpectrum, spnccs);

    return spnccs;
}
/// <summary>
/// Standard method for computing PLP features.
/// For the given frame:
///
///     1) Apply window
///     2) Obtain power spectrum
///     3) Apply the filter bank
///     4) [Optional] filter each band with its RASTA filter (in log domain)
///     5) Apply equal loudness curve
///     6) Take (approximately) cubic root: power of 0.33
///     7) Do LPC (autocorrelation is obtained from the band spectrum via an IDFT table)
///     8) Convert LPC to cepstrum
///     9) [Optional] lifter cepstrum
///
/// </summary>
/// <param name="block">Samples for analysis; the buffer is modified in place</param>
/// <returns>PLP vector of FeatureCount coefficients</returns>
public override float[] ProcessFrame(float[] block)
{
    // zero the part of the block that lies beyond the frame
    for (var pos = FrameSize; pos < block.Length; pos++)
    {
        block[pos] = 0;
    }

    // 1) windowing
    block.ApplyWindow(_windowSamples);

    // 2) power spectrum (without normalization)
    _fft.PowerSpectrum(block, _spectrum, false);

    // 3) filter bank
    FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum);

    var bandCount = _bandSpectrum.Length;

    // 4) optional RASTA filtering: log -> filter -> exp
    if (_rasta > 0)
    {
        for (var band = 0; band < bandCount; band++)
        {
            var logPower = (float)Math.Log(_bandSpectrum[band] + float.Epsilon);
            float filtered = _rastaFilters[band].Process(logPower);
            _bandSpectrum[band] = (float)Math.Exp(filtered);
        }
    }

    // 5) + 6) equal loudness weighting followed by compression;
    //         band values are floored at 1.0 before weighting
    for (var band = 0; band < bandCount; band++)
    {
        var floored = Math.Max(_bandSpectrum[band], 1.0);
        _bandSpectrum[band] = (float)Math.Pow(floored * _equalLoudnessCurve[band], 0.33);
    }

    // 7) autocorrelation from the post-processed spectrum via the precomputed table.
    //    Each table row has two more entries than there are bands:
    //    the first and the last band are used twice (duplicated edges).
    var pointCount = _idftTable[0].Length;
    var lastBand = pointCount - 3;
    var divisor = 2 * (pointCount - 1);

    for (var lag = 0; lag < _idftTable.Length; lag++)
    {
        var row = _idftTable[lag];

        // both duplicated edges first ...
        var sum = row[0] * _bandSpectrum[0] + row[pointCount - 1] * _bandSpectrum[lastBand];

        // ... then the interior points
        for (var j = 1; j < pointCount - 1; j++)
        {
            sum += row[j] * _bandSpectrum[j - 1];
        }

        _cc[lag] = sum / divisor;
    }

    // LPC coefficients are recomputed from scratch for every frame
    Array.Clear(_lpc, 0, _lpc.Length);

    var err = Lpc.LevinsonDurbin(_cc, _lpc, _lpcOrder);

    // 8) LPC -> cepstrum
    var lpcc = new float[FeatureCount];
    Lpc.ToCepstrum(_lpc, err, lpcc);

    // 9) optional liftering
    if (_lifterCoeffs != null)
    {
        lpcc.ApplyWindow(_lifterCoeffs);
    }

    return lpcc;
}
/// <summary>
/// S(implified)PNCC algorithm according to [Kim and Stern, 2016]:
///     0) [Optional] pre-emphasis
///
/// The samples are decomposed into frames of FrameSize samples taken every HopSize samples.
/// In each frame do:
///
///     1) Apply window (nothing is done for a rectangular window)
///     2) Obtain power spectrum
///     3) Apply the filter bank
///     4) Mean power normalization
///     5) Apply nonlinearity (power law, or Log10 when the power is zero)
///     6) Do normalized DCT-II
///
/// </summary>
/// <param name="samples">Samples for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of pncc vectors</returns>
public override List<FeatureVector> ComputeFrom(float[] samples, int startSample, int endSample)
{
    Guard.AgainstInvalidRange(startSample, endSample, "starting pos", "ending pos");

    var frameSize = FrameSize;
    var hopSize = HopSize;

    const float meanPower = 1e10f;  // numerator of the normalization gain (meanPower / mean)
    var mean = 4e07f;               // initial value of the running mean of the frame power

    // exponent of the power-law nonlinearity (unused when _power == 0)
    var d = _power != 0 ? 1.0 / _power : 0.0;

    var featureVectors = new List<FeatureVector>();

    // sample preceding the first frame (needed by the pre-emphasis difference)
    var prevSample = startSample > 0 ? samples[startSample - 1] : 0.0f;

    var i = startSample;
    while (i + FrameSize < endSample)
    {
        // prepare next block for processing
        _zeroblock.FastCopyTo(_block, _zeroblock.Length);
        samples.FastCopyTo(_block, frameSize, i);

        // 0) pre-emphasis (if needed)
        if (_preEmphasis > 0.0)
        {
            for (var k = 0; k < frameSize; k++)
            {
                var y = _block[k] - prevSample * _preEmphasis;
                prevSample = _block[k];
                _block[k] = y;
            }

            // The next frame starts at i + hopSize, so its predecessor is at i + hopSize - 1.
            // When hopSize exceeds frameSize this position may lie beyond the array;
            // in that case there is no next frame and the value is not needed.
            var nextPrevPos = i + hopSize - 1;
            if (nextPrevPos < samples.Length)
            {
                prevSample = samples[nextPrevPos];
            }
        }

        // 1) apply window
        if (_window != WindowTypes.Rectangular)
        {
            _block.ApplyWindow(_windowSamples);
        }

        // 2) calculate power spectrum
        _fft.PowerSpectrum(_block, _spectrum);

        // 3) apply filterbank
        FilterBanks.Apply(FilterBank, _spectrum, _filteredSpectrum);

        // 4) mean power normalization:
        var sumPower = 0.0f;
        for (var j = 0; j < _filteredSpectrum.Length; j++)
        {
            sumPower += _filteredSpectrum[j];
        }

        mean = LambdaMu * mean + (1 - LambdaMu) * sumPower;

        for (var j = 0; j < _filteredSpectrum.Length; j++)
        {
            _filteredSpectrum[j] *= meanPower / mean;
        }

        // 5) nonlinearity (power ^ d or Log10)
        if (_power != 0)
        {
            for (var j = 0; j < _filteredSpectrum.Length; j++)
            {
                _filteredSpectrum[j] = (float)Math.Pow(_filteredSpectrum[j], d);
            }
        }
        else
        {
            for (var j = 0; j < _filteredSpectrum.Length; j++)
            {
                _filteredSpectrum[j] = (float)Math.Log10(_filteredSpectrum[j] + float.Epsilon);
            }
        }

        // 6) dct-II (normalized); a fresh array per frame, since it is stored in the result
        var spnccs = new float[FeatureCount];
        _dct.DirectN(_filteredSpectrum, spnccs);

        // add pncc vector to output sequence
        featureVectors.Add(new FeatureVector
        {
            Features = spnccs,
            TimePosition = (double)i / SamplingRate
        });

        i += hopSize;
    }

    return featureVectors;
}
/// <summary>
/// Builds a filter bank from the values entered in the form, computes MFCC vectors
/// of the current signal with it and shows the results in the list and the panels.
/// </summary>
/// <param name="sender">Event source (not used)</param>
/// <param name="e">Event data (not used)</param>
private void buttonCompute_Click(object sender, EventArgs e)
{
    var filterCount = int.Parse(textBoxSize.Text);
    var samplingRate = _signal.SamplingRate;
    var fftSize = int.Parse(textBoxFftSize.Text);
    var lowFreq = float.Parse(textBoxLowFreq.Text);
    var highFreq = float.Parse(textBoxHighFreq.Text);

    Tuple<double, double, double>[] bands;

    float[][] filterbank = null;
    VtlnWarper vtln = null;

    // optional VTLN warping of the filter bank frequencies
    if (checkBoxVtln.Checked)
    {
        var alpha = float.Parse(textBoxVtlnAlpha.Text);
        var vtlnLow = float.Parse(textBoxVtlnLow.Text);
        var vtlnHigh = float.Parse(textBoxVtlnHigh.Text);

        vtln = new VtlnWarper(alpha, lowFreq, highFreq, vtlnLow, vtlnHigh);
    }

    // Some choices produce only band edges (the filter shape is selected below),
    // the Slaney variants produce bands and a ready filter bank,
    // ERB produces only a ready filter bank.
    switch (comboBoxFilterbank.Text)
    {
        case "Mel":
            bands = FilterBanks.MelBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            break;
        case "Mel Slaney":
            bands = FilterBanks.MelBandsSlaney(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            filterbank = FilterBanks.MelBankSlaney(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxNormalize.Checked, vtln);
            break;
        case "Bark":
            bands = FilterBanks.BarkBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            break;
        case "Bark Slaney":
            bands = FilterBanks.BarkBandsSlaney(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            filterbank = FilterBanks.BarkBankSlaney(filterCount, fftSize, samplingRate, lowFreq, highFreq);
            break;
        case "Critical bands":
            bands = FilterBanks.CriticalBands(filterCount, fftSize, samplingRate, lowFreq, highFreq);
            break;
        case "Octave bands":
            bands = FilterBanks.OctaveBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            break;
        case "ERB":
            bands = null;
            filterbank = FilterBanks.Erb(filterCount, fftSize, samplingRate, lowFreq, highFreq);
            break;
        default:
            bands = FilterBanks.HerzBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            break;
    }

    // the filter shape matters only if no ready filter bank was produced above
    if (bands != null && filterbank == null)
    {
        switch (comboBoxShape.Text)
        {
            case "Triangular":
                filterbank = FilterBanks.Triangular(fftSize, samplingRate, bands, vtln, Utils.Scale.HerzToMel);
                break;
            case "Trapezoidal":
                filterbank = FilterBanks.Trapezoidal(fftSize, samplingRate, bands, vtln);
                break;
            case "BiQuad":
                filterbank = FilterBanks.BiQuad(fftSize, samplingRate, bands);
                break;
            default:
                filterbank = FilterBanks.Rectangular(fftSize, samplingRate, bands, vtln);
                break;
        }

        if (checkBoxNormalize.Checked)
        {
            FilterBanks.Normalize(filterCount, bands, filterbank);
        }
    }

    var spectrumType = (SpectrumType)comboBoxSpectrum.SelectedIndex;
    var nonLinearity = (NonLinearityType)comboBoxNonLinearity.SelectedIndex;
    var logFloor = float.Parse(textBoxLogFloor.Text);

    // 13 coefficients, frame of 512 samples (expressed in seconds), hop of 0.01.
    // Other extractor parameters (filterbankSize, highFreq, preEmphasis, lifterSize, includeEnergy)
    // are deliberately left at their defaults.
    var mfccExtractor = new MfccExtractor(samplingRate, 13, 512.0 / samplingRate, 0.01,
                                          filterbank: filterbank,
                                          spectrumType: spectrumType,
                                          nonLinearity: nonLinearity,
                                          dctType: comboBoxDct.Text,
                                          window: WindowTypes.Hamming,
                                          logFloor: logFloor);

    _mfccVectors = mfccExtractor.ComputeFrom(_signal);

    // Optional post-processing is switched off here
    // (mean normalization is optional for MFCC but REQUIRED for PNCC; deltas).

    var header = mfccExtractor.FeatureDescriptions;

    FillFeaturesList(_mfccVectors, header);

    // A signal shorter than one frame yields no vectors;
    // in that case there is nothing to select or plot.
    var hasVectors = _mfccVectors.Count > 0 && mfccListView.Items.Count > 0;

    if (hasVectors)
    {
        mfccListView.Items[0].Selected = true;
    }

    melFilterBankPanel.Groups = mfccExtractor.FilterBank;

    if (hasVectors)
    {
        mfccPanel.Line = _mfccVectors[0].Features;
    }
}
/// <summary>
/// Method for computing modulation spectra.
/// Each vector representing one modulation spectrum is a flattened version of 2D spectrum
/// (power spectra of all band envelopes placed one after another).
///
/// If no featuregram was provided, the band envelopes are first computed from the samples
/// (pre-emphasis, windowing, power spectrum, filter bank); otherwise the featuregram is
/// transposed and used as the set of envelopes.
/// </summary>
/// <param name="samples">Samples for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of flattened modulation spectra</returns>
public override List<FeatureVector> ComputeFrom(float[] samples, int startSample, int endSample)
{
    Guard.AgainstInvalidRange(startSample, endSample, "starting pos", "ending pos");

    var frameSize = FrameSize;
    var hopSize = HopSize;

    var featureVectors = new List<FeatureVector>();

    var en = 0;             // number of envelope points actually filled
    var i = startSample;

    if (_featuregram == null)
    {
        _envelopes = new float[_filterbank.Length][];
        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[samples.Length / hopSize];
        }

        var prevSample = startSample > 0 ? samples[startSample - 1] : 0.0f;

        // the bound also covers samples[i + hopSize - 1] read below when hopSize > frameSize
        var lastSample = endSample - Math.Max(frameSize, hopSize);

        // ===================== compute local FFTs (do STFT) =======================

        for (i = startSample; i < lastSample; i += hopSize)
        {
            _zeroblock.FastCopyTo(_block, _zeroblock.Length);
            samples.FastCopyTo(_block, frameSize, i);

            // 0) pre-emphasis (if needed)
            if (_preEmphasis > 1e-10)
            {
                for (var k = 0; k < frameSize; k++)
                {
                    var y = _block[k] - prevSample * _preEmphasis;
                    prevSample = _block[k];
                    _block[k] = y;
                }
                prevSample = samples[i + hopSize - 1];
            }

            // 1) apply window
            if (_window != WindowTypes.Rectangular)
            {
                _block.ApplyWindow(_windowSamples);
            }

            // 2) calculate power spectrum
            _fft.PowerSpectrum(_block, _spectrum);

            // 3) apply filterbank...
            FilterBanks.Apply(_filterbank, _spectrum, _filteredSpectrum);

            // ...and save results for future calculations
            for (var n = 0; n < _envelopes.Length; n++)
            {
                _envelopes[n][en] = _filteredSpectrum[n];
            }
            en++;
        }
    }
    else
    {
        // transpose the featuregram: one envelope per feature
        en = _featuregram.Length;
        _envelopes = new float[_featuregram[0].Length][];

        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[en];
            for (i = 0; i < en; i++)
            {
                _envelopes[n][i] = _featuregram[i][n];
            }
        }
    }

    // =========================== modulation analysis =======================

    var envelopeLength = en;

    // long-term AVG-normalization: divide each envelope by its mean absolute value
    foreach (var envelope in _envelopes)
    {
        var avg = 0.0f;
        for (var k = 0; k < envelopeLength; k++)
        {
            // the sign test must be applied to the value, not to the index:
            // featuregram-based envelopes may contain negative values
            avg += Math.Abs(envelope[k]);
        }
        avg /= envelopeLength;

        if (avg >= 1e-10)   // this happens more frequently
        {
            for (var k = 0; k < envelopeLength; k++)
            {
                envelope[k] /= avg;
            }
        }
    }

    i = 0;
    while (i < envelopeLength)
    {
        var vector = new float[_envelopes.Length * (_modulationFftSize / 2 + 1)];
        var offset = 0;

        foreach (var envelope in _envelopes)
        {
            // zero-padded piece of the envelope starting at position i
            _zeroModblock.FastCopyTo(_modBlock, _modulationFftSize);
            envelope.FastCopyTo(_modBlock, Math.Min(_modulationFftSize, envelopeLength - i), i);

            _modulationFft.PowerSpectrum(_modBlock, _modSpectrum);
            _modSpectrum.FastCopyTo(vector, _modSpectrum.Length, 0, offset);

            offset += _modSpectrum.Length;
        }

        featureVectors.Add(new FeatureVector
        {
            Features = vector,
            TimePosition = (double)i * hopSize / SamplingRate
        });

        i += _modulationHopSize;
    }

    return featureVectors;
}
/// <summary>
/// PNCC algorithm according to [Kim and Stern, 2016]:
///     0) [Optional] pre-emphasis
///
/// The signal is decomposed into frames of frameSize samples taken every hopSize samples;
/// each frame is zero-padded to fftSize. In each frame do:
///
///     1) Apply window (nothing is done for a rectangular window)
///     2) Obtain power spectrum
///     3) Apply gammatone filters (squared)
///     4) Medium-time processing (asymmetric noise suppression, temporal masking, spectral smoothing)
///     5) Apply nonlinearity (power law, or Log10 when the power is zero)
///     6) Do normalized DCT-II
///
/// No vectors are produced for the first 2*M frames: they only fill the ring buffer
/// used for temporal integration.
/// Side effects: the gammatone filter bank and the ring buffer fields are rebuilt on every call.
/// </summary>
/// <param name="signal">Signal for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of pncc vectors</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ====================================== PREPARE =======================================

    var hopSize = (int)(signal.SamplingRate * HopSize);
    var frameSize = (int)(signal.SamplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    // keep the configured FFT size unless the frame does not fit into it
    var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize);

    // The filter bank must be built for the FFT size that is actually used below
    // (the local fftSize), otherwise its length would not match the spectrum
    // whenever the configured size is smaller than the frame.
    _gammatoneFilterBank = FilterBanks.Erb(_filterbankSize, fftSize, signal.SamplingRate, _lowFreq, _highFreq);

    // use power spectrum: square the filter weights in place
    foreach (var filter in _gammatoneFilterBank)
    {
        for (var j = 0; j < filter.Length; j++)
        {
            var ps = filter[j] * filter[j];
            filter[j] = ps;
        }
    }

    var fft = new Fft(fftSize);
    var dct = new Dct2(_filterbankSize, FeatureCount);

    // reusable per-band buffers
    var gammatoneSpectrum = new float[_filterbankSize];
    var spectrumQOut = new float[_filterbankSize];
    var filteredSpectrumQ = new float[_filterbankSize];
    var spectrumS = new float[_filterbankSize];
    var smoothedSpectrumS = new float[_filterbankSize];
    var avgSpectrumQ1 = new float[_filterbankSize];
    var avgSpectrumQ2 = new float[_filterbankSize];
    var smoothedSpectrum = new float[_filterbankSize];

    const float meanPower = 1e10f;  // numerator of the normalization gain (meanPower / mean)
    var mean = 4e07f;               // initial value of the running mean of the frame power

    // exponent of the power-law nonlinearity (unused when _power == 0)
    var d = _power != 0 ? 1.0 / _power : 0.0;

    var block = new float[fftSize];       // buffer for currently processed signal block at each step
    var zeroblock = new float[fftSize];   // buffer of zeros for quick memset

    _ringBuffer = new SpectraRingBuffer(2 * M + 1, _filterbankSize);

    var spectrum = new float[fftSize / 2 + 1];

    // 0) pre-emphasis (if needed); works on the filter's output, not on the caller's signal
    if (_preEmphasis > 0.0)
    {
        var preemphasisFilter = new PreEmphasisFilter(_preEmphasis);
        signal = preemphasisFilter.ApplyTo(signal);
    }

    // ================================= MAIN PROCESSING ==================================

    var featureVectors = new List<FeatureVector>();

    var i = 0;                      // frame counter
    var timePos = startSample;      // position of the current frame in samples

    while (timePos + frameSize < endSample)
    {
        // prepare next block for processing
        zeroblock.FastCopyTo(block, zeroblock.Length);
        signal.Samples.FastCopyTo(block, frameSize, timePos);

        // 1) apply window
        if (_window != WindowTypes.Rectangular)
        {
            block.ApplyWindow(windowSamples);
        }

        // 2) calculate power spectrum
        fft.PowerSpectrum(block, spectrum);

        // 3) apply gammatone filterbank
        FilterBanks.Apply(_gammatoneFilterBank, spectrum, gammatoneSpectrum);

        // =============================================================
        // 4) medium-time processing blocks:

        // 4.1) temporal integration (zero-phase moving average filter)
        _ringBuffer.Add(gammatoneSpectrum);
        var spectrumQ = _ringBuffer.AverageSpectrum;

        // 4.2) asymmetric noise suppression

        // initialization at the first frame that is processed completely
        if (i == 2 * M)
        {
            for (var j = 0; j < spectrumQOut.Length; j++)
            {
                spectrumQOut[j] = spectrumQ[j] * 0.9f;
            }
        }

        if (i >= 2 * M)
        {
            for (var j = 0; j < spectrumQOut.Length; j++)
            {
                if (spectrumQ[j] > spectrumQOut[j])
                {
                    spectrumQOut[j] = LambdaA * spectrumQOut[j] + (1 - LambdaA) * spectrumQ[j];
                }
                else
                {
                    spectrumQOut[j] = LambdaB * spectrumQOut[j] + (1 - LambdaB) * spectrumQ[j];
                }
            }

            for (var j = 0; j < filteredSpectrumQ.Length; j++)
            {
                filteredSpectrumQ[j] = Math.Max(spectrumQ[j] - spectrumQOut[j], 0.0f);

                if (i == 2 * M)
                {
                    avgSpectrumQ1[j] = 0.9f * filteredSpectrumQ[j];
                    avgSpectrumQ2[j] = filteredSpectrumQ[j];
                }

                if (filteredSpectrumQ[j] > avgSpectrumQ1[j])
                {
                    avgSpectrumQ1[j] = LambdaA * avgSpectrumQ1[j] + (1 - LambdaA) * filteredSpectrumQ[j];
                }
                else
                {
                    avgSpectrumQ1[j] = LambdaB * avgSpectrumQ1[j] + (1 - LambdaB) * filteredSpectrumQ[j];
                }

                // 4.3) temporal masking
                var threshold = filteredSpectrumQ[j];

                avgSpectrumQ2[j] *= LambdaT;
                if (spectrumQ[j] < C * spectrumQOut[j])
                {
                    filteredSpectrumQ[j] = avgSpectrumQ1[j];
                }
                else
                {
                    if (filteredSpectrumQ[j] <= avgSpectrumQ2[j])
                    {
                        filteredSpectrumQ[j] = MuT * avgSpectrumQ2[j];
                    }
                }
                avgSpectrumQ2[j] = Math.Max(avgSpectrumQ2[j], threshold);

                filteredSpectrumQ[j] = Math.Max(filteredSpectrumQ[j], avgSpectrumQ1[j]);
            }

            // 4.4) spectral smoothing
            for (var j = 0; j < spectrumS.Length; j++)
            {
                spectrumS[j] = filteredSpectrumQ[j] / Math.Max(spectrumQ[j], float.Epsilon);
            }

            // average over up to 2*N + 1 neighbouring bands (fewer at the edges)
            for (var j = 0; j < smoothedSpectrumS.Length; j++)
            {
                smoothedSpectrumS[j] = 0.0f;

                var total = 0;
                for (var k = Math.Max(j - N, 0); k < Math.Min(j + N + 1, _filterbankSize); k++, total++)
                {
                    smoothedSpectrumS[j] += spectrumS[k];
                }
                smoothedSpectrumS[j] /= total;
            }

            // 4.5) mean power normalization
            var centralSpectrum = _ringBuffer.CentralSpectrum;

            var sumPower = 0.0f;
            for (var j = 0; j < smoothedSpectrum.Length; j++)
            {
                smoothedSpectrum[j] = smoothedSpectrumS[j] * centralSpectrum[j];
                sumPower += smoothedSpectrum[j];
            }

            mean = LambdaMu * mean + (1 - LambdaMu) * sumPower;

            for (var j = 0; j < smoothedSpectrum.Length; j++)
            {
                smoothedSpectrum[j] *= meanPower / mean;
            }

            // =============================================================

            // 5) nonlinearity (power ^ d or Log10)
            if (_power != 0)
            {
                for (var j = 0; j < smoothedSpectrum.Length; j++)
                {
                    smoothedSpectrum[j] = (float)Math.Pow(smoothedSpectrum[j], d);
                }
            }
            else
            {
                for (var j = 0; j < smoothedSpectrum.Length; j++)
                {
                    smoothedSpectrum[j] = (float)Math.Log10(smoothedSpectrum[j] + float.Epsilon);
                }
            }

            // 6) dct-II (normalized); a fresh array per frame, since it is stored in the result
            var pnccs = new float[FeatureCount];
            dct.DirectN(smoothedSpectrum, pnccs);

            // add pncc vector to output sequence
            featureVectors.Add(new FeatureVector
            {
                Features = pnccs,
                TimePosition = (double)timePos / signal.SamplingRate
            });
        }

        i++;

        timePos += hopSize;
    }

    return featureVectors;
}
/// <summary>
/// Creates the extractor from the given options.
/// If the options contain no filter bank, a rectangular mel filter bank is built;
/// otherwise the given filter bank is used and the FFT size is derived from its width.
/// </summary>
/// <param name="options">Filterbank options</param>
public FilterbankExtractor(FilterbankOptions options) : base(options)
{
    var bandCount = options.FilterBankSize;

    if (options.FilterBank != null)
    {
        // user-defined filter bank: each filter has (fftSize / 2 + 1) weights
        FilterBank = options.FilterBank;
        bandCount = FilterBank.Length;
        _blockSize = 2 * (FilterBank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");
    }
    else
    {
        // the FFT size from the options is used only if it exceeds the frame size
        if (options.FftSize > FrameSize)
        {
            _blockSize = options.FftSize;
        }
        else
        {
            _blockSize = MathUtils.NextPowerOfTwo(FrameSize);
        }

        var melBands = FilterBanks.MelBands(bandCount, SamplingRate, options.LowFrequency, options.HighFrequency, false);

        FilterBank = FilterBanks.Rectangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel);
    }

    FeatureCount = bandCount;

    _fft = new RealFft(_blockSize);

    // buffers reused for every frame
    _spectrum = new float[_blockSize / 2 + 1];
    _bandSpectrum = new float[bandCount];

    // how the band spectrum is obtained from the spectrum
    _logFloor = options.LogFloor;
    _nonLinearityType = options.NonLinearity;

    if (_nonLinearityType == NonLinearityType.Log10)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.LogE)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.ToDecibel)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.CubicRoot)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _bandSpectrum, 0.33);
    }
    else
    {
        _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum);
    }

    // how the spectrum is obtained from the frame:
    // magnitude or power (power for any other type), normalized or not
    _spectrumType = options.SpectrumType;

    var useMagnitude = _spectrumType == SpectrumType.Magnitude ||
                       _spectrumType == SpectrumType.MagnitudeNormalized;

    var normalize = _spectrumType == SpectrumType.MagnitudeNormalized ||
                    _spectrumType == SpectrumType.PowerNormalized;

    if (useMagnitude)
    {
        _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, normalize);
    }
    else
    {
        _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, normalize);
    }
}
/// <summary>
/// Standard method for computing PLP features.
/// For the given frame:
///
///     1) Obtain power spectrum
///     2) Apply the filter bank
///     3) [Optional] filter each band with its RASTA filter (in log domain)
///     4) Apply equal loudness curve
///     5) Take (approximately) cubic root: power of 0.33
///     6) Do LPC (autocorrelation is obtained from the band spectrum via an IDFT table)
///     7) Convert LPC to cepstrum
///     8) [Optional] lifter cepstrum
///     9) [Optional] replace the first coefficient with log-energy of the frame
///
/// No windowing is done here (according to the original note, the base extractor does it).
/// </summary>
/// <param name="block">Samples for analysis</param>
/// <param name="features">Output array that receives the PLP vector</param>
public override void ProcessFrame(float[] block, float[] features)
{
    // 1) power spectrum (without normalization)
    _fft.PowerSpectrum(block, _spectrum, false);

    // 2) filter bank
    FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum);

    var bandCount = _bandSpectrum.Length;

    // 3) optional RASTA filtering: log -> filter -> exp
    if (_rasta > 0)
    {
        for (var band = 0; band < bandCount; band++)
        {
            var logPower = (float)Math.Log(_bandSpectrum[band] + float.Epsilon);
            float filtered = _rastaFilters[band].Process(logPower);
            _bandSpectrum[band] = (float)Math.Exp(filtered);
        }
    }

    // 4) + 5) equal loudness weighting followed by compression;
    //         band values are floored at 1.0 before weighting
    for (var band = 0; band < bandCount; band++)
    {
        var floored = Math.Max(_bandSpectrum[band], 1.0);
        _bandSpectrum[band] = (float)Math.Pow(floored * _equalLoudnessCurve[band], 0.33);
    }

    // 6) autocorrelation from the post-processed spectrum via the precomputed table.
    //    Each table row has two more entries than there are bands:
    //    the first and the last band are used twice (duplicated edges).
    var pointCount = _idftTable[0].Length;
    var lastBand = pointCount - 3;
    var divisor = 2 * (pointCount - 1);

    for (var lag = 0; lag < _idftTable.Length; lag++)
    {
        var row = _idftTable[lag];

        // both duplicated edges first ...
        var sum = row[0] * _bandSpectrum[0] + row[pointCount - 1] * _bandSpectrum[lastBand];

        // ... then the interior points
        for (var j = 1; j < pointCount - 1; j++)
        {
            sum += row[j] * _bandSpectrum[j - 1];
        }

        _cc[lag] = sum / divisor;
    }

    // LPC coefficients are recomputed from scratch for every frame
    Array.Clear(_lpc, 0, _lpc.Length);

    var err = Lpc.LevinsonDurbin(_cc, _lpc, _lpcOrder);

    // 7) LPC -> cepstrum, written straight into the output array
    Lpc.ToCepstrum(_lpc, err, features);

    // 8) optional liftering
    if (_lifterCoeffs != null)
    {
        features.ApplyWindow(_lifterCoeffs);
    }

    // 9) optional log-energy (floored) in place of the first coefficient
    if (_includeEnergy)
    {
        var energy = block.Sum(x => x * x);
        features[0] = (float)Math.Log(Math.Max(energy, _logEnergyFloor));
    }
}
/// <summary>
/// Rebuilds the filter bank from the values entered in the form,
/// refills the band selectors and shows the filter bank in the panel.
/// </summary>
/// <param name="sender">Event source (not used)</param>
/// <param name="e">Event data (not used)</param>
private void filterbankButton_Click(object sender, EventArgs e)
{
    var bandTotal = int.Parse(filterCountTextBox.Text);
    var sampleRate = int.Parse(samplingRateTextBox.Text);
    var spectrumSize = int.Parse(fftSizeTextBox.Text);
    var minFreq = float.Parse(lowFreqTextBox.Text);
    var maxFreq = float.Parse(highFreqTextBox.Text);
    var overlap = overlapCheckBox.Checked;

    // band edges; stay null when the filter bank is produced directly (ERB)
    Tuple<double, double, double>[] bands = null;

    switch (filterbankComboBox.Text)
    {
        case "Mel":
            bands = FilterBanks.MelBands(bandTotal, spectrumSize, sampleRate, minFreq, maxFreq, overlap);
            break;

        case "Bark":
            bands = FilterBanks.BarkBands(bandTotal, spectrumSize, sampleRate, minFreq, maxFreq, overlap);
            break;

        case "Critical bands":
            bands = FilterBanks.CriticalBands(bandTotal, spectrumSize, sampleRate, minFreq, maxFreq);
            break;

        case "Octave bands":
            bands = FilterBanks.OctaveBands(bandTotal, spectrumSize, sampleRate, minFreq, maxFreq, overlap);
            break;

        case "ERB":
            _filterbank = FilterBanks.Erb(bandTotal, spectrumSize, sampleRate, minFreq, maxFreq);

            // (squaring of the ERB filter weights is intentionally not done here)

            // plotting gain: 100 times the integer part of the reciprocal of the largest weight
            var peak = _filterbank.Max(f => f.Max());
            filterbankPanel.Gain = 100 * (int)(1.0 / peak);
            break;

        default:
            bands = FilterBanks.HerzBands(bandTotal, spectrumSize, sampleRate, minFreq, maxFreq, overlap);
            break;
    }

    // the filter shape is applied only when band edges were produced
    if (bands != null)
    {
        var shape = shapeComboBox.Text;

        if (shape == "Triangular")
        {
            _filterbank = FilterBanks.Triangular(spectrumSize, sampleRate, bands);
        }
        else if (shape == "Trapezoidal")
        {
            _filterbank = FilterBanks.Trapezoidal(spectrumSize, sampleRate, bands);
        }
        else if (shape == "BiQuad")
        {
            _filterbank = FilterBanks.BiQuad(spectrumSize, sampleRate, bands);
        }
        else
        {
            _filterbank = FilterBanks.Rectangular(spectrumSize, sampleRate, bands);
        }
    }

    var bandSelectors = new[] { band1ComboBox, band2ComboBox, band3ComboBox, band4ComboBox };
    var initialBands = new[] { "1", "2", "3", "4" };

    // every selector gets its own array instance as a data source
    foreach (var selector in bandSelectors)
    {
        selector.DataSource = Enumerable.Range(1, bandTotal).ToArray();
    }

    for (var s = 0; s < bandSelectors.Length; s++)
    {
        bandSelectors[s].Text = initialBands[s];
    }

    filterbankPanel.Groups = _filterbank;
}
/// <summary>
/// Constructs PLP extractor from configuration <paramref name="options"/>.
/// </summary>
/// <param name="options">PLP options</param>
public PlpExtractor(PlpOptions options) : base(options)
{
    FeatureCount = options.FeatureCount;

    // ----- filter bank and its center frequencies -----

    var bankSize = options.FilterBankSize;

    if (options.FilterBank != null)
    {
        // custom filter bank: all sizes are dictated by the supplied weights
        FilterBank = options.FilterBank;
        bankSize = FilterBank.Length;
        _blockSize = 2 * (FilterBank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");

        if (options.CenterFrequencies != null)
        {
            _centerFrequencies = options.CenterFrequencies;
        }
        else
        {
            // no center frequencies given: derive them from the filter bank weights,
            // taking the midpoint between the first positive weight and the first zero after it
            var hzPerBin = (double)SamplingRate / _blockSize;

            _centerFrequencies = new double[bankSize];

            for (var band = 0; band < FilterBank.Length; band++)
            {
                var weights = FilterBank[band];

                var firstBin = 0;
                var lastBin = _blockSize / 2;

                for (var bin = 0; bin < weights.Length; bin++)
                {
                    if (weights[bin] > 0)
                    {
                        firstBin = bin;
                        break;
                    }
                }

                for (var bin = firstBin; bin < weights.Length; bin++)
                {
                    if (weights[bin] == 0)
                    {
                        lastBin = bin;
                        break;
                    }
                }

                _centerFrequencies[band] = hzPerBin * (lastBin + firstBin) / 2;
            }
        }
    }
    else
    {
        // default filter bank: Slaney-style bark bank
        _blockSize = options.FftSize > FrameSize ? options.FftSize : MathUtils.NextPowerOfTwo(FrameSize);

        var low = options.LowFrequency;
        var high = options.HighFrequency;

        FilterBank = FilterBanks.BarkBankSlaney(bankSize, _blockSize, SamplingRate, low, high);

        _centerFrequencies = FilterBanks.BarkBandsSlaney(bankSize, SamplingRate, low, high)
                                        .Select(band => band.Item2)
                                        .ToArray();
    }

    // ----- equal loudness curve (one weight per center frequency) -----

    _equalLoudnessCurve = new double[bankSize];

    for (var band = 0; band < _centerFrequencies.Length; band++)
    {
        var squaredFreq = _centerFrequencies[band] * _centerFrequencies[band];

        _equalLoudnessCurve[band] = Math.Pow(squaredFreq / (squaredFreq + 1.6e5), 2) * ((squaredFreq + 1.44e6) / (squaredFreq + 9.61e6));
    }

    // ----- RASTA filters (one per band, only when RASTA coefficient is positive) -----

    _rasta = options.Rasta;

    if (_rasta > 0)
    {
        _rastaFilters = Enumerable.Range(0, bankSize)
                                  .Select(_ => new RastaFilter(_rasta))
                                  .ToArray();
    }

    // ----- IDFT table for obtaining autocorrelation coeffs from power spectrum -----

    _lpcOrder = options.LpcOrder > 0 ? options.LpcOrder : FeatureCount - 1;

    var bandCount = bankSize + 2;   // +2 duplicated edges
    var freq = Math.PI / (bandCount - 1);

    _idftTable = new float[_lpcOrder + 1][];

    for (var i = 0; i < _idftTable.Length; i++)
    {
        var row = new float[bandCount];

        // edge bands are taken once, inner bands twice
        row[0] = 1.0f;

        for (var j = 1; j < bandCount - 1; j++)
        {
            row[j] = 2 * (float)Math.Cos(freq * i * j);
        }

        row[bandCount - 1] = (float)Math.Cos(freq * i * (bandCount - 1));

        _idftTable[i] = row;
    }

    _lpc = new float[_lpcOrder + 1];
    _cc = new float[bandCount];

    // ----- everything else -----

    _fft = new RealFft(_blockSize);

    _lifterSize = options.LifterSize;
    _lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null;

    _includeEnergy = options.IncludeEnergy;
    _logEnergyFloor = options.LogEnergyFloor;

    _spectrum = new float[_blockSize / 2 + 1];
    _bandSpectrum = new float[bankSize];
}
/// <summary>
/// Constructs AMS extractor from configuration <paramref name="options"/>.
/// </summary>
/// <param name="options">AMS options</param>
public AmsExtractor(AmsOptions options) : base(options)
{
    _modulationFftSize = options.ModulationFftSize;
    _modulationHopSize = options.ModulationHopSize;
    _modulationFft = new RealFft(_modulationFftSize);

    // number of values in one modulation spectrum
    var modulationBins = _modulationFftSize / 2 + 1;

    _featuregram = options.Featuregram?.ToArray();

    // number of bands (featuregram columns or filters in the filter bank)
    int bandCount;

    if (_featuregram != null)
    {
        bandCount = _featuregram[0].Length;
        FeatureCount = bandCount * modulationBins;
    }
    else
    {
        if (options.FilterBank != null)
        {
            _filterbank = options.FilterBank;
            _fftSize = 2 * (_filterbank[0].Length - 1);

            Guard.AgainstExceedance(FrameSize, _fftSize, "frame size", "FFT size");
        }
        else
        {
            // default: 12 triangular mel filters between 100 and 3200 Hz
            _fftSize = options.FftSize > FrameSize ? options.FftSize : MathUtils.NextPowerOfTwo(FrameSize);

            var melBands = FilterBanks.MelBands(12, SamplingRate, 100, 3200);
            _filterbank = FilterBanks.Triangular(_fftSize, SamplingRate, melBands);
        }

        _fft = new RealFft(_fftSize);

        bandCount = _filterbank.Length;
        FeatureCount = bandCount * modulationBins;

        _spectrum = new float[_fftSize / 2 + 1];
        _filteredSpectrum = new float[bandCount];
        _block = new float[_fftSize];
    }

    _modBlock = new float[_modulationFftSize];
    _modSpectrum = new float[modulationBins];

    // feature descriptions: one entry per (band, modulation frequency) pair

    var modulationSamplingRate = (float)SamplingRate / HopSize;
    var resolution = modulationSamplingRate / _modulationFftSize;

    var descriptions = new List<string>();

    for (var band = 1; band <= bandCount; band++)
    {
        for (var bin = 0; bin < modulationBins; bin++)
        {
            descriptions.Add(string.Format("band_{0}_mf_{1:F2}_Hz", band, bin * resolution));
        }
    }

    FeatureDescriptions = descriptions;
}
/// <summary>
/// Constructs extractor that applies a given filter bank to the spectrum of each frame.
/// </summary>
/// <param name="samplingRate">Sampling rate (passed to the base extractor)</param>
/// <param name="featureCount">Number of features (stored in FeatureCount)</param>
/// <param name="filterbank">Filter bank weights; FFT size is derived as 2 * (filterbank[0].Length - 1)</param>
/// <param name="frameDuration">Frame duration (in seconds)</param>
/// <param name="hopDuration">Hop duration (in seconds)</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (passed to the base extractor)</param>
/// <param name="nonLinearity">Non-linearity applied together with the filter bank</param>
/// <param name="spectrumType">Kind of spectrum computed in each frame</param>
/// <param name="window">Window type</param>
/// <param name="logFloor">Floor value passed to the logarithmic post-processing routines</param>
public FilterbankExtractor(int samplingRate,
                           int featureCount,
                           float[][] filterbank,
                           double frameDuration = 0.0256/*sec*/,
                           double hopDuration = 0.010/*sec*/,
                           double preEmphasis = 0,
                           NonLinearityType nonLinearity = NonLinearityType.None,
                           SpectrumType spectrumType = SpectrumType.Power,
                           WindowTypes window = WindowTypes.Hamming,
                           float logFloor = float.Epsilon)

    : base(samplingRate, frameDuration, hopDuration, preEmphasis)
{
    FeatureCount = featureCount;
    FilterBank = filterbank;

    // each filter holds fftSize / 2 + 1 weights, hence:
    _blockSize = 2 * (filterbank[0].Length - 1);

    Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");

    _fft = new RealFft(_blockSize);

    _window = window;
    _windowSamples = Window.OfType(_window, FrameSize);

    // reusable buffers (the delegates below access them through the fields at call time)
    _spectrum = new float[_blockSize / 2 + 1];
    _bandSpectrum = new float[filterbank.Length];

    // spectrum post-processing: filter bank + non-linearity

    _logFloor = logFloor;
    _nonLinearityType = nonLinearity;

    if (nonLinearity == NonLinearityType.Log10)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (nonLinearity == NonLinearityType.LogE)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (nonLinearity == NonLinearityType.ToDecibel)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _bandSpectrum, _logFloor);
    }
    else if (nonLinearity == NonLinearityType.CubicRoot)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _bandSpectrum, 0.33);
    }
    else
    {
        _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _bandSpectrum);
    }

    // spectrum computation: magnitude or power, optionally normalized
    // (any other value leaves the delegate unassigned)

    _spectrumType = spectrumType;

    var normalized = _spectrumType == SpectrumType.MagnitudeNormalized ||
                     _spectrumType == SpectrumType.PowerNormalized;

    if (_spectrumType == SpectrumType.Magnitude || _spectrumType == SpectrumType.MagnitudeNormalized)
    {
        _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, normalized);
    }
    else if (_spectrumType == SpectrumType.Power || _spectrumType == SpectrumType.PowerNormalized)
    {
        _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, normalized);
    }
}
/// <summary>
/// Method for computing modulation spectra.
/// Each vector representing one modulation spectrum is a flattened version of 2D spectrum
/// (for each band: power spectrum of the band envelope).
///
/// Envelopes are taken from the featuregram if it was specified;
/// otherwise they are computed from the signal: in each frame the (optionally pre-emphasized
/// and windowed) block is transformed to power spectrum and passed through the filter bank.
/// </summary>
/// <param name="signal">Signal under analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of flattened modulation spectra</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ====================================== PREPARE =======================================

    var hopSize = (int)(signal.SamplingRate * HopSize);
    var frameSize = (int)(signal.SamplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize);

    var fft = new Fft(fftSize);
    var modulationFft = new Fft(_modulationFftSize);

    // number of values in one modulation spectrum
    var modulationBins = _modulationFftSize / 2 + 1;

    // number of bands; the featuregram (if given) is what the envelopes are built from,
    // so it takes precedence over the filter bank
    int length;

    if (_featuregram == null)
    {
        if (_filterbank == null)
        {
            // the default bank must match the size of the spectrum computed below,
            // so it is built with the effective FFT size (not the raw _fftSize)
            _filterbank = FilterBanks.Triangular(fftSize, signal.SamplingRate,
                              FilterBanks.MelBands(12, fftSize, signal.SamplingRate, 100, 3200));
        }

        length = _filterbank.Length;
    }
    else
    {
        length = _featuregram[0].Length;
    }

    _featureCount = length * modulationBins;

    var modulationSamplingRate = (float)signal.SamplingRate / hopSize;
    var resolution = modulationSamplingRate / _modulationFftSize;

    _featureDescriptions = new string[length * modulationBins];

    var idx = 0;
    for (var fi = 0; fi < length; fi++)
    {
        for (var fj = 0; fj < modulationBins; fj++)
        {
            _featureDescriptions[idx++] = string.Format("band_{0}_mf_{1:F2}_Hz", fi + 1, fj * resolution);
        }
    }

    // ================================= MAIN PROCESSING ==================================

    var featureVectors = new List<FeatureVector>();

    var en = 0;
    var i = startSample;

    if (_featuregram == null)
    {
        _envelopes = new float[_filterbank.Length][];
        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[signal.Length / hopSize];
        }

        var prevSample = startSample > 0 ? signal[startSample - 1] : 0.0f;

        // ===================== compute local FFTs (do STFT) =======================

        var spectrum = new float[fftSize / 2 + 1];
        var filteredSpectrum = new float[_filterbank.Length];

        var block = new float[fftSize];     // buffer for currently processed signal block at each step

        while (i + frameSize < endSample)
        {
            // zero-pad and load next frame
            Array.Clear(block, 0, block.Length);
            signal.Samples.FastCopyTo(block, frameSize, i);

            // 0) pre-emphasis (if needed); this is the only place where it is applied

            if (_preEmphasis > 0.0)
            {
                for (var k = 0; k < frameSize; k++)
                {
                    var y = block[k] - prevSample * _preEmphasis;
                    prevSample = block[k];
                    block[k] = y;
                }

                // sample preceding the first sample of the next frame
                prevSample = signal[i + hopSize - 1];
            }

            // 1) apply window

            if (_window != WindowTypes.Rectangular)
            {
                block.ApplyWindow(windowSamples);
            }

            // 2) calculate power spectrum

            fft.PowerSpectrum(block, spectrum);

            // 3) apply filterbank...

            FilterBanks.Apply(_filterbank, spectrum, filteredSpectrum);

            // ...and save results for future calculations

            for (var n = 0; n < _envelopes.Length; n++)
            {
                _envelopes[n][en] = filteredSpectrum[n];
            }
            en++;

            i += hopSize;
        }
    }
    else
    {
        // envelopes are the columns of the featuregram

        en = _featuregram.Length;

        _envelopes = new float[_featuregram[0].Length][];
        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[en];
            for (i = 0; i < en; i++)
            {
                _envelopes[n][i] = _featuregram[i][n];
            }
        }
    }

    // =========================== modulation analysis =======================

    var envelopeLength = en;

    // long-term AVG-normalization (by mean absolute value of each envelope)

    foreach (var envelope in _envelopes)
    {
        var avg = 0.0f;
        for (var k = 0; k < envelopeLength; k++)
        {
            avg += Math.Abs(envelope[k]);
        }
        avg /= envelopeLength;

        if (avg >= 1e-10)   // this happens more frequently
        {
            for (var k = 0; k < envelopeLength; k++)
            {
                envelope[k] /= avg;
            }
        }
    }

    var modBlock = new float[_modulationFftSize];
    var modSpectrum = new float[modulationBins];

    i = 0;
    while (i < envelopeLength)
    {
        var vector = new float[_envelopes.Length * modulationBins];
        var offset = 0;

        foreach (var envelope in _envelopes)
        {
            // zero-pad and load next portion of the envelope (may be shorter at the end)
            Array.Clear(modBlock, 0, modBlock.Length);
            envelope.FastCopyTo(modBlock, Math.Min(_modulationFftSize, envelopeLength - i), i);

            modulationFft.PowerSpectrum(modBlock, modSpectrum);
            modSpectrum.FastCopyTo(vector, modSpectrum.Length, 0, offset);

            offset += modSpectrum.Length;
        }

        featureVectors.Add(new FeatureVector
        {
            Features = vector,
            TimePosition = (double)i * hopSize / signal.SamplingRate
        });

        i += _modulationHopSize;
    }

    return featureVectors;
}
/// <summary>
/// Computes PNCC vector in one frame (PNCC algorithm according to [Kim and Stern, 2016]).
///
/// Steps performed for the frame:
///
///     1) Apply window
///     2) Obtain power spectrum
///     3) Apply gammatone filters
///     4) Medium-time processing (asymmetric noise suppression, temporal masking, spectral smoothing)
///     5) Apply nonlinearity
///     6) Do dct-II (normalized)
///
/// The method is stateful: it counts processed frames in _step and updates
/// the ring buffer and the running averages kept in the fields.
/// </summary>
/// <param name="block">
/// Block of data; samples starting from position FrameSize are overwritten with zeros.
/// </param>
/// <returns>
/// PNCC vector of size FeatureCount; a vector of zeros is returned until _step reaches 2*M.
/// </returns>
public override float[] ProcessFrame(float[] block)
{
    const float MeanPower = 1e10f;
    const float Epsilon = 2.22e-16f;

    // number of the current frame (frames are counted from 1)
    _step++;

    // fill zeros to fftSize if frameSize < fftSize

    for (var k = FrameSize; k < block.Length; block[k++] = 0) { ; }

    // 1) apply window

    block.ApplyWindow(_windowSamples);

    // 2) calculate power spectrum

    _fft.PowerSpectrum(block, _spectrum, false);

    // 3) apply gammatone filterbank

    FilterBanks.Apply(FilterBank, _spectrum, _gammatoneSpectrum);

    // =============================================================
    // 4) medium-time processing blocks:

    // 4.1) temporal integration (zero-phase moving average filter)

    _ringBuffer.Add(_gammatoneSpectrum);

    var spectrumQ = _ringBuffer.AverageSpectrum;

    // 4.2) asymmetric noise suppression

    // the lower envelope is initialized once, at step 2*M, with 90% of the averaged spectrum
    if (_step == 2 * M)
    {
        for (var j = 0; j < _spectrumQOut.Length; j++)
        {
            _spectrumQOut[j] = spectrumQ[j] * 0.9f;
        }
    }

    if (_step >= 2 * M)
    {
        // update the lower envelope: LambdaA is used when the input is above it, LambdaB otherwise
        for (var j = 0; j < _spectrumQOut.Length; j++)
        {
            if (spectrumQ[j] > _spectrumQOut[j])
            {
                _spectrumQOut[j] = LambdaA * _spectrumQOut[j] + (1 - LambdaA) * spectrumQ[j];
            }
            else
            {
                _spectrumQOut[j] = LambdaB * _spectrumQOut[j] + (1 - LambdaB) * spectrumQ[j];
            }
        }

        for (var j = 0; j < _filteredSpectrumQ.Length; j++)
        {
            // subtract the lower envelope and clip negative values
            _filteredSpectrumQ[j] = Math.Max(spectrumQ[j] - _spectrumQOut[j], 0.0f);

            // running averages are initialized once, at step 2*M
            if (_step == 2 * M)
            {
                _avgSpectrumQ1[j] = 0.9f * _filteredSpectrumQ[j];
                _avgSpectrumQ2[j] = _filteredSpectrumQ[j];
            }

            if (_filteredSpectrumQ[j] > _avgSpectrumQ1[j])
            {
                _avgSpectrumQ1[j] = LambdaA * _avgSpectrumQ1[j] + (1 - LambdaA) * _filteredSpectrumQ[j];
            }
            else
            {
                _avgSpectrumQ1[j] = LambdaB * _avgSpectrumQ1[j] + (1 - LambdaB) * _filteredSpectrumQ[j];
            }

            // 4.3) temporal masking

            // value before masking; it is used below to update _avgSpectrumQ2
            var threshold = _filteredSpectrumQ[j];

            _avgSpectrumQ2[j] *= LambdaT;
            if (spectrumQ[j] < C * _spectrumQOut[j])
            {
                _filteredSpectrumQ[j] = _avgSpectrumQ1[j];
            }
            else
            {
                if (_filteredSpectrumQ[j] <= _avgSpectrumQ2[j])
                {
                    _filteredSpectrumQ[j] = MuT * _avgSpectrumQ2[j];
                }
            }
            _avgSpectrumQ2[j] = Math.Max(_avgSpectrumQ2[j], threshold);

            _filteredSpectrumQ[j] = Math.Max(_filteredSpectrumQ[j], _avgSpectrumQ1[j]);
        }

        // 4.4) spectral smoothing

        // ratio of processed and averaged spectra (Epsilon guards against division by zero)
        for (var j = 0; j < _spectrumS.Length; j++)
        {
            _spectrumS[j] = _filteredSpectrumQ[j] / Math.Max(spectrumQ[j], Epsilon);
        }

        // average the ratio over neighbouring bands [j - N, j + N] (clipped at the edges)
        for (var j = 0; j < _smoothedSpectrumS.Length; j++)
        {
            _smoothedSpectrumS[j] = 0.0f;

            var total = 0;
            for (var k = Math.Max(j - N, 0);
                     k < Math.Min(j + N + 1, FilterBank.Length);
                     k++, total++)
            {
                _smoothedSpectrumS[j] += _spectrumS[k];
            }
            _smoothedSpectrumS[j] /= total;
        }

        // 4.5) mean power normalization

        var centralSpectrum = _ringBuffer.CentralSpectrum;

        var sumPower = 0.0f;
        for (var j = 0; j < _smoothedSpectrum.Length; j++)
        {
            _smoothedSpectrum[j] = _smoothedSpectrumS[j] * centralSpectrum[j];
            sumPower += _smoothedSpectrum[j];
        }

        // running mean of the total power (kept in the field between calls)
        _mean = LambdaMu * _mean + (1 - LambdaMu) * sumPower;

        for (var j = 0; j < _smoothedSpectrum.Length; j++)
        {
            _smoothedSpectrum[j] /= _mean;
            _smoothedSpectrum[j] *= MeanPower;
        }

        // =============================================================

        // 5) nonlinearity (power ^ (1 / _power), or natural Log if _power is zero)

        if (_power != 0)
        {
            for (var j = 0; j < _smoothedSpectrum.Length; j++)
            {
                _smoothedSpectrum[j] = (float)Math.Pow(_smoothedSpectrum[j], 1.0 / _power);
            }
        }
        else
        {
            for (var j = 0; j < _smoothedSpectrum.Length; j++)
            {
                _smoothedSpectrum[j] = (float)Math.Log(_smoothedSpectrum[j] + Epsilon);
            }
        }

        // 6) dct-II (Norm = normalized)

        var pnccs = new float[FeatureCount];
        _dct.DirectNorm(_smoothedSpectrum, pnccs);

        // wow, who knows, maybe it'll happen!
        // (the counter is wrapped back to a value above 2*M so that it never overflows)

        if (_step == int.MaxValue - 1)
        {
            _step = 2 * M + 1;
        }

        return(pnccs);
    }

    // until step 2*M is reached, vectors of zeros are returned

    return(new float[FeatureCount]);
}
/// <summary>
/// Constructs MFCC extractor.
/// </summary>
/// <param name="samplingRate">Sampling rate (passed to the base extractor)</param>
/// <param name="featureCount">Number of features (stored in FeatureCount)</param>
/// <param name="frameDuration">Frame duration (in seconds)</param>
/// <param name="hopDuration">Hop duration (in seconds)</param>
/// <param name="filterbankSize">Number of filters in the default mel filter bank</param>
/// <param name="lowFreq">Lower frequency passed to the mel bands routine</param>
/// <param name="highFreq">Upper frequency passed to the mel bands routine</param>
/// <param name="fftSize">FFT size; used only if it exceeds the frame size</param>
/// <param name="filterbank">Custom filter bank; if specified, it overrides filterbankSize and fftSize</param>
/// <param name="lifterSize">Size of liftering window (no liftering if it's not positive)</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (passed to the base extractor)</param>
/// <param name="includeEnergy">Value stored in the include-energy flag</param>
/// <param name="dctType">"1", "1N", "2", "2N", "3", "3N", "4", "4N"</param>
/// <param name="nonLinearity">Non-linearity applied together with the filter bank</param>
/// <param name="spectrumType">Kind of spectrum computed in each frame</param>
/// <param name="window">Window type</param>
/// <param name="logFloor">Floor value passed to the logarithmic post-processing routines</param>
public MfccExtractor(int samplingRate,
                     int featureCount,
                     double frameDuration = 0.0256/*sec*/,
                     double hopDuration = 0.010/*sec*/,
                     int filterbankSize = 24,
                     double lowFreq = 0,
                     double highFreq = 0,
                     int fftSize = 0,
                     float[][] filterbank = null,
                     int lifterSize = 0,
                     double preEmphasis = 0,
                     bool includeEnergy = false,
                     string dctType = "2N",
                     NonLinearityType nonLinearity = NonLinearityType.Log10,
                     SpectrumType spectrumType = SpectrumType.Power,
                     WindowTypes window = WindowTypes.Hamming,
                     float logFloor = float.Epsilon)

    : base(samplingRate, frameDuration, hopDuration, preEmphasis)
{
    FeatureCount = featureCount;

    _lowFreq = lowFreq;
    _highFreq = highFreq;

    if (filterbank != null)
    {
        // custom filter bank dictates both the number of bands and the FFT size
        FilterBank = filterbank;
        filterbankSize = filterbank.Length;
        _blockSize = 2 * (filterbank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");
    }
    else
    {
        _blockSize = fftSize > FrameSize ? fftSize : MathUtils.NextPowerOfTwo(FrameSize);

        var melBands = FilterBanks.MelBands(filterbankSize, _blockSize, SamplingRate, _lowFreq, _highFreq);

        // HTK/Kaldi-style
        FilterBank = FilterBanks.Triangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel);
    }

    _fft = new RealFft(_blockSize);

    _window = window;
    _windowSamples = Window.OfType(_window, FrameSize);

    _lifterSize = lifterSize;
    _lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null;

    _includeEnergy = includeEnergy;

    // reusable buffers (the delegates below access them through the fields at call time)
    _spectrum = new float[_blockSize / 2 + 1];
    _melSpectrum = new float[filterbankSize];

    // DCT: the first character selects the type, optional second character 'N' - the normalized version

    _dctType = dctType;

    switch (dctType[0])
    {
        case '1':
            _dct = new Dct1(filterbankSize);
            break;
        case '2':
            _dct = new Dct2(filterbankSize);
            break;
        case '3':
            _dct = new Dct3(filterbankSize);
            break;
        case '4':
            _dct = new Dct4(filterbankSize);
            break;
        default:
            throw new ArgumentException("Only DCT-1, 2, 3 and 4 are supported!");
    }

    var normalizedDct = dctType.Length > 1 && char.ToUpper(dctType[1]) == 'N';

    if (normalizedDct)
    {
        _applyDct = mfccs => _dct.DirectNorm(_melSpectrum, mfccs);
    }
    else
    {
        _applyDct = mfccs => _dct.Direct(_melSpectrum, mfccs);
    }

    // spectrum post-processing: filter bank + non-linearity

    _logFloor = logFloor;
    _nonLinearityType = nonLinearity;

    if (nonLinearity == NonLinearityType.Log10)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (nonLinearity == NonLinearityType.LogE)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (nonLinearity == NonLinearityType.ToDecibel)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (nonLinearity == NonLinearityType.CubicRoot)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _melSpectrum, 0.33);
    }
    else
    {
        _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _melSpectrum);
    }

    // spectrum computation: magnitude or power, optionally normalized
    // (any other value leaves the delegate unassigned)

    _spectrumType = spectrumType;

    switch (_spectrumType)
    {
        case SpectrumType.Magnitude:
        case SpectrumType.MagnitudeNormalized:
        {
            var normalize = _spectrumType == SpectrumType.MagnitudeNormalized;
            _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, normalize);
            break;
        }
        case SpectrumType.Power:
        case SpectrumType.PowerNormalized:
        {
            var normalize = _spectrumType == SpectrumType.PowerNormalized;
            _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, normalize);
            break;
        }
    }
}
/// <summary>
/// S(implified)PNCC algorithm according to [Kim and Stern, 2016]:
///     0) [Optional] pre-emphasis
///
/// Decompose signal into overlapping (hopSize) frames of length fftSize. In each frame do:
///
///     1) Apply window (if rectangular window was specified then just do nothing)
///     2) Obtain power spectrum
///     3) Apply gammatone filters (squared)
///     4) Mean power normalization
///     5) Apply nonlinearity
///     6) Do dct-II (normalized)
///
/// </summary>
/// <param name="signal">Signal for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of pncc vectors</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ====================================== PREPARE =======================================

    var hopSize = (int)(signal.SamplingRate * HopSize);
    var frameSize = (int)(signal.SamplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize);

    // the filter bank must match the size of the spectrum computed below,
    // so it is built with the effective FFT size (not the raw _fftSize)
    _gammatoneFilterBank = FilterBanks.Erb(_filterbankSize, fftSize, signal.SamplingRate, _lowFreq, _highFreq);

    // use power spectrum (i.e. squared filter weights):

    foreach (var filter in _gammatoneFilterBank)
    {
        for (var j = 0; j < filter.Length; j++)
        {
            filter[j] *= filter[j];
        }
    }

    var fft = new Fft(fftSize);
    var dct = new Dct2(_filterbankSize, FeatureCount);

    var gammatoneSpectrum = new float[_filterbankSize];

    const float meanPower = 1e10f;

    // initial value of the running mean power
    var mean = 4e07f;

    // exponent of the power nonlinearity (not used if _power is zero)
    var d = _power != 0 ? 1.0 / _power : 0.0;

    var block = new float[fftSize];         // buffer for a signal block at each step
    var zeroblock = new float[fftSize];     // buffer of zeros for quick memset
    var spectrum = new float[fftSize / 2 + 1];

    // ================================= MAIN PROCESSING ==================================

    var featureVectors = new List<FeatureVector>();

    var prevSample = startSample > 0 ? signal[startSample - 1] : 0.0f;

    var i = startSample;
    while (i + frameSize < endSample)
    {
        // prepare next block for processing

        zeroblock.FastCopyTo(block, zeroblock.Length);
        signal.Samples.FastCopyTo(block, frameSize, i);

        // 0) pre-emphasis (if needed)

        if (_preEmphasis > 0.0)
        {
            for (var k = 0; k < frameSize; k++)
            {
                var y = block[k] - prevSample * _preEmphasis;
                prevSample = block[k];
                block[k] = y;
            }

            // sample preceding the first sample of the next frame
            prevSample = signal[i + hopSize - 1];
        }

        // 1) apply window

        if (_window != WindowTypes.Rectangular)
        {
            block.ApplyWindow(windowSamples);
        }

        // 2) calculate power spectrum

        fft.PowerSpectrum(block, spectrum);

        // 3) apply gammatone filterbank

        FilterBanks.Apply(_gammatoneFilterBank, spectrum, gammatoneSpectrum);

        // 4) mean power normalization:

        var sumPower = 0.0f;
        for (var j = 0; j < gammatoneSpectrum.Length; j++)
        {
            sumPower += gammatoneSpectrum[j];
        }

        mean = LambdaMu * mean + (1 - LambdaMu) * sumPower;

        var scale = meanPower / mean;
        for (var j = 0; j < gammatoneSpectrum.Length; j++)
        {
            gammatoneSpectrum[j] *= scale;
        }

        // 5) nonlinearity (power ^ d or Log10)

        if (_power != 0)
        {
            for (var j = 0; j < gammatoneSpectrum.Length; j++)
            {
                gammatoneSpectrum[j] = (float)Math.Pow(gammatoneSpectrum[j], d);
            }
        }
        else
        {
            for (var j = 0; j < gammatoneSpectrum.Length; j++)
            {
                gammatoneSpectrum[j] = (float)Math.Log10(gammatoneSpectrum[j] + float.Epsilon);
            }
        }

        // 6) dct-II (normalized)

        var spnccs = new float[FeatureCount];
        dct.DirectN(gammatoneSpectrum, spnccs);

        // add pncc vector to output sequence

        featureVectors.Add(new FeatureVector
        {
            Features = spnccs,
            TimePosition = (double)i / signal.SamplingRate
        });

        i += hopSize;
    }

    return featureVectors;
}
/// <summary>
/// Constructs MFCC extractor from configuration <paramref name="options"/>.
/// </summary>
/// <param name="options">MFCC options</param>
public MfccExtractor(MfccOptions options) : base(options)
{
    FeatureCount = options.FeatureCount;

    var bankSize = options.FilterBankSize;

    if (options.FilterBank != null)
    {
        // custom filter bank dictates both the number of bands and the FFT size
        FilterBank = options.FilterBank;
        bankSize = FilterBank.Length;
        _blockSize = 2 * (FilterBank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");
    }
    else
    {
        _blockSize = options.FftSize > FrameSize ? options.FftSize : MathUtils.NextPowerOfTwo(FrameSize);

        var melBands = FilterBanks.MelBands(bankSize, SamplingRate, options.LowFrequency, options.HighFrequency);

        // HTK/Kaldi-style
        FilterBank = FilterBanks.Triangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel);
    }

    _fft = new RealFft(_blockSize);

    _lifterSize = options.LifterSize;
    _lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null;

    _includeEnergy = options.IncludeEnergy;
    _logEnergyFloor = options.LogEnergyFloor;

    // reusable buffers (the delegates below access them through the fields at call time)
    _spectrum = new float[_blockSize / 2 + 1];
    _melSpectrum = new float[bankSize];

    // DCT: the first character selects the type (DCT-II for anything but '1', '3', '4'),
    // trailing 'N' (case-insensitive) selects the normalized version

    _dctType = options.DctType;

    var dctKind = _dctType[0];

    if (dctKind == '1')
    {
        _dct = new Dct1(bankSize);
    }
    else if (dctKind == '3')
    {
        _dct = new Dct3(bankSize);
    }
    else if (dctKind == '4')
    {
        _dct = new Dct4(bankSize);
    }
    else
    {
        _dct = new Dct2(bankSize);
    }

    var normalizedDct = _dctType.EndsWith("N", StringComparison.OrdinalIgnoreCase);

    if (normalizedDct)
    {
        _applyDct = mfccs => _dct.DirectNorm(_melSpectrum, mfccs);
    }
    else
    {
        _applyDct = mfccs => _dct.Direct(_melSpectrum, mfccs);
    }

    // spectrum post-processing: filter bank + non-linearity

    _logFloor = options.LogFloor;
    _nonLinearityType = options.NonLinearity;

    if (_nonLinearityType == NonLinearityType.Log10)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.LogE)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.ToDecibel)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.CubicRoot)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _melSpectrum, 0.33);
    }
    else
    {
        _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _melSpectrum);
    }

    // spectrum computation: non-normalized power spectrum is the fallback

    _spectrumType = options.SpectrumType;

    var useMagnitude = _spectrumType == SpectrumType.Magnitude ||
                       _spectrumType == SpectrumType.MagnitudeNormalized;

    var normalized = _spectrumType == SpectrumType.MagnitudeNormalized ||
                     _spectrumType == SpectrumType.PowerNormalized;

    if (useMagnitude)
    {
        _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, normalized);
    }
    else
    {
        _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, normalized);
    }
}
/// <summary>
/// PNCC algorithm according to [Kim and Stern, 2016]:
///     0) [Optional] pre-emphasis
///
/// Decompose signal into overlapping (hopSize) frames of length fftSize. In each frame do:
///
///     1) Apply window (if rectangular window was specified then just do nothing)
///     2) Obtain power spectrum
///     3) Apply gammatone filters (squared)
///     4) Medium-time processing (asymmetric noise suppression, temporal masking, spectral smoothing)
///     5) Apply nonlinearity
///     6) Do dct-II (normalized)
///
/// Vectors are added to the output only starting from the frame with index 2*M.
/// </summary>
/// <param name="samples">Samples for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of pncc vectors</returns>
public override List<FeatureVector> ComputeFrom(float[] samples, int startSample, int endSample)
{
    Guard.AgainstInvalidRange(startSample, endSample, "starting pos", "ending pos");

    var hopSize = HopSize;
    var frameSize = FrameSize;

    const float meanPower = 1e10f;

    // initial value of the running mean power
    var mean = 4e07f;

    // exponent of the power nonlinearity (not used if _power is zero)
    var d = _power != 0 ? 1.0 / _power : 0.0;

    var prevSample = startSample > 0 ? samples[startSample - 1] : 0.0f;

    var featureVectors = new List<FeatureVector>();

    var i = 0;                      // index of the current frame
    var timePos = startSample;      // position of the current frame in samples

    while (timePos + frameSize < endSample)
    {
        // prepare next block for processing

        _zeroblock.FastCopyTo(_block, _fftSize);
        samples.FastCopyTo(_block, frameSize, timePos);

        // 0) pre-emphasis (if needed)

        if (_preEmphasis > 0.0)
        {
            for (var k = 0; k < frameSize; k++)
            {
                var y = _block[k] - prevSample * _preEmphasis;
                prevSample = _block[k];
                _block[k] = y;
            }

            // sample preceding the first sample of the next frame;
            // it is addressed by sample position (timePos), not by the frame index.
            // If it's beyond the array, there will be no next frame anyway
            var carryPos = timePos + hopSize - 1;
            if (carryPos < samples.Length)
            {
                prevSample = samples[carryPos];
            }
        }

        // 1) apply window

        if (_window != WindowTypes.Rectangular)
        {
            _block.ApplyWindow(_windowSamples);
        }

        // 2) calculate power spectrum

        _fft.PowerSpectrum(_block, _spectrum);

        // 3) apply gammatone filterbank

        FilterBanks.Apply(FilterBank, _spectrum, _gammatoneSpectrum);

        // =============================================================
        // 4) medium-time processing blocks:

        // 4.1) temporal integration (zero-phase moving average filter)

        _ringBuffer.Add(_gammatoneSpectrum);

        var spectrumQ = _ringBuffer.AverageSpectrum;

        // 4.2) asymmetric noise suppression

        // the lower envelope is initialized once, at frame 2*M
        if (i == 2 * M)
        {
            for (var j = 0; j < _spectrumQOut.Length; j++)
            {
                _spectrumQOut[j] = spectrumQ[j] * 0.9f;
            }
        }

        if (i >= 2 * M)
        {
            for (var j = 0; j < _spectrumQOut.Length; j++)
            {
                if (spectrumQ[j] > _spectrumQOut[j])
                {
                    _spectrumQOut[j] = LambdaA * _spectrumQOut[j] + (1 - LambdaA) * spectrumQ[j];
                }
                else
                {
                    _spectrumQOut[j] = LambdaB * _spectrumQOut[j] + (1 - LambdaB) * spectrumQ[j];
                }
            }

            for (var j = 0; j < _filteredSpectrumQ.Length; j++)
            {
                _filteredSpectrumQ[j] = Math.Max(spectrumQ[j] - _spectrumQOut[j], 0.0f);

                // running averages are initialized once, at frame 2*M
                if (i == 2 * M)
                {
                    _avgSpectrumQ1[j] = 0.9f * _filteredSpectrumQ[j];
                    _avgSpectrumQ2[j] = _filteredSpectrumQ[j];
                }

                if (_filteredSpectrumQ[j] > _avgSpectrumQ1[j])
                {
                    _avgSpectrumQ1[j] = LambdaA * _avgSpectrumQ1[j] + (1 - LambdaA) * _filteredSpectrumQ[j];
                }
                else
                {
                    _avgSpectrumQ1[j] = LambdaB * _avgSpectrumQ1[j] + (1 - LambdaB) * _filteredSpectrumQ[j];
                }

                // 4.3) temporal masking

                // value before masking; it is used below to update _avgSpectrumQ2
                var threshold = _filteredSpectrumQ[j];

                _avgSpectrumQ2[j] *= LambdaT;
                if (spectrumQ[j] < C * _spectrumQOut[j])
                {
                    _filteredSpectrumQ[j] = _avgSpectrumQ1[j];
                }
                else
                {
                    if (_filteredSpectrumQ[j] <= _avgSpectrumQ2[j])
                    {
                        _filteredSpectrumQ[j] = MuT * _avgSpectrumQ2[j];
                    }
                }
                _avgSpectrumQ2[j] = Math.Max(_avgSpectrumQ2[j], threshold);

                _filteredSpectrumQ[j] = Math.Max(_filteredSpectrumQ[j], _avgSpectrumQ1[j]);
            }

            // 4.4) spectral smoothing

            for (var j = 0; j < _spectrumS.Length; j++)
            {
                _spectrumS[j] = _filteredSpectrumQ[j] / Math.Max(spectrumQ[j], float.Epsilon);
            }

            // average over neighbouring bands [j - N, j + N] (clipped at the edges)
            for (var j = 0; j < _smoothedSpectrumS.Length; j++)
            {
                _smoothedSpectrumS[j] = 0.0f;

                var total = 0;
                for (var k = Math.Max(j - N, 0);
                         k < Math.Min(j + N + 1, _filterbankSize);
                         k++, total++)
                {
                    _smoothedSpectrumS[j] += _spectrumS[k];
                }
                _smoothedSpectrumS[j] /= total;
            }

            // 4.5) mean power normalization

            var centralSpectrum = _ringBuffer.CentralSpectrum;

            var sumPower = 0.0f;
            for (var j = 0; j < _smoothedSpectrum.Length; j++)
            {
                _smoothedSpectrum[j] = _smoothedSpectrumS[j] * centralSpectrum[j];
                sumPower += _smoothedSpectrum[j];
            }

            mean = LambdaMu * mean + (1 - LambdaMu) * sumPower;

            for (var j = 0; j < _smoothedSpectrum.Length; j++)
            {
                _smoothedSpectrum[j] *= meanPower / mean;
            }

            // =============================================================

            // 5) nonlinearity (power ^ d or Log10)

            if (_power != 0)
            {
                for (var j = 0; j < _smoothedSpectrum.Length; j++)
                {
                    _smoothedSpectrum[j] = (float)Math.Pow(_smoothedSpectrum[j], d);
                }
            }
            else
            {
                for (var j = 0; j < _smoothedSpectrum.Length; j++)
                {
                    _smoothedSpectrum[j] = (float)Math.Log10(_smoothedSpectrum[j] + float.Epsilon);
                }
            }

            // 6) dct-II (normalized)

            var pnccs = new float[FeatureCount];
            _dct.DirectN(_smoothedSpectrum, pnccs);

            // add pncc vector to output sequence

            featureVectors.Add(new FeatureVector
            {
                Features = pnccs,
                TimePosition = (double)timePos / SamplingRate
            });
        }

        i++;
        timePos += hopSize;
    }

    return featureVectors;
}
/// <summary>
/// Main constructor.
/// Prepares the modulation-spectrum stage and, unless a precomputed
/// <paramref name="featuregram"/> is supplied, the filterbank / FFT stage that feeds it.
/// </summary>
/// <param name="samplingRate">Sampling rate of the analyzed signal; forwarded to the base extractor
/// and used for building the default filterbank and the feature descriptions</param>
/// <param name="frameDuration">In seconds</param>
/// <param name="hopDuration">In seconds</param>
/// <param name="modulationFftSize">In samples</param>
/// <param name="modulationHopSize">In samples</param>
/// <param name="fftSize">In samples. Used only when neither <paramref name="featuregram"/> nor
/// <paramref name="filterbank"/> is given; values not exceeding FrameSize are replaced
/// with the next power of two of FrameSize</param>
/// <param name="featuregram">Optional precomputed sequence of feature vectors. When given,
/// no filterbank, FFT or window is prepared and the feature count is derived from
/// the length of the first vector</param>
/// <param name="filterbank">Optional custom filterbank. When given, the FFT size is derived from it
/// as 2 * (filterbank[0].Length - 1). When null, a triangular mel filterbank
/// (12 bands, 100..3200 - presumably Hz) is created</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (stored as float)</param>
/// <param name="window">Window type; window samples are precomputed only for non-rectangular windows
/// and only when no featuregram is given</param>
public AmsExtractor(int samplingRate,
                    double frameDuration = 0.0256/*sec*/,
                    double hopDuration = 0.010/*sec*/,
                    int modulationFftSize = 64,
                    int modulationHopSize = 4,
                    int fftSize = 0,
                    IEnumerable<float[]> featuregram = null,
                    float[][] filterbank = null,
                    double preEmphasis = 0.0,
                    WindowTypes window = WindowTypes.Rectangular)
    : base(samplingRate, frameDuration, hopDuration)
{
    _modulationFftSize = modulationFftSize;
    _modulationHopSize = modulationHopSize;
    _modulationFft = new Fft(_modulationFftSize);

    _featuregram = featuregram?.ToArray();

    if (featuregram != null)
    {
        // one block of modulation spectrum (N/2 + 1 bins) per featuregram component
        _featureCount = _featuregram[0].Length * (_modulationFftSize / 2 + 1);
    }
    else
    {
        // Test the constructor *parameter* here: the field _filterbank has not been assigned yet,
        // so checking the field would always pick the default filterbank
        // and silently discard the one supplied by the caller.
        if (filterbank == null)
        {
            _fftSize = fftSize > FrameSize ? fftSize : MathUtils.NextPowerOfTwo(FrameSize);

            _filterbank = FilterBanks.Triangular(_fftSize, samplingRate,
                             FilterBanks.MelBands(12, _fftSize, samplingRate, 100, 3200));
        }
        else
        {
            _filterbank = filterbank;

            // the spectrum buffer below has fftSize / 2 + 1 bins, so the FFT size
            // is recovered from the length of the first filter
            _fftSize = 2 * (filterbank[0].Length - 1);
        }

        _fft = new Fft(_fftSize);

        _featureCount = _filterbank.Length * (_modulationFftSize / 2 + 1);

        _window = window;
        if (_window != WindowTypes.Rectangular)
        {
            _windowSamples = Window.OfType(_window, FrameSize);
        }

        // reserve memory for reusable blocks
        _spectrum = new float[_fftSize / 2 + 1];
        _filteredSpectrum = new float[_filterbank.Length];
        _block = new float[_fftSize];
        _zeroblock = new float[_fftSize];
    }

    _preEmphasis = (float)preEmphasis;

    _modBlock = new float[_modulationFftSize];
    _zeroModblock = new float[_modulationFftSize];
    _modSpectrum = new float[_modulationFftSize / 2 + 1];

    // feature descriptions: one entry per (band, modulation frequency bin) pair

    var length = _featuregram != null ? _featuregram[0].Length : _filterbank.Length;

    _featureDescriptions = new List<string>();

    var modulationSamplingRate = (float)samplingRate / HopSize;
    var resolution = modulationSamplingRate / _modulationFftSize;

    for (var fi = 0; fi < length; fi++)
    {
        for (var fj = 0; fj <= _modulationFftSize / 2; fj++)
        {
            _featureDescriptions.Add(string.Format("band_{0}_mf_{1:F2}_Hz", fi + 1, fj * resolution));
        }
    }
}
/// <summary>
/// Main constructor.
/// Sets up the filterbank, FFT, DCT, window and the reusable work buffers.
/// </summary>
/// <param name="samplingRate">Sampling rate of the analyzed signal</param>
/// <param name="featureCount">Number of coefficients in each output feature vector</param>
/// <param name="frameDuration">Duration of analysis frame (in seconds)</param>
/// <param name="hopDuration">Hop duration (in seconds); forwarded to the base extractor</param>
/// <param name="power">Value kept by the extractor for later use</param>
/// <param name="lowFreq">Lower frequency passed to the ERB filterbank builder
/// (ignored if a custom filterbank is given)</param>
/// <param name="highFreq">Upper frequency passed to the ERB filterbank builder
/// (ignored if a custom filterbank is given)</param>
/// <param name="filterbankSize">Number of ERB filters (ignored if a custom filterbank is given)</param>
/// <param name="filterbank">Optional custom filterbank; it is used as is (its weights are not squared)
/// and it determines both the filterbank size and the FFT size</param>
/// <param name="fftSize">Size of FFT (in samples); values not exceeding FrameSize are replaced
/// with the next power of two of FrameSize</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (stored as float)</param>
/// <param name="window">Window type; window samples are precomputed for non-rectangular windows only</param>
public SpnccExtractor(int samplingRate,
                      int featureCount,
                      double frameDuration = 0.0256/*sec*/,
                      double hopDuration = 0.010/*sec*/,
                      int power = 15,
                      double lowFreq = 100,
                      double highFreq = 6800,
                      int filterbankSize = 40,
                      float[][] filterbank = null,
                      int fftSize = 0,
                      double preEmphasis = 0.0,
                      WindowTypes window = WindowTypes.Hamming)
    : base(samplingRate, frameDuration, hopDuration)
{
    FeatureCount = featureCount;
    _power = power;

    if (filterbank != null)
    {
        // custom filterbank: all sizes are dictated by its shape
        FilterBank = filterbank;
        _filterbankSize = filterbank.Length;
        _fftSize = 2 * (filterbank[0].Length - 1);
    }
    else
    {
        if (fftSize > FrameSize)
        {
            _fftSize = fftSize;
        }
        else
        {
            _fftSize = MathUtils.NextPowerOfTwo(FrameSize);
        }

        _filterbankSize = filterbankSize;
        _lowFreq = lowFreq;
        _highFreq = highFreq;

        FilterBank = FilterBanks.Erb(_filterbankSize, _fftSize, samplingRate, _lowFreq, _highFreq);

        // the filters are applied to a power spectrum, so square each weight in place
        foreach (var weights in FilterBank)
        {
            for (var bin = 0; bin < weights.Length; bin++)
            {
                weights[bin] *= weights[bin];
            }
        }
    }

    _fft = new Fft(_fftSize);
    _dct = new Dct2(_filterbankSize, FeatureCount);

    _preEmphasis = (float)preEmphasis;

    _window = window;
    if (_window != WindowTypes.Rectangular)
    {
        _windowSamples = Window.OfType(_window, FrameSize);
    }

    // reusable work buffers
    _block = new float[_fftSize];
    _spectrum = new float[_fftSize / 2 + 1];
    _filteredSpectrum = new float[_filterbankSize];
    _zeroblock = new float[_fftSize];
}
/// <summary>
/// Constructor.
/// Parses the list of requested features, maps each of them to a spectral / perceptual function
/// and prepares the FFT, window, frequency bands and reusable buffers.
/// </summary>
/// <param name="samplingRate">Sampling rate of the analyzed signal</param>
/// <param name="featureList">Feature names separated by any of ',', '+', '-', ';', ':'
/// (names are trimmed and lower-cased). "all" or "full" selects the whole FeatureSet.
/// Recognized names: sc/centroid, ss/spread, sfm/flatness, sn/noiseness, rolloff, crest,
/// entropy/ent, sd/decrease, loud/loudness, sharp/sharpness.
/// An unrecognized name yields a feature that is always 0</param>
/// <param name="frameDuration">Duration of analysis frame (in seconds)</param>
/// <param name="hopDuration">Hop duration (in seconds)</param>
/// <param name="fftSize">Size of FFT; values not exceeding FrameSize are replaced
/// with the next power of two of FrameSize</param>
/// <param name="frequencyBands">Frequency bands; if null, the result of
/// FilterBanks.OctaveBands(6, fftSize, samplingRate) is used</param>
/// <param name="window">Window type; window samples are precomputed for non-rectangular windows only</param>
/// <param name="parameters">Optional settings of particular features. Recognized keys:
/// "minLevel" (flatness), "noiseFrequency" (noiseness), "rolloffPercent" (rolloff).
/// Each value must be a boxed float</param>
public Mpeg7SpectralFeaturesExtractor(int samplingRate,
                                      string featureList,
                                      double frameDuration = 0.0256/*sec*/,
                                      double hopDuration = 0.010/*sec*/,
                                      int fftSize = 0,
                                      Tuple<double, double, double>[] frequencyBands = null,
                                      WindowTypes window = WindowTypes.Hamming,
                                      IReadOnlyDictionary<string, object> parameters = null)
    : base(samplingRate, frameDuration, hopDuration)
{
    if (featureList == "all" || featureList == "full")
    {
        featureList = FeatureSet;
    }

    // Feature names are identifiers, not linguistic text, so they are lower-cased with
    // the invariant culture (culture-sensitive lowering breaks matching, e.g. "I" under tr-TR).
    // The query is materialized once, because it is consumed twice below.
    var features = featureList.Split(',', '+', '-', ';', ':')
                              .Select(f => f.Trim().ToLowerInvariant())
                              .ToList();

    _extractors = features.Select<string, Func<float[], float[], float>>(feature =>
    {
        object value;

        switch (feature)
        {
            case "sc":
            case "centroid":
                return Spectral.Centroid;

            case "ss":
            case "spread":
                return Spectral.Spread;

            case "sfm":
            case "flatness":
                if (parameters != null && parameters.TryGetValue("minLevel", out value))
                {
                    var minLevel = (float)value;
                    return (spectrum, freqs) => Spectral.Flatness(spectrum, minLevel);
                }
                return (spectrum, freqs) => Spectral.Flatness(spectrum);

            case "sn":
            case "noiseness":
                if (parameters != null && parameters.TryGetValue("noiseFrequency", out value))
                {
                    var noiseFrequency = (float)value;
                    return (spectrum, freqs) => Spectral.Noiseness(spectrum, freqs, noiseFrequency);
                }
                return (spectrum, freqs) => Spectral.Noiseness(spectrum, freqs);

            case "rolloff":
                if (parameters != null && parameters.TryGetValue("rolloffPercent", out value))
                {
                    var rolloffPercent = (float)value;
                    return (spectrum, freqs) => Spectral.Rolloff(spectrum, freqs, rolloffPercent);
                }
                return (spectrum, freqs) => Spectral.Rolloff(spectrum, freqs);

            case "crest":
                return (spectrum, freqs) => Spectral.Crest(spectrum);

            case "entropy":
            case "ent":
                return (spectrum, freqs) => Spectral.Entropy(spectrum);

            case "sd":
            case "decrease":
                return (spectrum, freqs) => Spectral.Decrease(spectrum);

            case "loud":
            case "loudness":
                return (spectrum, freqs) => Perceptual.Loudness(spectrum);

            case "sharp":
            case "sharpness":
                return (spectrum, freqs) => Perceptual.Sharpness(spectrum);

            default:
                // unknown feature name: keep its slot in the vector, always produce 0
                return (spectrum, freqs) => 0;
        }
    }).ToList();

    FeatureDescriptions = features;

    _fftSize = fftSize > FrameSize ? fftSize : MathUtils.NextPowerOfTwo(FrameSize);
    _fft = new Fft(_fftSize);

    _window = window;
    if (_window != WindowTypes.Rectangular)
    {
        _windowSamples = Window.OfType(_window, FrameSize);
    }

    _frequencyBands = frequencyBands ?? FilterBanks.OctaveBands(6, _fftSize, samplingRate);
    _filterbank = FilterBanks.Rectangular(_fftSize, samplingRate, _frequencyBands);

    var cfs = _frequencyBands.Select(b => b.Item2).ToList();

    // insert zero frequency so that it'll be ignored during calculations
    // just like in case of FFT spectrum (0th DC component)
    cfs.Insert(0, 0);

    _frequencies = cfs.ToFloats();

    _parameters = parameters;

    // reserve memory for reusable blocks

    _spectrum = new float[_fftSize / 2 + 1];                // buffer for magnitude spectrum
    _mappedSpectrum = new float[_filterbank.Length + 1];    // buffer for total energies in bands
    _block = new float[_fftSize];                           // buffer for currently processed block
    _zeroblock = new float[_fftSize];                       // just a buffer of zeros for quick memset
}
/// <summary>
/// Constructor.
/// Prepares the filterbank with its center frequencies, the equal loudness curve,
/// optional RASTA filters, the IDFT table and the remaining work buffers.
/// </summary>
/// <param name="samplingRate">Sampling rate of the analyzed signal</param>
/// <param name="featureCount">Number of coefficients in each output feature vector</param>
/// <param name="frameDuration">Duration of analysis frame (in seconds)</param>
/// <param name="hopDuration">Hop duration (in seconds)</param>
/// <param name="lpcOrder">LPC order; if not positive, featureCount - 1 is used</param>
/// <param name="rasta">Coefficient passed to each RASTA filter; filters are created only if it is positive</param>
/// <param name="filterbankSize">Number of bark filters (replaced by the number of filters
/// in <paramref name="filterbank"/> if one is given)</param>
/// <param name="lowFreq">Lower frequency passed to the bark filterbank builder</param>
/// <param name="highFreq">Upper frequency passed to the bark filterbank builder</param>
/// <param name="fftSize">Size of FFT; values not exceeding FrameSize are replaced
/// with the next power of two of FrameSize (ignored if a custom filterbank is given)</param>
/// <param name="lifterSize">Liftering coefficients are precomputed only if this value is positive</param>
/// <param name="preEmphasis">Pre-emphasis coefficient; forwarded to the base extractor</param>
/// <param name="window">Window type</param>
/// <param name="filterbank">Optional custom filterbank; the block size is derived from it
/// as 2 * (filterbank[0].Length - 1)</param>
/// <param name="centerFrequencies">Optional center frequencies of the custom filterbank;
/// if null, they are estimated from the filterbank weights</param>
public PlpExtractor(int samplingRate,
                    int featureCount,
                    double frameDuration = 0.0256/*sec*/,
                    double hopDuration = 0.010/*sec*/,
                    int lpcOrder = 0,                   // will be autocalculated as featureCount - 1
                    double rasta = 0,
                    int filterbankSize = 24,
                    double lowFreq = 0,
                    double highFreq = 0,
                    int fftSize = 0,
                    int lifterSize = 0,
                    double preEmphasis = 0,
                    WindowTypes window = WindowTypes.Hamming,
                    float[][] filterbank = null,
                    double[] centerFrequencies = null)
    : base(samplingRate, frameDuration, hopDuration, preEmphasis)
{
    FeatureCount = featureCount;

    // ---------------- filterbank and center frequencies ----------------

    _lowFreq = lowFreq;
    _highFreq = highFreq;

    if (filterbank != null)
    {
        FilterBank = filterbank;
        filterbankSize = filterbank.Length;
        _blockSize = 2 * (filterbank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");

        if (centerFrequencies != null)
        {
            _centerFrequencies = centerFrequencies;
        }
        else
        {
            // no center frequencies given: estimate each one as the middle
            // of the non-zero region of the corresponding filter

            var hzPerBin = (double)samplingRate / _blockSize;

            _centerFrequencies = new double[filterbankSize];

            for (var band = 0; band < filterbank.Length; band++)
            {
                var weights = filterbank[band];

                // first bin with a positive weight (0 if there is none)
                var minPos = 0;
                while (minPos < weights.Length && !(weights[minPos] > 0))
                {
                    minPos++;
                }
                if (minPos == weights.Length)
                {
                    minPos = 0;
                }

                // first zero-weight bin at or after minPos (half of block size if there is none)
                var maxPos = minPos;
                while (maxPos < weights.Length && weights[maxPos] != 0)
                {
                    maxPos++;
                }
                if (maxPos == weights.Length)
                {
                    maxPos = _blockSize / 2;
                }

                _centerFrequencies[band] = hzPerBin * (maxPos + minPos) / 2;
            }
        }
    }
    else
    {
        if (fftSize > FrameSize)
        {
            _blockSize = fftSize;
        }
        else
        {
            _blockSize = MathUtils.NextPowerOfTwo(FrameSize);
        }

        var barkBands = FilterBanks.BarkBandsSlaney(filterbankSize, _blockSize, samplingRate, _lowFreq, _highFreq);
        FilterBank = FilterBanks.BarkBankSlaney(filterbankSize, _blockSize, samplingRate, _lowFreq, _highFreq);

        _centerFrequencies = barkBands.Select(b => b.Item2).ToArray();
    }

    // ---------------- equal loudness curve ----------------

    _equalLoudnessCurve = new double[filterbankSize];

    for (var band = 0; band < _centerFrequencies.Length; band++)
    {
        var fsq = _centerFrequencies[band] * _centerFrequencies[band];

        var ratio = fsq / (fsq + 1.6e5);
        var shelf = (fsq + 1.44e6) / (fsq + 9.61e6);

        _equalLoudnessCurve[band] = Math.Pow(ratio, 2) * shelf;
    }

    // ---------------- RASTA filters (one per band, only if requested) ----------------

    _rasta = rasta;

    if (rasta > 0)
    {
        var rastaFilters = new RastaFilter[filterbankSize];

        for (var band = 0; band < rastaFilters.Length; band++)
        {
            rastaFilters[band] = new RastaFilter(rasta);
        }

        _rastaFilters = rastaFilters;
    }

    // ------- IDFT table for obtaining autocorrelation coeffs from power spectrum -------

    if (lpcOrder > 0)
    {
        _lpcOrder = lpcOrder;
    }
    else
    {
        _lpcOrder = FeatureCount - 1;
    }

    var bandCount = filterbankSize + 2;     // +2 duplicated edges
    var lastBand = bandCount - 1;
    var freq = Math.PI / lastBand;

    _idftTable = new float[_lpcOrder + 1][];

    for (var order = 0; order < _idftTable.Length; order++)
    {
        var row = new float[bandCount];
        _idftTable[order] = row;

        // edge terms are taken once, inner terms twice
        row[0] = 1.0f;

        for (var band = 1; band < lastBand; band++)
        {
            row[band] = 2 * (float)Math.Cos(freq * order * band);
        }

        row[lastBand] = (float)Math.Cos(freq * order * lastBand);
    }

    _lpc = new float[_lpcOrder + 1];
    _cc = new float[bandCount];

    // ---------------- everything else ----------------

    _fft = new RealFft(_blockSize);

    _window = window;
    _windowSamples = Window.OfType(_window, FrameSize);

    _lifterSize = lifterSize;

    if (_lifterSize > 0)
    {
        _lifterCoeffs = Window.Liftering(FeatureCount, _lifterSize);
    }
    else
    {
        _lifterCoeffs = null;
    }

    _spectrum = new float[_blockSize / 2 + 1];
    _bandSpectrum = new float[filterbankSize];
}