/// <summary>
/// Creates an MFCC extractor and pre-allocates all reusable working buffers.
/// </summary>
/// <param name="samplingRate">Sampling rate forwarded to the base extractor</param>
/// <param name="featureCount">Number of coefficients; stored in FeatureCount and used as the DCT output size</param>
/// <param name="frameDuration">Frame duration (in seconds)</param>
/// <param name="hopDuration">Hop duration (in seconds)</param>
/// <param name="filterbankSize">Number of mel bands; used only if no custom filterbank is given</param>
/// <param name="lowFreq">Lower frequency passed to FilterBanks.MelBands</param>
/// <param name="highFreq">Upper frequency passed to FilterBanks.MelBands</param>
/// <param name="fftSize">FFT size; used only if it exceeds the frame size, otherwise the next power of two of the frame size is taken</param>
/// <param name="filterbank">Custom filterbank; if given, the filterbank size and FFT size are derived from its dimensions</param>
/// <param name="lifterSize">Liftering size; liftering coefficients are prepared only if it is positive</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (stored as float)</param>
/// <param name="window">Window type; window samples are precomputed unless it is rectangular</param>
public MfccExtractor(int samplingRate,
                     int featureCount,
                     double frameDuration = 0.0256 /*sec*/,
                     double hopDuration = 0.010 /*sec*/,
                     int filterbankSize = 20,
                     double lowFreq = 0,
                     double highFreq = 0,
                     int fftSize = 0,
                     float[][] filterbank = null,
                     int lifterSize = 22,
                     double preEmphasis = 0.0,
                     WindowTypes window = WindowTypes.Hamming)
    : base(samplingRate, frameDuration, hopDuration)
{
    FeatureCount = featureCount;

    if (filterbank != null)
    {
        // custom filterbank: its shape dictates both the band count and the FFT size
        FilterBank = filterbank;
        _filterbankSize = filterbank.Length;
        _fftSize = 2 * (filterbank[0].Length - 1);
    }
    else
    {
        // default filterbank: triangular filters over mel bands
        if (fftSize > FrameSize)
        {
            _fftSize = fftSize;
        }
        else
        {
            _fftSize = MathUtils.NextPowerOfTwo(FrameSize);
        }

        _filterbankSize = filterbankSize;
        _lowFreq = lowFreq;
        _highFreq = highFreq;

        var melBands = FilterBanks.MelBands(_filterbankSize, _fftSize, SamplingRate, _lowFreq, _highFreq);
        FilterBank = FilterBanks.Triangular(_fftSize, SamplingRate, melBands);
    }

    _fft = new Fft(_fftSize);
    _dct = new Dct2(_filterbankSize, FeatureCount);

    _window = window;
    if (_window != WindowTypes.Rectangular)
    {
        _windowSamples = Window.OfType(_window, FrameSize);
    }

    _lifterSize = lifterSize;
    if (_lifterSize > 0)
    {
        _lifterCoeffs = Window.Liftering(FeatureCount, _lifterSize);
    }
    else
    {
        _lifterCoeffs = null;
    }

    _preEmphasis = (float)preEmphasis;

    // working buffers that are reused between frames
    var spectrumSize = _fftSize / 2 + 1;
    _spectrum = new float[spectrumSize];
    _logMelSpectrum = new float[_filterbankSize];
    _block = new float[_fftSize];
    _zeroblock = new float[_fftSize];
}
/// <summary>
/// Lets the user pick a wave file, computes MFCC vectors for its left channel,
/// shows them in the UI and saves them to "mfccs.csv".
/// </summary>
/// <param name="sender">Event source</param>
/// <param name="e">Event arguments</param>
private async void openToolStripMenuItem_Click(object sender, EventArgs e)
{
    string fileName;

    // the dialog is disposable, so release it as soon as the user's choice is known
    using (var ofd = new OpenFileDialog())
    {
        if (ofd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        fileName = ofd.FileName;
    }

    // FileStream(path, FileMode) requests read/write access;
    // ask for read access only so that read-only files can be loaded too
    using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read))
    {
        var waveFile = new WaveFile(stream, true);
        _signal = waveFile[Channels.Left];
    }

    var mfccExtractor = new MfccExtractor(_signal.SamplingRate, 13,
                                          preEmphasis: 0.97,
                                          window: WindowTypes.Hamming);

    _mfccVectors = mfccExtractor.ComputeFrom(_signal);

    FillFeaturesList(_mfccVectors, mfccExtractor.FeatureDescriptions);

    // the signal may be too short to yield even one frame, so don't assume a first item exists
    if (mfccListView.Items.Count > 0)
    {
        mfccListView.Items[0].Selected = true;
    }

    melFilterBankPanel.Groups = mfccExtractor.FilterBank;

    if (_mfccVectors.Count > 0)
    {
        mfccPanel.Line = _mfccVectors[0].Features;
    }

    using (var csvFile = new FileStream("mfccs.csv", FileMode.Create))
    {
        var header = mfccExtractor.FeatureDescriptions;
        var serializer = new CsvFeatureSerializer(_mfccVectors, header);

        await serializer.SerializeAsync(csvFile);
    }
}
/// <summary>
/// Builds a triangular mel filterbank (with Scale.HerzToMel mapping).
/// If <paramref name="fftSize"/> does not exceed the frame size in samples,
/// the next power of two of the frame size is used as FFT size instead.
/// </summary>
/// <param name="filterbankSize">Number of mel bands</param>
/// <param name="samplingRate">Sampling rate</param>
/// <param name="fftSize">Requested FFT size</param>
/// <param name="frameDuration">Frame duration; multiplied by the sampling rate to get the frame size in samples</param>
/// <param name="lowFreq">Lower frequency passed to FilterBanks.MelBands</param>
/// <param name="highFreq">Upper frequency passed to FilterBanks.MelBands</param>
/// <returns>Filterbank weights</returns>
private static float[][] MakeFilterbank(int filterbankSize,
                                        int samplingRate,
                                        int fftSize,
                                        double frameDuration,
                                        double lowFreq = 0,
                                        double highFreq = 0)
{
    var samplesPerFrame = (int)(frameDuration * samplingRate);

    if (fftSize <= samplesPerFrame)
    {
        fftSize = MathUtils.NextPowerOfTwo(samplesPerFrame);
    }

    return FilterBanks.Triangular(
        fftSize,
        samplingRate,
        FilterBanks.MelBands(filterbankSize, fftSize, samplingRate, lowFreq, highFreq),
        null,
        Scale.HerzToMel);
}
/// <summary>
/// Fills MFCC options with a triangular mel filterbank (Scale.HerzToMel mapping),
/// NonLinearityType.LogE non-linearity and a log floor of 1.0.
/// </summary>
/// <param name="samplingRate">Sampling rate</param>
/// <param name="featureCount">Number of features</param>
/// <param name="frameDuration">Frame duration; multiplied by the sampling rate to get the frame size in samples</param>
/// <param name="lowFrequency">Lower frequency of the mel bands</param>
/// <param name="highFrequency">Upper frequency of the mel bands</param>
/// <param name="filterbankSize">Number of mel bands</param>
/// <param name="fftSize">FFT size; replaced with the next power of two of the frame size unless it exceeds the frame size</param>
public MfccHtkOptions(int samplingRate,
                      int featureCount,
                      double frameDuration,
                      double lowFrequency = 0,
                      double highFrequency = 0,
                      int filterbankSize = 24,
                      int fftSize = 0)
{
    // NOTE(review): frameDuration is only used here to derive the FFT size and is not stored
    // by this constructor - confirm that the frame duration option is set elsewhere.
    var samplesPerFrame = (int)(frameDuration * samplingRate);

    if (fftSize <= samplesPerFrame)
    {
        fftSize = MathUtils.NextPowerOfTwo(samplesPerFrame);
    }

    FilterBank = FilterBanks.Triangular(
        fftSize,
        samplingRate,
        FilterBanks.MelBands(filterbankSize, samplingRate, lowFrequency, highFrequency),
        null,
        Scale.HerzToMel);

    FilterBankSize = filterbankSize;
    FeatureCount = featureCount;
    FftSize = fftSize;
    SamplingRate = samplingRate;
    LowFrequency = lowFrequency;
    HighFrequency = highFrequency;
    NonLinearity = NonLinearityType.LogE;
    LogFloor = 1.0f;
}
/// <summary>
/// Constructs the extractor from configuration <paramref name="options"/>.
/// If a featuregram is given, it defines the band count; otherwise the filterbank does
/// (either the one from options or 12 triangular mel bands in the range 100-3200).
/// </summary>
/// <param name="options">AMS options</param>
public AmsExtractor(AmsOptions options) : base(options)
{
    _modulationFftSize = options.ModulationFftSize;
    _modulationHopSize = options.ModulationHopSize;
    _modulationFft = new RealFft(_modulationFftSize);

    _featuregram = options.Featuregram?.ToArray();

    // number of bins in one modulation spectrum
    var modulationBins = _modulationFftSize / 2 + 1;

    // number of bands (rows of the flattened 2D modulation spectrum)
    int bandCount;

    if (_featuregram == null)
    {
        if (options.FilterBank != null)
        {
            _filterbank = options.FilterBank;
            _fftSize = 2 * (_filterbank[0].Length - 1);

            Guard.AgainstExceedance(FrameSize, _fftSize, "frame size", "FFT size");
        }
        else
        {
            _fftSize = options.FftSize > FrameSize ? options.FftSize : MathUtils.NextPowerOfTwo(FrameSize);

            var melBands = FilterBanks.MelBands(12, SamplingRate, 100, 3200);
            _filterbank = FilterBanks.Triangular(_fftSize, SamplingRate, melBands);
        }

        _fft = new RealFft(_fftSize);

        bandCount = _filterbank.Length;
        FeatureCount = bandCount * modulationBins;

        _spectrum = new float[_fftSize / 2 + 1];
        _filteredSpectrum = new float[bandCount];
        _block = new float[_fftSize];
    }
    else
    {
        bandCount = _featuregram[0].Length;
        FeatureCount = bandCount * modulationBins;
    }

    _modBlock = new float[_modulationFftSize];
    _modSpectrum = new float[modulationBins];

    // feature descriptions: one entry per (band, modulation frequency) pair

    FeatureDescriptions = new List<string>();

    var modulationSamplingRate = (float)SamplingRate / HopSize;
    var resolution = modulationSamplingRate / _modulationFftSize;

    for (var band = 0; band < bandCount; band++)
    {
        for (var bin = 0; bin < modulationBins; bin++)
        {
            FeatureDescriptions.Add(string.Format("band_{0}_mf_{1:F2}_Hz", band + 1, bin * resolution));
        }
    }
}
/// <summary>
/// Constructs the extractor from configuration <paramref name="options"/>:
/// prepares the filterbank, FFT, liftering coefficients, DCT and the delegates
/// that compute the spectrum and post-process it.
/// </summary>
/// <param name="options">MFCC options</param>
public MfccExtractor(MfccOptions options) : base(options)
{
    FeatureCount = options.FeatureCount;

    var bandCount = options.FilterBankSize;

    if (options.FilterBank is null)
    {
        _blockSize = options.FftSize > FrameSize ? options.FftSize : MathUtils.NextPowerOfTwo(FrameSize);

        // HTK/Kaldi-style triangular mel filterbank
        FilterBank = FilterBanks.Triangular(
            _blockSize,
            SamplingRate,
            FilterBanks.MelBands(bandCount, SamplingRate, options.LowFrequency, options.HighFrequency),
            mapper: Scale.HerzToMel);
    }
    else
    {
        // custom filterbank: its shape dictates both the band count and the FFT size
        FilterBank = options.FilterBank;
        bandCount = FilterBank.Length;
        _blockSize = 2 * (FilterBank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");
    }

    _fft = new RealFft(_blockSize);

    _lifterSize = options.LifterSize;
    if (_lifterSize > 0)
    {
        _lifterCoeffs = Window.Liftering(FeatureCount, _lifterSize);
    }
    else
    {
        _lifterCoeffs = null;
    }

    _includeEnergy = options.IncludeEnergy;
    _logEnergyFloor = options.LogEnergyFloor;

    // ----- DCT: the first character selects the type (anything but 1, 3, 4 means DCT-II) -----

    _dctType = options.DctType;

    var dctKind = _dctType[0];

    if (dctKind == '1')
    {
        _dct = new Dct1(bandCount);
    }
    else if (dctKind == '3')
    {
        _dct = new Dct3(bandCount);
    }
    else if (dctKind == '4')
    {
        _dct = new Dct4(bandCount);
    }
    else
    {
        _dct = new Dct2(bandCount);
    }

    // a trailing "N" (case-insensitive) selects the normalized transform
    var useNormalizedDct = _dctType.EndsWith("N", StringComparison.OrdinalIgnoreCase);

    if (useNormalizedDct)
    {
        _applyDct = mfccs => _dct.DirectNorm(_melSpectrum, mfccs);
    }
    else
    {
        _applyDct = mfccs => _dct.Direct(_melSpectrum, mfccs);
    }

    // ----- spectrum post-processing (filterbank + non-linearity) -----

    _logFloor = options.LogFloor;
    _nonLinearityType = options.NonLinearity;

    if (_nonLinearityType == NonLinearityType.Log10)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.LogE)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.ToDecibel)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _melSpectrum, _logFloor);
    }
    else if (_nonLinearityType == NonLinearityType.CubicRoot)
    {
        _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _melSpectrum, 0.33);
    }
    else
    {
        _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _melSpectrum);
    }

    // ----- spectrum type (non-normalized power spectrum is the fallback) -----

    _spectrumType = options.SpectrumType;

    if (_spectrumType == SpectrumType.Magnitude)
    {
        _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false);
    }
    else if (_spectrumType == SpectrumType.MagnitudeNormalized)
    {
        _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true);
    }
    else if (_spectrumType == SpectrumType.PowerNormalized)
    {
        _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true);
    }
    else
    {
        _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false);
    }

    // working buffers that are reused between frames
    // (the delegates above read these fields only when they are invoked)

    _spectrum = new float[_blockSize / 2 + 1];
    _melSpectrum = new float[bandCount];
}
/// <summary>
/// Main constructor.
/// If a featuregram is given, it defines the band count; otherwise the filterbank does
/// (either the custom one or 12 triangular mel bands in the range 100-3200).
/// </summary>
/// <param name="samplingRate">Sampling rate</param>
/// <param name="frameDuration">In seconds</param>
/// <param name="hopDuration">In seconds</param>
/// <param name="modulationFftSize">In samples</param>
/// <param name="modulationHopSize">In samples</param>
/// <param name="fftSize">In samples; used only if it exceeds the frame size and no custom filterbank is given</param>
/// <param name="featuregram">Precomputed feature vectors; if given, no filterbank or FFT is set up</param>
/// <param name="filterbank">Custom filterbank; if given, FFT size is derived from its dimensions</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (stored as float)</param>
/// <param name="window">Window type; window samples are precomputed unless it is rectangular</param>
public AmsExtractor(int samplingRate,
                    double frameDuration = 0.0256 /*sec*/,
                    double hopDuration = 0.010 /*sec*/,
                    int modulationFftSize = 64,
                    int modulationHopSize = 4,
                    int fftSize = 0,
                    IEnumerable<float[]> featuregram = null,
                    float[][] filterbank = null,
                    double preEmphasis = 0.0,
                    WindowTypes window = WindowTypes.Rectangular)
    : base(samplingRate, frameDuration, hopDuration)
{
    _modulationFftSize = modulationFftSize;
    _modulationHopSize = modulationHopSize;
    _modulationFft = new Fft(_modulationFftSize);

    _featuregram = featuregram?.ToArray();

    if (featuregram != null)
    {
        _featureCount = _featuregram[0].Length * (_modulationFftSize / 2 + 1);
    }
    else
    {
        // Check the PARAMETER here: the field _filterbank is not assigned yet at this point,
        // so testing the field would always discard the filterbank supplied by the caller.
        if (filterbank == null)
        {
            _fftSize = fftSize > FrameSize ? fftSize : MathUtils.NextPowerOfTwo(FrameSize);

            _filterbank = FilterBanks.Triangular(_fftSize, samplingRate,
                             FilterBanks.MelBands(12, _fftSize, samplingRate, 100, 3200));
        }
        else
        {
            _filterbank = filterbank;
            _fftSize = 2 * (filterbank[0].Length - 1);
        }

        _fft = new Fft(_fftSize);

        _featureCount = _filterbank.Length * (_modulationFftSize / 2 + 1);

        _window = window;
        if (_window != WindowTypes.Rectangular)
        {
            _windowSamples = Window.OfType(_window, FrameSize);
        }

        // working buffers that are reused between frames
        _spectrum = new float[_fftSize / 2 + 1];
        _filteredSpectrum = new float[_filterbank.Length];
        _block = new float[_fftSize];
        _zeroblock = new float[_fftSize];
    }

    _preEmphasis = (float)preEmphasis;

    _modBlock = new float[_modulationFftSize];
    _zeroModblock = new float[_modulationFftSize];
    _modSpectrum = new float[_modulationFftSize / 2 + 1];

    // feature descriptions: one entry per (band, modulation frequency) pair

    int length;
    if (_featuregram != null)
    {
        length = _featuregram[0].Length;
    }
    else
    {
        length = _filterbank.Length;
    }

    _featureDescriptions = new List<string>();

    var modulationSamplingRate = (float)samplingRate / HopSize;
    var resolution = modulationSamplingRate / _modulationFftSize;

    for (var fi = 0; fi < length; fi++)
    {
        for (var fj = 0; fj <= _modulationFftSize / 2; fj++)
        {
            _featureDescriptions.Add(string.Format("band_{0}_mf_{1:F2}_Hz", fi + 1, fj * resolution));
        }
    }
}
/// <summary>
/// Builds a filterbank according to the current UI settings,
/// computes MFCC vectors of the loaded signal with it and shows the results.
/// </summary>
/// <param name="sender">Event source</param>
/// <param name="e">Event arguments</param>
private void buttonCompute_Click(object sender, EventArgs e)
{
    // nothing to analyze until a signal has been loaded
    if (_signal == null)
    {
        return;
    }

    var filterCount = int.Parse(textBoxSize.Text);
    var samplingRate = _signal.SamplingRate;
    var fftSize = int.Parse(textBoxFftSize.Text);
    var lowFreq = float.Parse(textBoxLowFreq.Text);
    var highFreq = float.Parse(textBoxHighFreq.Text);

    Tuple<double, double, double>[] bands;
    float[][] filterbank = null;
    VtlnWarper vtln = null;

    if (checkBoxVtln.Checked)
    {
        var alpha = float.Parse(textBoxVtlnAlpha.Text);
        var vtlnLow = float.Parse(textBoxVtlnLow.Text);
        var vtlnHigh = float.Parse(textBoxVtlnHigh.Text);

        vtln = new VtlnWarper(alpha, lowFreq, highFreq, vtlnLow, vtlnHigh);
    }

    // some options yield ready-made filterbanks, the others only band edges
    switch (comboBoxFilterbank.Text)
    {
        case "Mel":
            bands = FilterBanks.MelBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            break;
        case "Mel Slaney":
            bands = FilterBanks.MelBandsSlaney(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            filterbank = FilterBanks.MelBankSlaney(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxNormalize.Checked, vtln);
            break;
        case "Bark":
            bands = FilterBanks.BarkBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            break;
        case "Bark Slaney":
            bands = FilterBanks.BarkBandsSlaney(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            filterbank = FilterBanks.BarkBankSlaney(filterCount, fftSize, samplingRate, lowFreq, highFreq);
            break;
        case "Critical bands":
            bands = FilterBanks.CriticalBands(filterCount, fftSize, samplingRate, lowFreq, highFreq);
            break;
        case "Octave bands":
            bands = FilterBanks.OctaveBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            break;
        case "ERB":
            bands = null;
            filterbank = FilterBanks.Erb(filterCount, fftSize, samplingRate, lowFreq, highFreq);
            break;
        default:
            bands = FilterBanks.HerzBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, checkBoxOverlap.Checked);
            break;
    }

    // shape the filters only if the bands are known and no ready-made filterbank was produced above
    if (bands != null && filterbank == null)
    {
        switch (comboBoxShape.Text)
        {
            case "Triangular":
                filterbank = FilterBanks.Triangular(fftSize, samplingRate, bands, vtln, Utils.Scale.HerzToMel);
                break;
            case "Trapezoidal":
                filterbank = FilterBanks.Trapezoidal(fftSize, samplingRate, bands, vtln);
                break;
            case "BiQuad":
                filterbank = FilterBanks.BiQuad(fftSize, samplingRate, bands);
                break;
            default:
                filterbank = FilterBanks.Rectangular(fftSize, samplingRate, bands, vtln);
                break;
        }

        if (checkBoxNormalize.Checked)
        {
            FilterBanks.Normalize(filterCount, bands, filterbank);
        }
    }

    var spectrumType = (SpectrumType)comboBoxSpectrum.SelectedIndex;
    var nonLinearity = (NonLinearityType)comboBoxNonLinearity.SelectedIndex;
    var logFloor = float.Parse(textBoxLogFloor.Text);

    // frame duration is chosen so that one frame contains 512 samples
    var mfccExtractor = new MfccExtractor(samplingRate, 13, 512.0 / samplingRate, 0.01,
                                          filterbank: filterbank,
                                          spectrumType: spectrumType,
                                          nonLinearity: nonLinearity,
                                          dctType: comboBoxDct.Text,
                                          window: WindowTypes.Hamming,
                                          logFloor: logFloor);

    _mfccVectors = mfccExtractor.ComputeFrom(_signal);

    var header = mfccExtractor.FeatureDescriptions;

    FillFeaturesList(_mfccVectors, header);

    // the signal may be too short to yield even one frame, so don't assume a first item exists
    if (mfccListView.Items.Count > 0)
    {
        mfccListView.Items[0].Selected = true;
    }

    melFilterBankPanel.Groups = mfccExtractor.FilterBank;

    if (_mfccVectors.Count > 0)
    {
        mfccPanel.Line = _mfccVectors[0].Features;
    }
}
/// <summary>
/// Constructor: prepares the filterbank, FFT, window, liftering coefficients, DCT
/// and the delegates that compute the spectrum and post-process it.
/// </summary>
/// <param name="samplingRate">Sampling rate</param>
/// <param name="featureCount">Number of features</param>
/// <param name="frameDuration">Frame duration (in seconds)</param>
/// <param name="hopDuration">Hop duration (in seconds)</param>
/// <param name="filterbankSize">Number of mel bands; used only if no custom filterbank is given</param>
/// <param name="lowFreq">Lower frequency of the mel bands</param>
/// <param name="highFreq">Upper frequency of the mel bands</param>
/// <param name="fftSize">FFT size; used only if it exceeds the frame size, otherwise the next power of two of the frame size is taken</param>
/// <param name="filterbank">Custom filterbank; if given, the band count and FFT size are derived from its dimensions</param>
/// <param name="lifterSize">Liftering size; liftering coefficients are prepared only if it is positive</param>
/// <param name="preEmphasis">Pre-emphasis coefficient (passed to the base extractor)</param>
/// <param name="includeEnergy">Stored in the include-energy flag of the extractor</param>
/// <param name="dctType">"1", "1N", "2", "2N", "3", "3N", "4", "4N"</param>
/// <param name="nonLinearity">Non-linearity applied after the filterbank</param>
/// <param name="spectrumType">Spectrum type; values not listed explicitly fall back to the non-normalized power spectrum</param>
/// <param name="window">Window type</param>
/// <param name="logFloor">Floor value passed to the logarithmic post-processing functions</param>
/// <exception cref="ArgumentException">Thrown if <paramref name="dctType"/> is null, empty or does not start with 1, 2, 3 or 4</exception>
public MfccExtractor(int samplingRate,
                     int featureCount,
                     double frameDuration = 0.0256 /*sec*/,
                     double hopDuration = 0.010 /*sec*/,
                     int filterbankSize = 24,
                     double lowFreq = 0,
                     double highFreq = 0,
                     int fftSize = 0,
                     float[][] filterbank = null,
                     int lifterSize = 0,
                     double preEmphasis = 0,
                     bool includeEnergy = false,
                     string dctType = "2N",
                     NonLinearityType nonLinearity = NonLinearityType.Log10,
                     SpectrumType spectrumType = SpectrumType.Power,
                     WindowTypes window = WindowTypes.Hamming,
                     float logFloor = float.Epsilon)
    : base(samplingRate, frameDuration, hopDuration, preEmphasis)
{
    FeatureCount = featureCount;

    _lowFreq = lowFreq;
    _highFreq = highFreq;

    if (filterbank == null)
    {
        _blockSize = fftSize > FrameSize ? fftSize : MathUtils.NextPowerOfTwo(FrameSize);

        var melBands = FilterBanks.MelBands(filterbankSize, _blockSize, SamplingRate, _lowFreq, _highFreq);
        FilterBank = FilterBanks.Triangular(_blockSize, SamplingRate, melBands, mapper: Scale.HerzToMel);   // HTK/Kaldi-style
    }
    else
    {
        // custom filterbank: its shape dictates both the band count and the FFT size
        FilterBank = filterbank;
        filterbankSize = filterbank.Length;
        _blockSize = 2 * (filterbank[0].Length - 1);

        Guard.AgainstExceedance(FrameSize, _blockSize, "frame size", "FFT size");
    }

    _fft = new RealFft(_blockSize);

    _window = window;
    _windowSamples = Window.OfType(_window, FrameSize);

    _lifterSize = lifterSize;
    _lifterCoeffs = _lifterSize > 0 ? Window.Liftering(FeatureCount, _lifterSize) : null;

    _includeEnergy = includeEnergy;

    // setup DCT: ============================================================================

    // report a missing DCT type the same way as an unsupported one
    // (instead of failing with a null reference / index error on dctType[0])
    if (string.IsNullOrEmpty(dctType))
    {
        throw new ArgumentException("Only DCT-1, 2, 3 and 4 are supported!");
    }

    _dctType = dctType;

    switch (dctType[0])
    {
        case '1':
            _dct = new Dct1(filterbankSize);
            break;
        case '2':
            _dct = new Dct2(filterbankSize);
            break;
        case '3':
            _dct = new Dct3(filterbankSize);
            break;
        case '4':
            _dct = new Dct4(filterbankSize);
            break;
        default:
            throw new ArgumentException("Only DCT-1, 2, 3 and 4 are supported!");
    }

    // second character "N" (case-insensitive) selects the normalized transform
    if (dctType.Length > 1 && char.ToUpper(dctType[1]) == 'N')
    {
        _applyDct = mfccs => _dct.DirectNorm(_melSpectrum, mfccs);
    }
    else
    {
        _applyDct = mfccs => _dct.Direct(_melSpectrum, mfccs);
    }

    // setup spectrum post-processing: =======================================================

    _logFloor = logFloor;
    _nonLinearityType = nonLinearity;

    switch (nonLinearity)
    {
        case NonLinearityType.Log10:
            _postProcessSpectrum = () => FilterBanks.ApplyAndLog10(FilterBank, _spectrum, _melSpectrum, _logFloor);
            break;
        case NonLinearityType.LogE:
            _postProcessSpectrum = () => FilterBanks.ApplyAndLog(FilterBank, _spectrum, _melSpectrum, _logFloor);
            break;
        case NonLinearityType.ToDecibel:
            _postProcessSpectrum = () => FilterBanks.ApplyAndToDecibel(FilterBank, _spectrum, _melSpectrum, _logFloor);
            break;
        case NonLinearityType.CubicRoot:
            _postProcessSpectrum = () => FilterBanks.ApplyAndPow(FilterBank, _spectrum, _melSpectrum, 0.33);
            break;
        default:
            _postProcessSpectrum = () => FilterBanks.Apply(FilterBank, _spectrum, _melSpectrum);
            break;
    }

    _spectrumType = spectrumType;

    switch (_spectrumType)
    {
        case SpectrumType.Magnitude:
            _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, false);
            break;
        case SpectrumType.MagnitudeNormalized:
            _getSpectrum = block => _fft.MagnitudeSpectrum(block, _spectrum, true);
            break;
        case SpectrumType.PowerNormalized:
            _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, true);
            break;
        // SpectrumType.Power and any value not listed above:
        // without this fallback the delegate would stay null and fail on first use
        default:
            _getSpectrum = block => _fft.PowerSpectrum(block, _spectrum, false);
            break;
    }

    // reserve memory for reusable blocks
    // (the delegates above read these fields only when they are invoked)

    _spectrum = new float[_blockSize / 2 + 1];
    _melSpectrum = new float[filterbankSize];
}
/// <summary>
/// Method for computing modulation spectra.
/// Each vector representing one modulation spectrum is a flattened version of 2D spectrum
/// (band after band, each band contributing modulationFftSize / 2 + 1 values).
/// If a featuregram was given, its columns are used as envelopes;
/// otherwise envelopes are obtained by applying the filterbank to power spectra of signal frames.
/// </summary>
/// <param name="signal">Signal under analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of flattened modulation spectra</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ====================================== PREPARE =======================================

    var hopSize = (int)(signal.SamplingRate * HopSize);
    var frameSize = (int)(signal.SamplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    var fftSize = _fftSize >= frameSize ? _fftSize : MathUtils.NextPowerOfTwo(frameSize);

    var fft = new Fft(fftSize);
    var modulationFft = new Fft(_modulationFftSize);

    var modulationBins = _modulationFftSize / 2 + 1;

    // number of bands; the featuregram (if any) takes precedence over the filterbank,
    // so that descriptions always agree with the envelopes actually analyzed
    int length;

    if (_featuregram == null)
    {
        if (_filterbank == null)
        {
            // use the effective FFT size (not _fftSize, which may be smaller than the frame),
            // so that filter length matches the length of the spectrum computed below
            _filterbank = FilterBanks.Triangular(fftSize, signal.SamplingRate,
                             FilterBanks.MelBands(12, fftSize, signal.SamplingRate, 100, 3200));
        }

        length = _filterbank.Length;
    }
    else
    {
        length = _featuregram[0].Length;
    }

    _featureCount = length * modulationBins;

    var modulationSamplingRate = (float)signal.SamplingRate / hopSize;
    var resolution = modulationSamplingRate / _modulationFftSize;

    _featureDescriptions = new string[length * modulationBins];

    var idx = 0;
    for (var fi = 0; fi < length; fi++)
    {
        for (var fj = 0; fj <= _modulationFftSize / 2; fj++)
        {
            _featureDescriptions[idx++] = string.Format("band_{0}_mf_{1:F2}_Hz", fi + 1, fj * resolution);
        }
    }

    // NOTE: pre-emphasis is applied frame by frame in the loop below.
    // Filtering the entire signal here as well would apply pre-emphasis twice.

    // ================================= MAIN PROCESSING ==================================

    var featureVectors = new List<FeatureVector>();

    var en = 0;
    var i = startSample;

    if (_featuregram == null)
    {
        _envelopes = new float[_filterbank.Length][];
        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[signal.Length / hopSize];
        }

        var prevSample = startSample > 0 ? signal[startSample - 1] : 0.0f;

        // ===================== compute local FFTs (do STFT) =======================

        var spectrum = new float[fftSize / 2 + 1];
        var filteredSpectrum = new float[_filterbank.Length];

        var block = new float[fftSize];       // buffer for currently processed signal block at each step
        var zeroblock = new float[fftSize];   // buffer of zeros for quick memset

        while (i + frameSize < endSample)
        {
            // prepare next zero-padded block for processing
            zeroblock.FastCopyTo(block, zeroblock.Length);
            signal.Samples.FastCopyTo(block, frameSize, i);

            // 0) pre-emphasis (if needed)

            if (_preEmphasis > 0.0)
            {
                for (var k = 0; k < frameSize; k++)
                {
                    var y = block[k] - prevSample * _preEmphasis;
                    prevSample = block[k];
                    block[k] = y;
                }

                // the next frame starts hopSize samples later, so remember the sample preceding it
                prevSample = signal[i + hopSize - 1];
            }

            // 1) apply window

            if (_window != WindowTypes.Rectangular)
            {
                block.ApplyWindow(windowSamples);
            }

            // 2) calculate power spectrum

            fft.PowerSpectrum(block, spectrum);

            // 3) apply filterbank...

            FilterBanks.Apply(_filterbank, spectrum, filteredSpectrum);

            // ...and save results for future calculations

            for (var n = 0; n < _envelopes.Length; n++)
            {
                _envelopes[n][en] = filteredSpectrum[n];
            }
            en++;

            i += hopSize;
        }
    }
    else
    {
        // transpose the featuregram: each feature's trajectory becomes one envelope

        en = _featuregram.Length;
        _envelopes = new float[_featuregram[0].Length][];

        for (var n = 0; n < _envelopes.Length; n++)
        {
            _envelopes[n] = new float[en];
            for (i = 0; i < en; i++)
            {
                _envelopes[n][i] = _featuregram[i][n];
            }
        }
    }

    // =========================== modulation analysis =======================

    var envelopeLength = en;

    // long-term AVG-normalization (by the mean absolute value of each envelope)

    foreach (var envelope in _envelopes)
    {
        var avg = 0.0f;
        for (var k = 0; k < envelopeLength; k++)
        {
            // the sign of the VALUE matters here (not of the index):
            // featuregram-based envelopes may contain negative values
            avg += Math.Abs(envelope[k]);
        }
        avg /= envelopeLength;

        if (avg >= 1e-10)   // this happens more frequently
        {
            for (var k = 0; k < envelopeLength; k++)
            {
                envelope[k] /= avg;
            }
        }
    }

    var modBlock = new float[_modulationFftSize];
    var zeroModblock = new float[_modulationFftSize];
    var modSpectrum = new float[modulationBins];

    i = 0;
    while (i < envelopeLength)
    {
        var vector = new float[_envelopes.Length * modulationBins];
        var offset = 0;

        foreach (var envelope in _envelopes)
        {
            // zero-padded segment of the envelope (it may be shorter than the FFT size near the end)
            zeroModblock.FastCopyTo(modBlock, _modulationFftSize);
            envelope.FastCopyTo(modBlock, Math.Min(_modulationFftSize, envelopeLength - i), i);

            modulationFft.PowerSpectrum(modBlock, modSpectrum);
            modSpectrum.FastCopyTo(vector, modSpectrum.Length, 0, offset);

            offset += modSpectrum.Length;
        }

        featureVectors.Add(new FeatureVector
        {
            Features = vector,
            TimePosition = (double)i * hopSize / signal.SamplingRate
        });

        i += _modulationHopSize;
    }

    return featureVectors;
}
/// <summary>
/// Computes MFCC vectors frame by frame.
///
/// The signal is decomposed into frames (shifted by hop size); each frame is zero-padded
/// to FFT size and processed as follows:
///
///     0) [Optional] pre-emphasis
///     1) Windowing (skipped for rectangular window)
///     2) Power spectrum
///     3) Mel filterbank followed by logarithm (FilterBanks.ApplyAndLog)
///     4) DCT-II
///     5) [Optional] liftering
///
/// </summary>
/// <param name="signal">Signal for analysis</param>
/// <param name="startSample">The number (position) of the first sample for processing</param>
/// <param name="endSample">The number (position) of last sample for processing</param>
/// <returns>List of mfcc vectors</returns>
public override List<FeatureVector> ComputeFrom(DiscreteSignal signal, int startSample, int endSample)
{
    // ---------- preparation: sizes (in samples), transforms, filterbank ----------

    var samplingRate = signal.SamplingRate;

    var hopSize = (int)(samplingRate * HopSize);
    var frameSize = (int)(samplingRate * FrameSize);
    var windowSamples = Window.OfType(_window, frameSize);

    int fftSize;
    if (_fftSize >= frameSize)
    {
        fftSize = _fftSize;
    }
    else
    {
        fftSize = MathUtils.NextPowerOfTwo(frameSize);
    }

    var melBands = FilterBanks.MelBands(_filterbankSize, fftSize, samplingRate, _lowFreq, _highFreq);
    _melFilterBank = FilterBanks.Triangular(fftSize, samplingRate, melBands);

    float[] lifterCoeffs = null;
    if (_lifterSize > 0)
    {
        lifterCoeffs = Window.Liftering(FeatureCount, _lifterSize);
    }

    var fft = new Fft(fftSize);
    var dct = new Dct2(_filterbankSize, FeatureCount);

    // buffers that are reused for every frame

    var spectrum = new float[fftSize / 2 + 1];
    var logMelSpectrum = new float[_filterbankSize];
    var block = new float[fftSize];     // currently processed (zero-padded) frame
    var zeros = new float[fftSize];     // all-zero buffer used for fast clearing of the block

    // ---------- main processing ----------

    var result = new List<FeatureVector>();

    var prevSample = startSample > 0 ? signal[startSample - 1] : 0.0f;

    for (var pos = startSample; pos + frameSize < endSample; pos += hopSize)
    {
        // clear the block and copy the next frame into it
        zeros.FastCopyTo(block, zeros.Length);
        signal.Samples.FastCopyTo(block, windowSamples.Length, pos);

        // 0) pre-emphasis
        if (_preEmphasis > 0.0)
        {
            for (var k = 0; k < frameSize; k++)
            {
                var filtered = block[k] - prevSample * _preEmphasis;
                prevSample = block[k];
                block[k] = filtered;
            }

            // the next frame starts hopSize samples later, so remember the sample preceding it
            prevSample = signal[pos + hopSize - 1];
        }

        // 1) windowing
        if (_window != WindowTypes.Rectangular)
        {
            block.ApplyWindow(windowSamples);
        }

        // 2) power spectrum
        fft.PowerSpectrum(block, spectrum);

        // 3) mel filterbank + log
        FilterBanks.ApplyAndLog(_melFilterBank, spectrum, logMelSpectrum);

        // 4) DCT-II
        var mfccs = new float[FeatureCount];
        dct.Direct(logMelSpectrum, mfccs);

        // 5) liftering
        if (lifterCoeffs != null)
        {
            mfccs.ApplyWindow(lifterCoeffs);
        }

        var featureVector = new FeatureVector
        {
            Features = mfccs,
            TimePosition = (double)pos / signal.SamplingRate
        };

        result.Add(featureVector);
    }

    return result;
}
/// <summary>
/// Rebuilds the filterbank from the values entered in the UI,
/// refills the band selectors and shows the filterbank in the panel.
/// </summary>
/// <param name="sender">Event source</param>
/// <param name="e">Event arguments</param>
private void filterbankButton_Click(object sender, EventArgs e)
{
    var filterCount = int.Parse(filterCountTextBox.Text);
    var samplingRate = int.Parse(samplingRateTextBox.Text);
    var fftSize = int.Parse(fftSizeTextBox.Text);
    var lowFreq = float.Parse(lowFreqTextBox.Text);
    var highFreq = float.Parse(highFreqTextBox.Text);

    var bankName = filterbankComboBox.Text;

    // band edges; stay null for ERB, whose filterbank is produced directly
    Tuple<double, double, double>[] bands = null;

    if (bankName == "Mel")
    {
        bands = FilterBanks.MelBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, overlapCheckBox.Checked);
    }
    else if (bankName == "Bark")
    {
        bands = FilterBanks.BarkBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, overlapCheckBox.Checked);
    }
    else if (bankName == "Critical bands")
    {
        bands = FilterBanks.CriticalBands(filterCount, fftSize, samplingRate, lowFreq, highFreq);
    }
    else if (bankName == "Octave bands")
    {
        bands = FilterBanks.OctaveBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, overlapCheckBox.Checked);
    }
    else if (bankName == "ERB")
    {
        _filterbank = FilterBanks.Erb(filterCount, fftSize, samplingRate, lowFreq, highFreq);

        // normalization coefficient (for plotting): truncated reciprocal of the largest filter weight
        var peak = _filterbank.Max(f => f.Max());
        var scaleCoeff = (int)(1.0 / peak);

        filterbankPanel.Gain = 100 * scaleCoeff;
    }
    else
    {
        bands = FilterBanks.HerzBands(filterCount, fftSize, samplingRate, lowFreq, highFreq, overlapCheckBox.Checked);
    }

    if (bands != null)
    {
        var shape = shapeComboBox.Text;

        if (shape == "Triangular")
        {
            _filterbank = FilterBanks.Triangular(fftSize, samplingRate, bands);
        }
        else if (shape == "Trapezoidal")
        {
            _filterbank = FilterBanks.Trapezoidal(fftSize, samplingRate, bands);
        }
        else if (shape == "BiQuad")
        {
            _filterbank = FilterBanks.BiQuad(fftSize, samplingRate, bands);
        }
        else
        {
            _filterbank = FilterBanks.Rectangular(fftSize, samplingRate, bands);
        }
    }

    // every selector gets its own array of band numbers 1..filterCount

    var bandSelectors = new[] { band1ComboBox, band2ComboBox, band3ComboBox, band4ComboBox };

    foreach (var selector in bandSelectors)
    {
        selector.DataSource = Enumerable.Range(1, filterCount).ToArray();
    }

    band1ComboBox.Text = "1";
    band2ComboBox.Text = "2";
    band3ComboBox.Text = "3";
    band4ComboBox.Text = "4";

    filterbankPanel.Groups = _filterbank;
}