// Regression test: runs the default MFCC extractor over three recordings of the
// Free Spoken Digits Dataset and pins both the number of descriptors produced
// and the first descriptor vector of each recording.
public void sample_test()
{
    string testDirectory = NUnit.Framework.TestContext.CurrentContext.TestDirectory;
    string datasetPath = Path.Combine(testDirectory, "mfcc");

    var dataset = new FreeSpokenDigitsDataset(datasetPath);
    var extractor = new MelFrequencyCepstrumCoefficient();

    // Recording 1: digit 0, speaker "jackson", index 10
    double[] expectedJackson = new double[]
    {
        10.570020645259348d, 1.3484344242338475d, 0.4861056552885234d,
        -0.79287993818868352d, -0.64182784362935996d, -0.28079835895392041d,
        -0.46378109632237779d, 0.072039410871952647d, -0.43971730320461733d,
        0.48891921252102533d, -0.22502241185050212d, 0.12478713268421229d,
        -0.13373400147110801d
    };

    Signal jackson = dataset.GetSignal(0, "jackson", 10);
    var jacksonDescriptors = extractor.Transform(jackson).ToArray();

    Assert.AreEqual(35, jacksonDescriptors.Length);
    Assert.IsTrue(expectedJackson.IsEqual(jacksonDescriptors[0].Descriptor, 1e-8));

    // Recording 2: digit 0, speaker "nicolas", index 10
    double[] expectedNicolas = new[]
    {
        10.6434445230168, -0.222107787197107, 0.316067614396639,
        -0.212769536249701, -0.107755264262885, -0.292732772820073,
        -0.00445205345925395, 0.024397440969199, 0.0213769364471326,
        -0.0882765552657509, -0.177682484734242, -0.1013307739251,
        -0.099014915302743
    };

    Signal nicolas = dataset.GetSignal(0, "nicolas", 10);
    var nicolasDescriptors = extractor.Transform(nicolas).ToArray();

    Assert.AreEqual(24, nicolasDescriptors.Length);
    Assert.IsTrue(expectedNicolas.IsEqual(nicolasDescriptors[0].Descriptor, 1e-8));

    // Recording 3: digit 5, speaker "theo", index 23
    double[] expectedTheo = new[]
    {
        7.24614406589037, -1.16796769512142, -0.134374026111248,
        -0.192703972718674, 0.112752647291759, -0.118712048308068,
        -0.0603752892245708, -0.0275002195634854, -0.0830858413953528,
        -0.0838965948140795, -0.15293502718595, 0.0107796827068413,
        -0.0491283773795346
    };

    Signal theo = dataset.GetSignal(5, "theo", 23);
    var theoDescriptors = extractor.Transform(theo).ToArray();

    Assert.AreEqual(27, theoDescriptors.Length);
    Assert.IsTrue(expectedTheo.IsEqual(theoDescriptors[0].Descriptor, 1e-8));
}
// Documentation sample for Signal.GetEnergy(): loads one recording from the
// Free Spoken Digits Dataset and pins the energy value reported for it.
public void GetEnergyTest_doc()
{
    string testDirectory = NUnit.Framework.TestContext.CurrentContext.TestDirectory;
    string basePath = Path.Combine(testDirectory, "energy");

    #region doc_energy
    // Suppose we want to measure the energy of an audio signal. As a
    // reproducible example, we take a recording from the Free Spoken
    // Digits Dataset (FSDD):
    var fsdd = new FreeSpokenDigitsDataset(basePath);

    Signal signal = fsdd.GetSignal(digit: 3, speaker: "jackson", index: 0);

    // The energy is the sum of the squared sample values over all
    // channels of the signal. For this recording it should be:
    double energy = signal.GetEnergy(); // 19.448728048242629
    #endregion

    Assert.AreEqual(19.448728048242629, energy, 1e-10);
}
// Documentation sample for LowPassFilter: filters one FSDD recording at 100 Hz and
// compares the spectrogram count at 1000 Hz before and after filtering.
public void sample_test()
{
    string pathWhereTheDatasetShouldBeStored = Path.Combine(
        NUnit.Framework.TestContext.CurrentContext.TestDirectory, "mfcc");

    #region doc_example1
    // Suppose we want to analyse an audio sample. So that anyone can reproduce
    // this example without first having to download a specific sound file by
    // hand, we take a recording from the Free Spoken Digits Dataset instead:
    var fsdd = new FreeSpokenDigitsDataset(path: pathWhereTheDatasetShouldBeStored);

    // Pick one of the audio signals in the dataset:
    Signal source = fsdd.GetSignal(0, "jackson", 10);
    int sampleRate = source.SampleRate; // 8000

    // Note: to load a signal from disk instead, the
    // following method could be used directly:
    // Signal source = Signal.FromFile(fileName);

    // Build a low-pass filter that keeps only frequencies below 100 Hz
    var filter = new LowPassFilter(frequency: 100, sampleRate: sampleRate);

    // Run the signal through the filter
    Signal filtered = filter.Apply(source);

    // Spectrogram of the original signal
    var sourceSpectrum = new Spectrogram(source);

    // Spectrogram of the filtered signal
    var resultSpectrum = new Spectrogram(filtered);

    // Read the count of a high frequency before and after the low-pass filter:
    double before = sourceSpectrum.GetFrequencyCount(windowIndex: 0, frequency: 1000); // 0.00028203820434203334
    double after = resultSpectrum.GetFrequencyCount(windowIndex: 0, frequency: 1000); // 2.9116651158267508E-05
    #endregion

    Assert.AreEqual(0.00028203820434203334, before, 1e-8);
    Assert.AreEqual(2.9116651158267508E-05, after, 1e-8);
}
// Documentation sample for MelFrequencyCepstrumCoefficient: inspects the basic
// properties of three FSDD recordings, extracts their MFCC descriptors with an
// explicitly configured extractor, and pins the resulting values.
public void sample_test()
{
    string basePath = NUnit.Framework.TestContext.CurrentContext.TestDirectory;
    string pathWhereTheDatasetShouldBeStored = Path.Combine(basePath, "mfcc");

    #region doc_example1
    // Let's say we would like to analyse an audio sample. To give an example that
    // could be reproduced by anyone without having to give a specific sound file
    // that would need to have been downloaded by every user trying to run this example,
    // we will obtain an example from the Free Spoken Digits Dataset instead:
    var fsdd = new FreeSpokenDigitsDataset(path: pathWhereTheDatasetShouldBeStored);

    // Let's obtain one of the audio signals:
    Signal a = fsdd.GetSignal(0, "jackson", 10);

    // Note: if you would like to load a signal from the
    // disk, you could use the following method directly:
    // Signal a = Signal.FromFile(fileName);

    // First we could extract some characteristics from the audio signal, just
    // for informative purposes. We don't actually need to register them just
    // to compute the MFCC, so please skip those checks if you would like!
    int numberOfChannels = a.NumberOfChannels;  // should be: 1
    int numberOfFrames = a.NumberOfFrames;      // should be: 5451
    int numberOfSamples = a.NumberOfSamples;    // should be: 5451
    SampleFormat format = a.SampleFormat;       // should be: Format32BitIeeeFloat
    int sampleRate = a.SampleRate;              // should be: 8000 (8khz)
    int samples = a.Samples;                    // should be: 5451
    int sampleSize = a.SampleSize;              // should be: 4
    int numberOfBytes = a.NumberOfBytes;        // should be: 21804

    // Now, let's say we would like to compute its MFCC:
    var extractor = new MelFrequencyCepstrumCoefficient(
        filterCount: 40,            // Note: all values are optional, you can
        cepstrumCount: 13,          // specify only the ones you'd like and leave
        lowerFrequency: 133.3333,   // all others at their defaults
        upperFrequency: 6855.4976,
        alpha: 0.97,
        samplingRate: 16000,
        frameRate: 100,
        windowLength: 0.0256,
        numberOfBins: 512);

    // We can call the transform method of the MFCC extractor class:
    IEnumerable<MelFrequencyCepstrumCoefficientDescriptor> mfcc = extractor.Transform(a);

    // or we could also transform them to a matrix directly with:
    double[][] actual = mfcc.Select(x => x.Descriptor).ToArray();

    // This matrix would contain X different MFCC values (due to the length of the signal)
    int numberOfMFCCs = actual.Length; // should be 35 (depends on the MFCC window)

    // Each of those MFCC values would have length 13;
    int descriptorLength = actual[0].Length; // 13 (depends on the MFCC cepstrum count)

    // An example of an MFCC vector would have been:
    double[] row = actual[0]; // should have been: (see vector written below)

    double[] expected = new double[]
    {
        10.570020645259348d, 1.3484344242338475d, 0.4861056552885234d,
        -0.79287993818868352d, -0.64182784362935996d, -0.28079835895392041d,
        -0.46378109632237779d, 0.072039410871952647d, -0.43971730320461733d,
        0.48891921252102533d, -0.22502241185050212d, 0.12478713268421229d,
        -0.13373400147110801d
    };
    #endregion

    // Signal a: properties and first descriptor
    Assert.AreEqual(1, numberOfChannels);
    Assert.AreEqual(5451, numberOfFrames);
    Assert.AreEqual(5451, numberOfSamples);
    Assert.AreEqual(SampleFormat.Format32BitIeeeFloat, format);
    Assert.AreEqual(8000, sampleRate);
    Assert.AreEqual(5451, samples);
    Assert.AreEqual(4, sampleSize);
    Assert.AreEqual(21804, numberOfBytes);
    Assert.AreEqual(sampleSize * numberOfFrames * numberOfChannels, numberOfBytes);
    Assert.AreEqual(35, numberOfMFCCs);
    // Check the documented descriptor length too, so the sample comment cannot drift.
    Assert.AreEqual(13, descriptorLength);
    Assert.IsTrue(expected.IsEqual(row, 1e-8));

    // Signal b: a two-channel recording
    Signal b = fsdd.GetSignal(0, "nicolas", 10);
    Assert.AreEqual(2, b.NumberOfChannels);
    Assert.AreEqual(3755, b.NumberOfFrames);
    Assert.AreEqual(7510, b.NumberOfSamples);
    Assert.AreEqual(SampleFormat.Format32BitIeeeFloat, b.SampleFormat);
    Assert.AreEqual(8000, b.SampleRate);
    Assert.AreEqual(7510, b.Samples);
    Assert.AreEqual(4, b.SampleSize);
    Assert.AreEqual(30040, b.NumberOfBytes);
    Assert.AreEqual(b.SampleSize * b.NumberOfFrames * b.NumberOfChannels, b.NumberOfBytes);

    MelFrequencyCepstrumCoefficientDescriptor[] rb = extractor.Transform(b).ToArray();
    Assert.AreEqual(24, rb.Length);
    Assert.IsTrue(new[]
    {
        10.6434445230168, -0.222107787197107, 0.316067614396639,
        -0.212769536249701, -0.107755264262885, -0.292732772820073,
        -0.00445205345925395, 0.024397440969199, 0.0213769364471326,
        -0.0882765552657509, -0.177682484734242, -0.1013307739251,
        -0.099014915302743
    }.IsEqual(rb[0].Descriptor, 1e-8));

    // Signal c
    Signal c = fsdd.GetSignal(5, "theo", 23);
    Assert.AreEqual(1, c.NumberOfChannels);
    Assert.AreEqual(4277, c.NumberOfFrames);
    Assert.AreEqual(4277, c.NumberOfSamples);
    Assert.AreEqual(SampleFormat.Format32BitIeeeFloat, c.SampleFormat);
    Assert.AreEqual(8000, c.SampleRate);
    Assert.AreEqual(4277, c.Samples);
    Assert.AreEqual(4, c.SampleSize);
    Assert.AreEqual(17108, c.NumberOfBytes);
    // The byte count of c must be derived from c's own sample size, not b's.
    Assert.AreEqual(c.SampleSize * c.NumberOfFrames * c.NumberOfChannels, c.NumberOfBytes);

    MelFrequencyCepstrumCoefficientDescriptor[] rc = extractor.Transform(c).ToArray();
    Assert.AreEqual(27, rc.Length);
    Assert.IsTrue(new[]
    {
        7.24614406589037, -1.16796769512142, -0.134374026111248,
        -0.192703972718674, 0.112752647291759, -0.118712048308068,
        -0.0603752892245708, -0.0275002195634854, -0.0830858413953528,
        -0.0838965948140795, -0.15293502718595, 0.0107796827068413,
        -0.0491283773795346
    }.IsEqual(rc[0].Descriptor, 1e-8));
}
// Documentation sample for BagOfAudioWords: learns a 32-word codebook from the
// FSDD training recordings, checks the collected descriptor statistics, then
// trains a Chi-Square kernel SVM on the resulting feature vectors and pins the
// training and testing accuracies.
public void learn()
{
    string basePath = Path.Combine(NUnit.Framework.TestContext.CurrentContext.TestDirectory, "learn");

    #region doc_learn
    // Ensure results are reproducible
    Accord.Math.Random.Generator.Seed = 0;

    // The Bag-of-Audio-Words model converts audio signals of arbitrary
    // size into fixed-length feature vectors. In this example, we
    // will be setting the codebook size to 32. This means all feature
    // vectors that will be generated will have the same length of 32.

    // By default, the BoW object will use the MFCC extractor as the
    // feature extractor and K-means as the clustering algorithm.

    // Create a new Bag-of-Audio-Words (BoW) model
    var bow = BagOfAudioWords.Create(numberOfWords: 32);
    // Note: a simple BoW model can also be created using
    // var bow = new BagOfAudioWords(numberOfWords: 32);

    // Get some training audio signals
    FreeSpokenDigitsDataset fsdd = new FreeSpokenDigitsDataset(basePath);
    string[] trainFileNames = fsdd.Training.LocalPaths;
    int[] trainOutputs = fsdd.Training.Digits;

    // Compute the model
    bow.Learn(trainFileNames);

    // After this point, we will be able to translate
    // the signals into double[] feature vectors using
    double[][] trainInputs = bow.Transform(trainFileNames);

    // We can also check some statistics about the dataset:
    int numberOfSignals = bow.Statistics.TotalNumberOfInstances; // 1350

    // Statistics about all the descriptors that have been extracted:
    int totalDescriptors = bow.Statistics.TotalNumberOfDescriptors; // 29106
    double totalMean = bow.Statistics.TotalNumberOfDescriptorsPerInstance.Mean; // 21.56
    double totalVar = bow.Statistics.TotalNumberOfDescriptorsPerInstance.Variance; // 52.764002965159314
    IntRange totalRange = bow.Statistics.TotalNumberOfDescriptorsPerInstanceRange; // [8, 115]

    // Statistics only about the descriptors that have been actually used:
    int takenDescriptors = bow.Statistics.NumberOfDescriptorsTaken; // 29106
    double takenMean = bow.Statistics.NumberOfDescriptorsTakenPerInstance.Mean; // 21.56
    double takenVar = bow.Statistics.NumberOfDescriptorsTakenPerInstance.Variance; // 52.764002965159314
    IntRange takenRange = bow.Statistics.NumberOfDescriptorsTakenPerInstanceRange; // [8, 115]
    #endregion

    Assert.AreEqual(1350, numberOfSignals);

    // Floating-point statistics are compared with a tolerance rather than exactly.
    Assert.AreEqual(29106, totalDescriptors);
    Assert.AreEqual(21.56, totalMean, 1e-10);
    Assert.AreEqual(52.764002965159314, totalVar, 1e-8);
    Assert.AreEqual(new IntRange(8, 115), totalRange);

    Assert.AreEqual(29106, takenDescriptors);
    Assert.AreEqual(21.56, takenMean, 1e-10);
    Assert.AreEqual(52.764002965159314, takenVar, 1e-8);
    Assert.AreEqual(new IntRange(8, 115), takenRange);

    // Fail with a clear message (instead of a NullReferenceException below)
    // if the clustering algorithm is not the expected K-means instance.
    var kmeans = bow.Clustering as KMeans;
    Assert.IsNotNull(kmeans, "The default clustering algorithm is expected to be K-means.");
    Assert.AreEqual(13, kmeans.Clusters.NumberOfInputs);
    Assert.AreEqual(32, kmeans.Clusters.NumberOfOutputs);
    Assert.AreEqual(32, kmeans.Clusters.NumberOfClasses);

    #region doc_classification
    // Now, the features can be used to train any classification
    // algorithm as if they were the signals themselves. For example,
    // we can use them to train a Chi-square SVM as shown below:

    // Create the SMO algorithm to learn a Chi-Square kernel SVM
    var teacher = new MulticlassSupportVectorLearning<ChiSquare>()
    {
        Learner = (p) => new SequentialMinimalOptimization<ChiSquare>()
    };

    // Obtain a learned machine
    var svm = teacher.Learn(trainInputs, trainOutputs);

    // Use the machine to classify the features
    int[] output = svm.Decide(trainInputs);

    // Compute the error between the expected and predicted labels for the training set:
    var trainMetrics = GeneralConfusionMatrix.Estimate(svm, trainInputs, trainOutputs);
    double trainAcc = trainMetrics.Accuracy; // should be around 0.97259259259259256

    // Now, we can evaluate the performance of the model on the testing set:
    string[] testFileNames = fsdd.Testing.LocalPaths;
    int[] testOutputs = fsdd.Testing.Digits;

    // First we transform the testing set to double[]:
    double[][] testInputs = bow.Transform(testFileNames);

    // Then we compute the error between expected and predicted for the testing set:
    var testMetrics = GeneralConfusionMatrix.Estimate(svm, testInputs, testOutputs);
    double testAcc = testMetrics.Accuracy; // should be around 0.8666666666666667
    #endregion

    Assert.AreEqual(0.97259259259259256, trainAcc, 1e-8);
    Assert.AreEqual(0.8666666666666667, testAcc, 1e-8);
}