/// <summary>
/// Builds the frequency scale for the given scale type, converts a simple test spectrum
/// to that scale and prints one line per bin (index, bin bounds and spectrum value) to the console.
/// </summary>
/// <param name="fst">The frequency scale type to exercise.</param>
public static void TestOctaveScale(FreqScaleType fst)
{
    var scale = new FrequencyScale(fst);
    var binBounds = scale.BinBounds;

    // The two 32000 Hz-Nyquist scales use a 64000 Hz sample rate with a 16384 frame;
    // every other scale falls back to the defaults for a 22050 Hz sample rate.
    bool usesNyquist32000 = fst == FreqScaleType.Octaves24Nyquist32000
                            || fst == FreqScaleType.Linear125Octaves7Tones28Nyquist32000;
    int sampleRate = usesNyquist32000 ? 64000 : 22050;
    int frameLength = usesNyquist32000 ? 16384 : 8192;

    // Get a simple test spectrum and convert it to the octave scale.
    var testSpectrum = GetSimpleTestSpectrum(sampleRate, frameLength);
    var octaveSpectrum = OctaveSpectrum(binBounds, testSpectrum);

    // write output
    int binCount = binBounds.GetLength(0);
    for (int row = 0; row < binCount; row++)
    {
        Console.WriteLine($"{row} bin-{binBounds[row, 0]} {binBounds[row, 1]}Hz {octaveSpectrum[row]}");
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="FrequencyScale"/> class.
/// CONSTRUCTOR
/// Assigns DEFAULT parameters for the Linear and Mel scales; any other scale type is
/// treated as an octave scale and is set up by <c>OctaveFreqScale.GetOctaveScale</c>.
/// </summary>
/// <param name="fst">The frequency scale type to construct.</param>
public FrequencyScale(FreqScaleType fst)
{
    this.ScaleType = fst;
    if (fst == FreqScaleType.Linear)
    {
        LoggedConsole.WriteErrorLine("WARNING: Assigning DEFAULT parameters for Linear FREQUENCY SCALE.");
        LoggedConsole.WriteErrorLine("         Call other CONSTUCTOR to control linear scale.");
        this.Nyquist = 11025;
        this.WindowSize = 512;
        this.FinalBinCount = 256;
        this.HertzGridInterval = 1000;
        this.LinearBound = this.Nyquist;
        this.BinBounds = this.GetLinearBinBounds();

        // Use the property rather than repeating the literal 256 so the two cannot drift apart.
        this.GridLineLocations = GetLinearGridLineLocations(this.Nyquist, this.HertzGridInterval, this.FinalBinCount);
    }
    else if (fst == FreqScaleType.Mel)
    {
        LoggedConsole.WriteErrorLine("WARNING: Assigning DEFAULT parameters for MEL FREQUENCY SCALE.");
        this.Nyquist = 11025;
        this.WindowSize = 512;
        this.FinalBinCount = 128;
        this.HertzGridInterval = 1000;
        this.LinearBound = this.Nyquist;

        // The parameterised constructor assigns the mel bin bounds; do the same here so that
        // BinBounds is not left unassigned for a default mel scale.
        // NOTE(review): assumes GetMelBinBounds() only needs the properties assigned above - confirm.
        this.BinBounds = this.GetMelBinBounds();
        this.GridLineLocations = GetMelGridLineLocations(this.HertzGridInterval, this.Nyquist, this.FinalBinCount);
    }
    else
    {
        // assume octave scale is only other option
        OctaveFreqScale.GetOctaveScale(this);
    }
}
/// <summary>
/// METHOD TO CHECK IF SPECIFIED MEL FREQ SCALE IS WORKING
/// Check it on standard one minute recording.
/// Draws an annotated mel-scale spectrogram, writes the grid line locations to file and
/// compares that file with the expected results file.
/// </summary>
public static void TESTMETHOD_MelFrequencyScale()
{
    // input recording and output locations
    var sourcePath = @"C:\SensorNetworks\SoftwareTests\TestRecordings\BAC2_20071008-085040.wav";
    var outputFolder = @"C:\SensorNetworks\SoftwareTests\TestFrequencyScale".ToDirectoryInfo();
    var expectedFolder = Path.Combine(outputFolder.FullName, TestTools.ExpectedResultsDir).ToDirectoryInfo();
    var imagePath = Path.Combine(outputFolder.FullName, "melScaleSonogram.png");
    var fileStem = "BAC2_20071008";

    var recording = new AudioRecording(sourcePath);

    // frequency scale parameters
    int nyquist = recording.Nyquist;
    int frameSize = 1024;
    int finalBinCount = 256;
    int hertzInterval = 1000;
    FreqScaleType scaleType = FreqScaleType.Mel;
    var freqScale = new FrequencyScale(scaleType, nyquist, frameSize, finalBinCount, hertzInterval);
    var fst = freqScale.ScaleType;

    bool isMelScale = scaleType == FreqScaleType.Mel;
    var sonoConfig = new SonogramConfig
    {
        WindowSize = frameSize,
        WindowOverlap = 0.2,
        SourceFName = recording.BaseName,
        DoMelScale = isMelScale,
        MelBinCount = isMelScale ? finalBinCount : frameSize / 2,

        //NoiseReductionType = NoiseReductionType.Standard,
        NoiseReductionType = NoiseReductionType.None,
        NoiseReductionParameter = 0.0,
    };

    var sonogram = new SpectrogramStandard(sonoConfig, recording.WavReader);

    // DRAW SPECTROGRAM
    var image = sonogram.GetImageFullyAnnotated(sonogram.GetImage(), $"SPECTROGRAM: {fst}", freqScale.GridLineLocations);
    image.Save(imagePath);

    // DO FILE EQUALITY TEST
    string testName = "MelTest";
    string expectedPath = Path.Combine(expectedFolder.FullName, "MelFrequencyScaleTest.EXPECTED.json");
    string resultPath = Path.Combine(outputFolder.FullName, fileStem + "MelFrequencyLinearScaleTestResults.json");
    var expectedTestFile = new FileInfo(expectedPath);
    var resultFile = new FileInfo(resultPath);
    Acoustics.Shared.Csv.Csv.WriteMatrixToCsv(resultFile, freqScale.GridLineLocations);
    TestTools.FileEqualityTest(testName, resultFile, expectedTestFile);

    LoggedConsole.WriteLine("Completed Mel Frequency Scale test");
    Console.WriteLine("\n\n");
}
/// <summary>
/// Initializes a new instance of the <see cref="FrequencyScale"/> class.
/// CONSTRUCTOR
/// Calling this constructor assumes either Linear or Mel is required but not Octave.
/// Any type other than Mel is set up as a linear scale.
/// </summary>
/// <param name="type">The requested scale type.</param>
/// <param name="nyquist">Stored as <c>Nyquist</c>; also the linear bound of a linear scale.</param>
/// <param name="frameSize">Stored as <c>WindowSize</c>.</param>
/// <param name="finalBinCount">Stored as <c>FinalBinCount</c>.</param>
/// <param name="hertzGridInterval">Stored as <c>HertzGridInterval</c>.</param>
public FrequencyScale(FreqScaleType type, int nyquist, int frameSize, int finalBinCount, int hertzGridInterval)
{
    // parameters common to both scale types
    this.ScaleType = type;
    this.Nyquist = nyquist;
    this.WindowSize = frameSize;
    this.FinalBinCount = finalBinCount;
    this.HertzGridInterval = hertzGridInterval;

    if (type != FreqScaleType.Mel)
    {
        // linear is the default
        this.BinBounds = this.GetLinearBinBounds();
        this.GridLineLocations = GetLinearGridLineLocations(nyquist, this.HertzGridInterval, this.FinalBinCount);
        this.LinearBound = nyquist;
        return;
    }

    // mel scale
    this.BinBounds = this.GetMelBinBounds();
    this.GridLineLocations = GetMelGridLineLocations(this.HertzGridInterval, nyquist, this.FinalBinCount);
    this.LinearBound = 1000;
}
/// <summary>
/// Generates a decibel spectrogram for every non-empty wav file in the (hard-coded) recordings folder.
/// For each recording, the noise-reduced spectrogram is saved as an annotated bitmap in the
/// "Spectrograms" sub-folder and as a csv matrix in the "Matrices" sub-folder of the result folder.
/// </summary>
/// <exception cref="ArgumentException">Thrown when the recordings folder (including sub-folders) contains no files.</exception>
public static void GenerateSpectrograms()
{
    var recordingDir = @"M:\Liz\SupervisedPatchSamplingSet\Recordings\";
    var resultDir = @"M:\Liz\SupervisedPatchSamplingSet\";

    // check whether there is any file in the folder/subfolders
    if (Directory.GetFiles(recordingDir, "*", SearchOption.AllDirectories).Length == 0)
    {
        throw new ArgumentException("The folder of recordings is empty...");
    }

    // Make sure the output folders exist before anything is saved into them.
    // CreateDirectory does nothing when the folder is already there.
    string spectrogramDir = Path.Combine(resultDir, "Spectrograms");
    string matrixDir = Path.Combine(resultDir, "Matrices");
    Directory.CreateDirectory(spectrogramDir);
    Directory.CreateDirectory(matrixDir);

    int frameSize = 1024;
    int finalBinCount = 256;
    FreqScaleType scaleType = FreqScaleType.Mel;
    bool isMelScale = scaleType == FreqScaleType.Mel;

    var settings = new SpectrogramSettings()
    {
        WindowSize = frameSize,

        // the duration of each frame (according to the default value (i.e., 1024) of frame size) is 0.04644 seconds
        // The question is how many single-frames (i.e., patch height is equal to 1) should be selected to form one second
        // The "WindowOverlap" is calculated to answer this question
        // each 24 single-frames duration is equal to 1 second
        // note that the "WindowOverlap" value should be recalculated if frame size is changed
        // this has not yet been considered in the Config file!
        WindowOverlap = 0.10725204,
        DoMelScale = isMelScale,
        MelBinCount = isMelScale ? finalBinCount : frameSize / 2,
        NoiseReductionType = NoiseReductionType.None,
        NoiseReductionParameter = 0.0,
    };

    // Only wav files directly inside the recordings folder are processed.
    foreach (string filePath in Directory.GetFiles(recordingDir, "*.wav"))
    {
        FileInfo fileInfo = filePath.ToFileInfo();

        // process the wav file if it is not empty
        if (fileInfo.Length == 0)
        {
            continue;
        }

        var recording = new AudioRecording(filePath);
        settings.SourceFileName = recording.BaseName;

        var amplitudeSpectrogram = new AmplitudeSpectrogram(settings, recording.WavReader);
        var decibelSpectrogram = new DecibelSpectrogram(amplitudeSpectrogram);

        // DO NOISE REDUCTION
        decibelSpectrogram.Data = PcaWhitening.NoiseReduction(decibelSpectrogram.Data);

        // draw the spectrogram
        var attributes = new SpectrogramAttributes()
        {
            NyquistFrequency = decibelSpectrogram.Attributes.NyquistFrequency,
            Duration = decibelSpectrogram.Attributes.Duration,
        };

        // Dispose each image as soon as it has been saved; otherwise one image per recording
        // stays alive until the garbage collector finalises it.
        string pathToSpectrogramFiles = Path.Combine(spectrogramDir, settings.SourceFileName + ".bmp");
        using (Image image = DecibelSpectrogram.DrawSpectrogramAnnotated(decibelSpectrogram.Data, settings, attributes))
        {
            image.Save(pathToSpectrogramFiles);
        }

        // write the matrix to a csv file
        string pathToMatrixFiles = Path.Combine(matrixDir, settings.SourceFileName + ".csv");
        Csv.WriteMatrixToCsv(pathToMatrixFiles.ToFileInfo(), decibelSpectrogram.Data);
    }
}
/// <summary>
/// Runs spectral peak tracking on a single recording using the settings in the config file.
/// Saves an image of the decibel spectrogram overlaid with the detected local peaks and
/// writes the peak track information to a csv file.
/// </summary>
/// <exception cref="FileNotFoundException">Thrown when no config file could be resolved.</exception>
/// <exception cref="ArgumentException">Thrown when the config file does not exist.</exception>
public void LocalSpectralPeakTest()
{
    var configPath = @"SpectralPeakTrackingConfig.yml";
    var recordingPath = @"SM27 22 Sep 2018 3.30 am.wav";
    var imagePath = @"image_whistle_peaks_1500_3500_100_250_6.bmp";
    var pathToCsvFile = @"PeakTrackInfo_SM27 22 Sep 2018 3.30 am.csv";

    var configFile = configPath.ToFileInfo();

    if (configFile == null)
    {
        throw new FileNotFoundException("No config file argument provided");
    }
    else if (!configFile.Exists)
    {
        throw new ArgumentException($"Config file {configFile.FullName} not found");
    }

    var configuration = ConfigFile.Deserialize<SpectralPeakTrackingConfig>(configFile);

    var recording = new AudioRecording(recordingPath);

    // get the nyquist value from the recording that is already loaded,
    // rather than reading the same audio file a second time
    int nyquist = recording.Nyquist;
    int frameSize = configuration.FrameWidth;
    double frameOverlap = configuration.FrameOverlap;
    int finalBinCount = 512;

    // NOTE(review): this is an integer division, so any fractional Hz per bin is truncated - confirm that is intended.
    var hertzPerFreqBin = nyquist / finalBinCount;

    var spectrogramSettings = new SpectrogramSettings()
    {
        WindowSize = frameSize,
        WindowOverlap = frameOverlap,
        NoiseReductionType = NoiseReductionType.None,
    };

    var sonoConfig = new SonogramConfig()
    {
        WindowSize = frameSize,
        WindowOverlap = frameOverlap,
        NoiseReductionType = NoiseReductionType.None,
    };

    // nyquist * 2 is the sample rate, so this is the time advanced per frame in seconds
    var frameStep = frameSize * (1 - frameOverlap);
    var secondsPerFrame = frameStep / (nyquist * 2);

    // the energy spectrogram feeds the peak tracker; the decibel spectrogram is only used for drawing
    var amplitudeSpectrogram = new AmplitudeSpectrogram(spectrogramSettings, recording.WavReader);
    var energySpectrogram = new EnergySpectrogram(amplitudeSpectrogram);
    var decibelSpectrogram = new SpectrogramStandard(sonoConfig, recording.WavReader);

    var output = SpectralPeakTracking2018.SpectralPeakTracking(energySpectrogram.Data, configuration.SptSettings, hertzPerFreqBin, secondsPerFrame);

    // draw the local peaks
    double[,] hits = SpectralPeakTracking2018.MakeHitMatrix(energySpectrogram.Data, output.TargetPeakBinsIndex, output.BandIndex);
    var image = SpectralPeakTracking2018.DrawSonogram(decibelSpectrogram, hits);
    image.Save(imagePath);

    // write the peak track info as csv; every value (including the last on a line) is followed by a comma
    string[] header = new[] { "Frame No", "Start Time", "Bin No", "Freq", "Score", "Detection" };
    var csv = new StringBuilder();
    foreach (var columnName in header)
    {
        csv.Append(columnName).Append(',');
    }

    csv.AppendLine();

    foreach (var entry in output.peakTrackInfoList)
    {
        foreach (var value in entry)
        {
            csv.Append(value.ToString()).Append(',');
        }

        csv.AppendLine();
    }

    File.WriteAllText(pathToCsvFile, csv.ToString());
}
/// <summary>
/// Apply feature learning process on a set of target (1-minute) recordings (inputPath)
/// according to a set of centroids learned using feature learning process.
/// Output feature vectors (outputPath).
/// For every recording (or sub-segment of it) the similarity vectors are written to the
/// "SimilarityVectors" sub-folder, and one "FeatureVectors-{band}.csv" file is written per frequency band.
/// </summary>
/// <param name="config">The feature learning settings (frame size, frequency bands, patch size, pooling, etc.).</param>
/// <param name="allCentroids">The learned centroids, one set per frequency band.</param>
/// <param name="inputPath">Folder holding the wav recordings; only files directly inside it are processed.</param>
/// <param name="outputPath">Folder that receives the output csv files.</param>
/// <exception cref="ArgumentException">
/// Thrown when the input folder contains no files, or when the configured patch height, step size and
/// max pooling factor leave less than one frame per second for the temporal summarization.
/// </exception>
public static void UnsupervisedFeatureExtraction(FeatureLearningSettings config, List<double[][]> allCentroids, string inputPath, string outputPath)
{
    var simVecDir = Directory.CreateDirectory(Path.Combine(outputPath, "SimilarityVectors"));
    int frameSize = config.FrameSize;
    int finalBinCount = config.FinalBinCount;
    FreqScaleType scaleType = config.FrequencyScaleType;
    bool isMelScale = scaleType == FreqScaleType.Mel;

    var settings = new SpectrogramSettings()
    {
        WindowSize = frameSize,

        // the duration of each frame (according to the default value (i.e., 1024) of frame size) is 0.04644 seconds
        // The question is how many single-frames (i.e., patch height is equal to 1) should be selected to form one second
        // The "WindowOverlap" is calculated to answer this question
        // each 24 single-frames duration is equal to 1 second
        // note that the "WindowOverlap" value should be recalculated if frame size is changed
        // this has not yet been considered in the Config file!
        WindowOverlap = 0.10725204,
        DoMelScale = isMelScale,
        MelBinCount = isMelScale ? finalBinCount : frameSize / 2,
        NoiseReductionType = NoiseReductionType.None,
        NoiseReductionParameter = 0.0,
    };

    double frameStep = frameSize * (1 - settings.WindowOverlap);
    int minFreqBin = config.MinFreqBin;
    int maxFreqBin = config.MaxFreqBin;
    int numFreqBand = config.NumFreqBand;
    int patchWidth = (maxFreqBin - minFreqBin + 1) / numFreqBand;
    int patchHeight = config.PatchHeight;

    // the number of frames that their feature vectors will be concatenated in order to preserve temporal information.
    int frameWindowLength = config.FrameWindowLength;

    // the step size to make a window of frames
    int stepSize = config.StepSize;

    // the factor of downsampling
    int maxPoolingFactor = config.MaxPoolingFactor;

    // check whether there is any file in the folder/subfolders
    if (Directory.GetFiles(inputPath, "*", SearchOption.AllDirectories).Length == 0)
    {
        throw new ArgumentException("The folder of recordings is empty...");
    }

    // Each 24 frames form 1 second using WindowOverlap
    // factors such as stepSize, and maxPoolingFactor should be considered in temporal summarization.
    // With fewer than one frame per second the summarization loop below could never advance,
    // so reject such a configuration up front.
    int numFrames = 24 / (patchHeight * stepSize * maxPoolingFactor);
    if (numFrames < 1)
    {
        throw new ArgumentException("PatchHeight * StepSize * MaxPoolingFactor must be between 1 and 24 to summarize features per second.");
    }

    // +++++++++++++++++++++++++++++++++++Feature Transformation (preparation)
    // to do the feature transformation, we normalize centroids and
    // sequential patches from the input spectrogram to unit length.
    // The centroids do not depend on the recording, so they are normalized once here.
    var normCentroidMatrices = new List<double[,]>(allCentroids.Count);
    foreach (double[][] centroids in allCentroids)
    {
        var normCentroids = new double[centroids.Length][];
        for (int j = 0; j < centroids.Length; j++)
        {
            normCentroids[j] = ART_2A.NormaliseVector(centroids[j]);
        }

        normCentroidMatrices.Add(normCentroids.ToMatrix());
    }

    //*****
    // lists of features for all processing files
    // the key is the file name, and the value is the features for different bands
    var allFilesMinFeatureVectors = new Dictionary<string, List<double[,]>>();
    var allFilesMeanFeatureVectors = new Dictionary<string, List<double[,]>>();
    var allFilesMaxFeatureVectors = new Dictionary<string, List<double[,]>>();
    var allFilesStdFeatureVectors = new Dictionary<string, List<double[,]>>();
    var allFilesSkewnessFeatureVectors = new Dictionary<string, List<double[,]>>();

    foreach (string filePath in Directory.GetFiles(inputPath, "*.wav"))
    {
        FileInfo fileInfo = filePath.ToFileInfo();

        // process the wav file if it is not empty
        if (fileInfo.Length == 0)
        {
            continue;
        }

        var recording = new AudioRecording(filePath);
        settings.SourceFileName = recording.BaseName;

        // The segment list is rebuilt for every file. Sharing one list across files made each
        // unsegmented recording get processed again under the name of every file that followed it.
        List<AudioRecording> recordings;
        if (config.DoSegmentation)
        {
            recordings = PatchSampling.GetSubsegmentsSamples(recording, config.SubsegmentDurationInSeconds, frameStep);
        }
        else
        {
            recordings = new List<AudioRecording> { recording };
        }

        for (int s = 0; s < recordings.Count; s++)
        {
            string segmentName = fileInfo.Name + "-" + s.ToString();
            string pathToSimilarityVectorsFile = Path.Combine(simVecDir.FullName, segmentName + ".csv");
            var amplitudeSpectrogram = new AmplitudeSpectrogram(settings, recordings[s].WavReader);
            var decibelSpectrogram = new DecibelSpectrogram(amplitudeSpectrogram);

            // DO NOISE REDUCTION
            if (config.DoNoiseReduction)
            {
                decibelSpectrogram.Data = PcaWhitening.NoiseReduction(decibelSpectrogram.Data);
            }

            // check whether the full band spectrogram is needed or a matrix with arbitrary freq bins
            double[,] inputMatrix;
            if (minFreqBin != 1 || maxFreqBin != finalBinCount)
            {
                inputMatrix = PatchSampling.GetArbitraryFreqBandMatrix(decibelSpectrogram.Data, minFreqBin, maxFreqBin);
            }
            else
            {
                inputMatrix = decibelSpectrogram.Data;
            }

            // creating matrices from different freq bands of the source spectrogram
            List<double[,]> bandMatrices = PatchSampling.GetFreqBandMatrices(inputMatrix, numFreqBand);
            var allSequentialPatchMatrix = new List<double[,]>();
            foreach (double[,] bandMatrix in bandMatrices)
            {
                // downsampling the input matrix by a factor of n (MaxPoolingFactor) using max pooling
                double[,] downsampledMatrix = FeatureLearning.MaxPooling(bandMatrix, maxPoolingFactor);

                int rows = downsampledMatrix.GetLength(0);
                int columns = downsampledMatrix.GetLength(1);
                var sequentialPatches = PatchSampling.GetPatches(downsampledMatrix, patchWidth, patchHeight, (rows / patchHeight) * (columns / patchWidth), PatchSampling.SamplingMethod.Sequential);
                allSequentialPatchMatrix.Add(sequentialPatches.ToMatrix());
            }

            // +++++++++++++++++++++++++++++++++++Feature Transformation
            // we calculate the dot product of each (unit length) patch with the centroids' matrix
            var allFeatureTransVectors = new List<double[][]>();

            // processing the sequential patch matrix for each band
            for (int band = 0; band < allSequentialPatchMatrix.Count; band++)
            {
                // convert once per band instead of once per patch
                double[][] patches = allSequentialPatchMatrix[band].ToJagged();
                var similarityVectors = new double[patches.Length][];

                for (int j = 0; j < patches.Length; j++)
                {
                    // normalize each patch to unit length
                    var inputVector = patches[j];
                    var normVector = inputVector;

                    // to avoid vectors with NaN values, only normalize those that their norm is not equal to zero.
                    if (inputVector.Euclidean() != 0)
                    {
                        normVector = ART_2A.NormaliseVector(inputVector);
                    }

                    similarityVectors[j] = normCentroidMatrices[band].Dot(normVector);
                }

                // NOTE(review): the file name does not include the band, so each band overwrites the
                // file written by the previous band - confirm that only the last band is wanted.
                Csv.WriteMatrixToCsv(pathToSimilarityVectorsFile.ToFileInfo(), similarityVectors.ToMatrix());

                // To preserve the temporal information, we can concatenate the similarity vectors of a group of frames
                // using FrameWindowLength
                // patchId refers to the patch id that has been processed so far according to the step size.
                // if we want no overlap between different frame windows, then stepSize = frameWindowLength
                var featureTransVectors = new List<double[]>();
                int patchId = 0;
                while (patchId + frameWindowLength - 1 < similarityVectors.Length)
                {
                    var patchGroup = new List<double[]>();
                    for (int k = 0; k < frameWindowLength; k++)
                    {
                        patchGroup.Add(similarityVectors[k + patchId]);
                    }

                    featureTransVectors.Add(DataTools.ConcatenateVectors(patchGroup));
                    patchId = patchId + stepSize;
                }

                allFeatureTransVectors.Add(featureTransVectors.ToArray());
            }

            // +++++++++++++++++++++++++++++++++++Temporal Summarization
            // Each 24 single-frame patches form 1 second
            // for each group of numFrames vectors, we generate 5 vectors of min, mean, std, and max (plus skewness from Accord.net)
            // The pre-assumption is that each input recording is 1 minute long

            // store features of different bands in lists
            var allMinFeatureVectors = new List<double[,]>();
            var allMeanFeatureVectors = new List<double[,]>();
            var allMaxFeatureVectors = new List<double[,]>();
            var allStdFeatureVectors = new List<double[,]>();
            var allSkewnessFeatureVectors = new List<double[,]>();

            foreach (double[][] freqBandFeature in allFeatureTransVectors)
            {
                var minFeatureVectors = new List<double[]>();
                var meanFeatureVectors = new List<double[]>();
                var maxFeatureVectors = new List<double[]>();
                var stdFeatureVectors = new List<double[]>();
                var skewnessFeatureVectors = new List<double[]>();

                int frameCount = freqBandFeature.Length;

                // summarize every complete group of numFrames vectors
                int c = 0;
                while (c + numFrames <= frameCount)
                {
                    AppendWindowStatistics(freqBandFeature, c, numFrames, minFeatureVectors, meanFeatureVectors, maxFeatureVectors, stdFeatureVectors, skewnessFeatureVectors);
                    c += numFrames;
                }

                // when (frameCount % numFrames) != 0, it means there are a number of frames (< numFrames)
                // (or the whole) at the end of the target recording, left unprocessed.
                // this would be problematic when the resolution to generate the feature vector is 1 min,
                // but the length of the target recording is a bit less than one min.
                // A single left-over frame is not summarized.
                int unprocessedFrames = frameCount % numFrames;
                if (unprocessedFrames > 1)
                {
                    AppendWindowStatistics(freqBandFeature, frameCount - unprocessedFrames, unprocessedFrames, minFeatureVectors, meanFeatureVectors, maxFeatureVectors, stdFeatureVectors, skewnessFeatureVectors);
                }

                allMinFeatureVectors.Add(minFeatureVectors.ToArray().ToMatrix());
                allMeanFeatureVectors.Add(meanFeatureVectors.ToArray().ToMatrix());
                allMaxFeatureVectors.Add(maxFeatureVectors.ToArray().ToMatrix());
                allStdFeatureVectors.Add(stdFeatureVectors.ToArray().ToMatrix());
                allSkewnessFeatureVectors.Add(skewnessFeatureVectors.ToArray().ToMatrix());
            }

            //*****
            // the keys of the following dictionaries contain file name
            // and their values are a list<double[,]> which the list.count is
            // the number of freq bands defined as an user-defined parameter.
            // the 2D-array is the feature vectors.
            allFilesMinFeatureVectors.Add(segmentName, allMinFeatureVectors);
            allFilesMeanFeatureVectors.Add(segmentName, allMeanFeatureVectors);
            allFilesMaxFeatureVectors.Add(segmentName, allMaxFeatureVectors);
            allFilesStdFeatureVectors.Add(segmentName, allStdFeatureVectors);
            allFilesSkewnessFeatureVectors.Add(segmentName, allSkewnessFeatureVectors);
        }
    }

    // ++++++++++++++++++++++++++++++++++Writing features to one file
    // First, concatenate min, mean, max, std and skewness for each second.
    // Then, write the features of each pre-defined frequency band into a separate CSV file.
    var filesName = allFilesMeanFeatureVectors.Keys.ToArray();
    var minFeatures = allFilesMinFeatureVectors.Values.ToArray();
    var meanFeatures = allFilesMeanFeatureVectors.Values.ToArray();
    var maxFeatures = allFilesMaxFeatureVectors.Values.ToArray();
    var stdFeatures = allFilesStdFeatureVectors.Values.ToArray();
    var skewnessFeatures = allFilesSkewnessFeatureVectors.Values.ToArray();

    // The number of elements in the list shows the number of freq bands
    // the size of each element in the list shows the number of files processed to generate feature for.
    // the dimensions of the matrix shows the number of feature vectors generated for each file and the length of feature vector
    var allMins = new List<double[][,]>();
    var allMeans = new List<double[][,]>();
    var allMaxs = new List<double[][,]>();
    var allStds = new List<double[][,]>();
    var allSkewness = new List<double[][,]>();

    // looping over freq bands
    for (int i = 0; i < meanFeatures[0].Count; i++)
    {
        var mins = new List<double[,]>();
        var means = new List<double[,]>();
        var maxs = new List<double[,]>();
        var stds = new List<double[,]>();
        var skewnesses = new List<double[,]>();

        // looping over all files
        for (int k = 0; k < meanFeatures.Length; k++)
        {
            mins.Add(minFeatures[k][i]);
            means.Add(meanFeatures[k][i]);
            maxs.Add(maxFeatures[k][i]);
            stds.Add(stdFeatures[k][i]);
            skewnesses.Add(skewnessFeatures[k][i]);
        }

        allMins.Add(mins.ToArray());
        allMeans.Add(means.ToArray());
        allMaxs.Add(maxs.ToArray());
        allStds.Add(stds.ToArray());
        allSkewness.Add(skewnesses.ToArray());
    }

    // looping over the number of freq bands
    for (int i = 0; i < allMeans.Count; i++)
    {
        // creating output feature file based on the number of freq bands
        var outputFeatureFile = Path.Combine(outputPath, "FeatureVectors-" + i.ToString() + ".csv");

        // creating the header for CSV file
        var header = new List<string> { "file name" };
        for (int j = 0; j < allMins[i][0].GetLength(1); j++)
        {
            header.Add("min" + j.ToString());
        }

        for (int j = 0; j < allMeans[i][0].GetLength(1); j++)
        {
            header.Add("mean" + j.ToString());
        }

        for (int j = 0; j < allMaxs[i][0].GetLength(1); j++)
        {
            header.Add("max" + j.ToString());
        }

        for (int j = 0; j < allStds[i][0].GetLength(1); j++)
        {
            header.Add("std" + j.ToString());
        }

        for (int j = 0; j < allSkewness[i][0].GetLength(1); j++)
        {
            header.Add("skewness" + j.ToString());
        }

        // every value in the file (including the last on a line) is followed by a comma
        var csv = new StringBuilder();
        csv.AppendLine(string.Join(",", header) + ",");

        // looping over files: one line per file, holding all of its feature vectors one after another
        for (int j = 0; j < allMeans[i].Length; j++)
        {
            // convert once per file instead of once per feature vector
            double[][] minRows = allMins[i][j].ToJagged();
            double[][] meanRows = allMeans[i][j].ToJagged();
            double[][] maxRows = allMaxs[i][j].ToJagged();
            double[][] stdRows = allStds[i][j].ToJagged();
            double[][] skewnessRows = allSkewness[i][j].ToJagged();

            csv.Append(filesName[j]).Append(',');

            for (int k = 0; k < meanRows.Length; k++)
            {
                // concatenating min, mean, max, std and skewness vectors together for the pre-defined resolution
                var featureList = new List<double[]>
                {
                    minRows[k],
                    meanRows[k],
                    maxRows[k],
                    stdRows[k],
                    skewnessRows[k],
                };
                double[] featureVector = DataTools.ConcatenateVectors(featureList);

                foreach (double value in featureVector)
                {
                    csv.Append(value.ToString()).Append(',');
                }
            }

            csv.AppendLine();
        }

        // writing feature vectors to CSV file
        File.WriteAllText(outputFeatureFile, csv.ToString());
    }
}

/// <summary>
/// Calculates the element-wise min, mean, max, standard deviation and skewness over a run of
/// feature vectors and appends one summary vector to each of the supplied lists.
/// </summary>
private static void AppendWindowStatistics(double[][] frames, int start, int count, List<double[]> mins, List<double[]> means, List<double[]> maxs, List<double[]> stds, List<double[]> skewnesses)
{
    // First, collect the frames that make up the required resolution (1 second, 60 second, etc.)
    var window = new double[count][];
    Array.Copy(frames, start, window, 0, count);
    double[,] sequencesOfFrames = window.ToMatrix();

    int vectorLength = sequencesOfFrames.GetLength(1);
    int frameCount = sequencesOfFrames.GetLength(0);
    var min = new double[vectorLength];
    var mean = new double[vectorLength];
    var max = new double[vectorLength];
    var std = new double[vectorLength];
    var skewness = new double[vectorLength];

    // Second, calculate the statistics of the vectors element-wise
    for (int j = 0; j < vectorLength; j++)
    {
        var column = new double[frameCount];
        for (int k = 0; k < frameCount; k++)
        {
            column[k] = sequencesOfFrames[k, j];
        }

        min[j] = column.GetMinValue();
        mean[j] = AutoAndCrossCorrelation.GetAverage(column);
        std[j] = AutoAndCrossCorrelation.GetStdev(column);
        max[j] = column.GetMaxValue();
        skewness[j] = column.Skewness();
    }

    mins.Add(min);
    means.Add(mean);
    maxs.Add(max);
    stds.Add(std);
    skewnesses.Add(skewness);
}
/// <summary>
/// This method takes an audio recording and returns an octave scale spectrogram.
/// At the present time it only works for recordings with 64000 sample rate and returns a 256 bin sonogram.
/// TODO: generalise this method for other recordings and octave scales.
/// </summary>
/// <param name="recording">The recording to convert.</param>
/// <param name="fst">The octave frequency scale type to convert to.</param>
/// <returns>The noise-reduced sonogram on the requested octave scale.</returns>
public static BaseSonogram ConvertRecordingToOctaveScaleSonogram(AudioRecording recording, FreqScaleType fst)
{
    var octaveScale = new FrequencyScale(fst);
    double overlap = 0.75;

    var config = new SonogramConfig
    {
        WindowSize = octaveScale.WindowSize,
        WindowOverlap = overlap,
        SourceFName = recording.BaseName,
        NoiseReductionType = NoiseReductionType.None,
        NoiseReductionParameter = 0.0,
    };

    // Generate amplitude sonogram and then convert to octave scale
    var result = new AmplitudeSonogram(config, recording.WavReader);

    // THIS IS THE CRITICAL LINE.
    // TODO: SHOULD DEVELOP A SEPARATE UNIT TEST for this method
    result.Data = ConvertAmplitudeSpectrogramToDecibelOctaveScale(result.Data, octaveScale);

    // DO NOISE REDUCTION
    var noiseReduced = SNR.NoiseReduce_Standard(result.Data);
    result.Data = noiseReduced;

    // Update the configuration to match the converted data:
    // the window size becomes twice the final bin count of the octave scale.
    int octaveWindowSize = octaveScale.FinalBinCount * 2;
    double octaveWindowStep = Math.Round(octaveWindowSize * (1 - overlap));
    result.Configuration.WindowSize = octaveWindowSize;
    result.Configuration.WindowStep = (int)octaveWindowStep;
    return result;
}
/// <summary>
/// Returns the grid line locations for the given octave scale type.
/// Each row holds the location of a grid line in column 0 and its Hz value in column 1.
/// </summary>
/// <param name="ost">The octave scale type.</param>
/// <param name="octaveBinBounds">Not used by this method.</param>
/// <returns>
/// The grid line table; a zero-filled table for scales without grid data;
/// null when the scale type is not an octave scale.
/// </returns>
public static int[,] GetGridLineLocations(FreqScaleType ost, int[,] octaveBinBounds)
{
    switch (ost)
    {
        case FreqScaleType.Linear62Octaves7Tones31Nyquist11025:
        case FreqScaleType.Octaves24Nyquist32000:
            // no grid data available: return an empty (all zero) table
            LoggedConsole.WriteErrorLine("This Octave Scale does not currently have grid data provided.");
            return new int[8, 2];

        case FreqScaleType.Linear125Octaves6Tones30Nyquist11025:
            return new int[,]
            {
                // { location, Hz value }
                { 46, 125 },
                { 79, 250 },
                { 111, 500 },
                { 143, 1000 },
                { 175, 2000 },
                { 207, 4000 },
                { 239, 8000 },
            };

        case FreqScaleType.Linear125Octaves7Tones28Nyquist32000:
            return new int[,]
            {
                // { location, Hz value }
                { 34, 125 },
                { 62, 250 },
                { 89, 500 },
                { 117, 1000 },
                { 145, 2000 },
                { 173, 4000 },
                { 201, 8000 },
                { 229, 16000 },
                { 256, 32000 },
            };

        default:
            LoggedConsole.WriteErrorLine("Not a valid Octave Scale.");
            return null;
    }
}
/// <summary>
/// Builds a log power-spectral-density (PSD) matrix from the recordings in a hard-coded training folder
/// and writes it out as CSV and as annotated images, before and after noise reduction.
/// For every non-empty top-level .wav file an energy spectrogram is computed and its column averages
/// become one row of the PSD matrix.
/// NOTE: all paths are machine-specific, so this only runs where that folder layout exists.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when the input folder (including subfolders) holds no files, or holds no .wav file at its top level.
/// </exception>
public void PowerSpectrumDensityTest()
{
    var inputPath = @"C:\Users\kholghim\Mahnoosh\Liz\TrainSet\";
    const string outputFolder = @"C:\Users\kholghim\Mahnoosh\Liz\PowerSpectrumDensity\";
    var resultPsdPath = outputFolder + "train_LogPSD.bmp";
    var resultNoiseReducedPsdPath = outputFolder + "train_LogPSD_NoiseReduced.bmp";

    // check whether there is any file in the folder/subfolders
    if (Directory.GetFiles(inputPath, "*", SearchOption.AllDirectories).Length == 0)
    {
        throw new ArgumentException("The folder of recordings is empty...");
    }

    // Only top-level .wav files are processed. List them once and fail with a clear message when there
    // are none, instead of an IndexOutOfRangeException from indexing an empty array.
    string[] wavFiles = Directory.GetFiles(inputPath, "*.wav");
    if (wavFiles.Length == 0)
    {
        throw new ArgumentException("The folder of recordings contains no .wav files...");
    }

    // get the nyquist value from the first wav file in the folder of recordings
    int nyquist = new AudioRecording(wavFiles[0]).Nyquist;
    int frameSize = 1024;
    FreqScaleType scaleType = FreqScaleType.Linear;

    var settings = new SpectrogramSettings()
    {
        WindowSize = frameSize,
        WindowOverlap = 0.1028,
        MelBinCount = 256,
        DoMelScale = scaleType == FreqScaleType.Mel,
        NoiseReductionType = NoiseReductionType.None,
        NoiseReductionParameter = 0.0,
    };

    var attributes = new SpectrogramAttributes()
    {
        NyquistFrequency = nyquist,
        Duration = TimeSpan.FromMinutes(1440),
    };

    // one averaged spectrum per recording; normalisation and mel conversion are deliberately skipped
    List<double[]> psd = new List<double[]>();
    foreach (string filePath in wavFiles)
    {
        // process the wav file only if it is not empty
        if (filePath.ToFileInfo().Length == 0)
        {
            continue;
        }

        var recording = new AudioRecording(filePath);
        settings.SourceFileName = recording.BaseName;
        var spectrogram = new EnergySpectrogram(settings, recording.WavReader);
        psd.Add(MatrixTools.GetColumnAverages(spectrogram.Data));
    }

    // calculate the log of the PSD matrix, then write it to CSV and draw it
    var psdMatrix = psd.ToArray().ToMatrix();
    var logPsd = MatrixTools.Matrix2LogValues(psdMatrix);
    Csv.WriteMatrixToCsv(new FileInfo(outputFolder + "logPsd.csv"), logPsd);
    var image = DecibelSpectrogram.DrawSpectrogramAnnotated(logPsd, settings, attributes);
    image.Save(resultPsdPath);

    // Noise-reduced version. The CSV previously received logPsd again, so the file named
    // "logPsd_NoiseReduced.csv" never contained the noise-reduced values.
    var noiseReducedLogPsd = PcaWhitening.NoiseReduction(logPsd);
    Csv.WriteMatrixToCsv(new FileInfo(outputFolder + "logPsd_NoiseReduced.csv"), noiseReducedLogPsd);
    var image2 = DecibelSpectrogram.DrawSpectrogramAnnotated(noiseReducedLogPsd, settings, attributes);
    image2.Save(resultNoiseReducedPsdPath);
}
/// <summary>
/// CONSTRUCTION OF Frequency Scales.
/// Fills in the supplied scale from the constants that belong to its <c>ScaleType</c>: octave count,
/// linear bound, Nyquist, window size, final bin count (always 256), tone count, bin bounds and grid-line locations.
/// An unrecognised scale type is reported on the error console and the scale is left untouched.
/// IMPORTANT NOTE: If you are converting Herz scale from LINEAR to OCTAVE, this conversion MUST be done BEFORE noise reduction.
/// WARNING!: Changing the constants for the octave scales will have undefined effects.
/// The options below have been debugged to give what is required.
/// However other values have not been debugged - so user should check the output to ensure it is what is required.
/// </summary>
/// <param name="scale">The scale to populate; its <c>ScaleType</c> selects the constants.</param>
public static void GetOctaveScale(FrequencyScale scale)
{
    const int finalBinCount = 256;
    FreqScaleType scaleType = scale.ScaleType;

    int sampleRate;
    int windowSize;
    int octaveCount;
    int linearBound;
    int nyquist;

    // the number of fractional Hz steps within one octave. Note: a piano octave contains 12 steps.
    int tonesPerOctave;

    if (scaleType == FreqScaleType.Linear62Octaves7Tones31Nyquist11025)
    {
        // split linear-octave scale when sr = 22050
        sampleRate = 22050;
        windowSize = 8192;
        octaveCount = 7;
        tonesPerOctave = 31;
        linearBound = 62;
        nyquist = 11025;
    }
    else if (scaleType == FreqScaleType.Linear125Octaves6Tones30Nyquist11025)
    {
        // split linear-octave scale when sr = 22050
        // NOTE(review): the enum name says Tones30 but 32 is used here; the hard-coded grid lines for this
        // scale are spaced about 32 bins per octave, so the value looks deliberate - confirm before changing.
        sampleRate = 22050;
        windowSize = 8192;
        octaveCount = 6;
        tonesPerOctave = 32;
        linearBound = 125;
        nyquist = 11025;
    }
    else if (scaleType == FreqScaleType.Octaves24Nyquist32000)
    {
        // full octave scale when sr = 64000
        sampleRate = 64000;
        windowSize = 16384;
        octaveCount = 8;
        tonesPerOctave = 24;
        linearBound = 15;
        nyquist = 32000;
    }
    else if (scaleType == FreqScaleType.Linear125Octaves7Tones28Nyquist32000)
    {
        // split linear-octave scale when sr = 64000; window = 2*8192 or 4*4096
        sampleRate = 64000;
        windowSize = 16384;
        octaveCount = 7;
        tonesPerOctave = 28;
        linearBound = 125;
        nyquist = 32000;
    }
    else
    {
        LoggedConsole.WriteErrorLine("WARNING: UNKNOWN OCTAVE SCALE.");
        return;
    }

    // copy the selected constants onto the scale, then derive the bin bounds and grid lines from them
    scale.OctaveCount = octaveCount;
    scale.LinearBound = linearBound;
    scale.Nyquist = nyquist;
    scale.WindowSize = windowSize;
    scale.FinalBinCount = finalBinCount;
    scale.ToneCount = tonesPerOctave;
    scale.BinBounds = LinearToSplitLinearOctaveScale(sampleRate, windowSize, finalBinCount, scale.LinearBound, scale.Nyquist, scale.ToneCount);
    scale.GridLineLocations = GetGridLineLocations(scaleType, scale.BinBounds);
}
/// <summary>
/// Runs the feature-learning pipeline end to end on the "FeatureLearning" test assets and writes the results
/// (centroid CSVs, cluster-size CSVs, cluster images, spectrogram images and feature-vector CSVs) into that folder.
/// Steps:
/// 1. sample random patches from mel spectrograms of every non-empty .wav file in "random_audio_segments";
/// 2. PCA-whiten the patches of each frequency band and cluster them with k-means;
/// 3. project sequential patches of a target recording onto the unit-length centroids;
/// 4. summarise the projections (mean, max, standard deviation) over fixed windows and write them to CSV.
/// The method contains no assertions; it only produces output files.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when the recordings folder (including subfolders) holds no files, or holds no .wav file at its top level.
/// </exception>
public void TestFeatureLearning()
{
    var resultDir = PathHelper.ResolveAssetPath("FeatureLearning");
    var folderPath = Path.Combine(resultDir, "random_audio_segments");
    var outputMelImagePath = Path.Combine(resultDir, "MelScaleSpectrogram.png");
    var outputNormMelImagePath = Path.Combine(resultDir, "NormalizedMelScaleSpectrogram.png");
    var outputNoiseReducedMelImagePath = Path.Combine(resultDir, "NoiseReducedMelSpectrogram.png");

    // +++++++++++++++++++++++++++++++++++ Patch sampling from the random audio segments
    // check whether there is any file in the folder/subfolders
    if (Directory.GetFiles(folderPath, "*", SearchOption.AllDirectories).Length == 0)
    {
        throw new ArgumentException("The folder of recordings is empty...");
    }

    // Only top-level .wav files are processed. List them once and fail with a clear message when there
    // are none, instead of an IndexOutOfRangeException from indexing an empty array.
    string[] wavFiles = Directory.GetFiles(folderPath, "*.wav");
    if (wavFiles.Length == 0)
    {
        throw new ArgumentException("The folder of recordings contains no .wav files...");
    }

    // get the nyquist value from the first wav file in the folder of recordings
    int nyquist = new AudioRecording(wavFiles[0]).Nyquist;
    int frameSize = 1024;
    int finalBinCount = 128;
    int hertzInterval = 1000;
    FreqScaleType scaleType = FreqScaleType.Mel;
    var freqScale = new FrequencyScale(scaleType, nyquist, frameSize, finalBinCount, hertzInterval);
    var fst = freqScale.ScaleType;
    bool doMelScale = scaleType == FreqScaleType.Mel;

    var sonoConfig = new SonogramConfig
    {
        WindowSize = frameSize,

        // chosen so that each 24 frames duration is equal to 1 second
        WindowOverlap = 0.1028,
        DoMelScale = doMelScale,
        MelBinCount = doMelScale ? finalBinCount : frameSize / 2,
        NoiseReductionType = NoiseReductionType.None,
    };

    // Define the minFreqBin and maxFreqBin to be able to work at arbitrary frequency bin bounds.
    // The default value is minFreqBin = 1 and maxFreqBin = finalBinCount.
    // To work with arbitrary frequency bin bounds we need to manually set these two parameters.
    int minFreqBin = 40;
    int maxFreqBin = 80;
    int numFreqBand = 1;
    int patchWidth = (maxFreqBin - minFreqBin + 1) / numFreqBand;
    int patchHeight = 1; // in frames
    int numRandomPatches = 20;
    bool useSubBand = minFreqBin != 1 || maxFreqBin != finalBinCount;

    // one list of random-patch matrices per frequency band, keyed "randomPatch0", "randomPatch1", ...
    Dictionary<string, List<double[,]>> randomPatchLists = new Dictionary<string, List<double[,]>>();
    for (int i = 0; i < numFreqBand; i++)
    {
        randomPatchLists.Add(string.Format("randomPatch{0}", i.ToString()), new List<double[,]>());
    }

    foreach (string filePath in wavFiles)
    {
        // process the wav file only if it is not empty
        if (filePath.ToFileInfo().Length == 0)
        {
            continue;
        }

        var recording = new AudioRecording(filePath);
        sonoConfig.SourceFName = recording.BaseName;
        var sonogram = new SpectrogramStandard(sonoConfig, recording.WavReader);

        // DO RMS NORMALIZATION
        sonogram.Data = SNR.RmsNormalization(sonogram.Data);

        // DO NOISE REDUCTION
        sonogram.Data = PcaWhitening.NoiseReduction(sonogram.Data);

        // use the full band spectrogram, or only the requested range of frequency bins
        double[,] sourceMatrix = useSubBand
            ? PatchSampling.GetArbitraryFreqBandMatrix(sonogram.Data, minFreqBin, maxFreqBin)
            : sonogram.Data;

        // creating matrices from different freq bands of the source spectrogram
        List<double[,]> bandMatrices = PatchSampling.GetFreqBandMatrices(sourceMatrix, numFreqBand);

        // selecting random patches from each freq band matrix and adding them to the corresponding patch list
        for (int band = 0; band < bandMatrices.Count; band++)
        {
            randomPatchLists[$"randomPatch{band.ToString()}"].Add(PatchSampling
                .GetPatches(bandMatrices[band], patchWidth, patchHeight, numRandomPatches, PatchSampling.SamplingMethod.Random).ToMatrix());
        }
    }

    // convert each band's list of random-patch matrices to one matrix
    List<double[,]> randomPatches = new List<double[,]>();
    foreach (string key in randomPatchLists.Keys)
    {
        randomPatches.Add(PatchSampling.ListOf2DArrayToOne2DArray(randomPatchLists[key]));
    }

    // +++++++++++++++++++++++++++++++++++ Clustering, one frequency band at a time
    int numberOfClusters = 50;
    List<double[][]> allBandsCentroids = new List<double[][]>();
    for (int i = 0; i < randomPatches.Count; i++)
    {
        double[,] patchMatrix = randomPatches[i];

        // Apply PCA Whitening
        var whitenedSpectrogram = PcaWhitening.Whitening(true, patchMatrix);

        // Do k-means clustering
        var clusteringOutput = KmeansClustering.Clustering(whitenedSpectrogram.Reversion, numberOfClusters);

        // Writing centroids to a csv file. Csv.WriteToCsv can't write types like Dictionary<int, double[]>,
        // so the dictionary values are converted to a matrix and written with Csv.WriteMatrixToCsv.
        string pathToClusterCsvFile = Path.Combine(resultDir, "ClusterCentroids" + i.ToString() + ".csv");
        double[][] centroids = clusteringOutput.ClusterIdCentroid.Values.ToArray();
        Csv.WriteMatrixToCsv(pathToClusterCsvFile.ToFileInfo(), centroids.ToMatrix());

        // sorting clusters based on size and writing cluster ID and size to a CSV file
        Dictionary<int, double> clusterIdSize = clusteringOutput.ClusterIdSize;
        int[] sortOrder = KmeansClustering.SortClustersBasedOnSize(clusterIdSize);
        string pathToClusterSizeCsvFile = Path.Combine(resultDir, "ClusterSize" + i.ToString() + ".csv");
        Csv.WriteToCsv(pathToClusterSizeCsvFile.ToFileInfo(), clusterIdSize);

        // keep this band's centroids (in dictionary order) for the feature transformation below
        allBandsCentroids.Add(centroids);

        // Draw cluster image directly from clustering output, centroids ordered by cluster size
        List<double[,]> allCentroids = new List<double[,]>();
        for (int k = 0; k < centroids.Length; k++)
        {
            double[,] cent = MatrixTools.ArrayToMatrixByColumn(centroids[sortOrder[k]], patchWidth, patchHeight);

            // normalize each centroid
            double[,] normCent = DataTools.normalise(cent);

            // add a row of zero to each centroid
            allCentroids.Add(PatchSampling.AddRow(normCent));
        }

        // concatenate all centroids and draw them with frequency grid lines
        double[,] mergedCentroidMatrix = PatchSampling.ListOf2DArrayToOne2DArray(allCentroids);
        var clusterImage = ImageTools.DrawMatrixWithoutNormalisation(mergedCentroidMatrix);
        clusterImage.RotateFlip(RotateFlipType.Rotate270FlipNone);
        var outputClusteringImage = Path.Combine(resultDir, "ClustersWithGrid" + i.ToString() + ".bmp");
        FrequencyScale.DrawFrequencyLinesOnImage((Image<Rgb24>)clusterImage, freqScale, includeLabels: false);
        clusterImage.Save(outputClusteringImage);
    }

    // +++++++++++++++++++++++++++++++++++ Processing and generating features for the target recording
    var recording2Path = PathHelper.ResolveAsset("Recordings", "BAC2_20071008-085040.wav");
    var recording2 = new AudioRecording(recording2Path);
    var sonogram2 = new SpectrogramStandard(sonoConfig, recording2.WavReader);

    // DO DRAW SPECTROGRAM
    var image = sonogram2.GetImageFullyAnnotated(sonogram2.GetImage(), "MELSPECTROGRAM: " + fst.ToString(), freqScale.GridLineLocations);
    image.Save(outputMelImagePath);

    // Do RMS normalization
    sonogram2.Data = SNR.RmsNormalization(sonogram2.Data);
    var image2 = sonogram2.GetImageFullyAnnotated(sonogram2.GetImage(), "NORMALISEDMELSPECTROGRAM: " + fst.ToString(), freqScale.GridLineLocations);
    image2.Save(outputNormMelImagePath);

    // NOISE REDUCTION
    sonogram2.Data = PcaWhitening.NoiseReduction(sonogram2.Data);
    var image3 = sonogram2.GetImageFullyAnnotated(sonogram2.GetImage(), "NOISEREDUCEDMELSPECTROGRAM: " + fst.ToString(), freqScale.GridLineLocations);
    image3.Save(outputNoiseReducedMelImagePath);

    // use the full band spectrogram, or only the requested range of frequency bins
    double[,] targetMatrix = useSubBand
        ? PatchSampling.GetArbitraryFreqBandMatrix(sonogram2.Data, minFreqBin, maxFreqBin)
        : sonogram2.Data;

    // extracting sequential patches from each frequency band of the target spectrogram
    List<double[,]> targetBandMatrices = PatchSampling.GetFreqBandMatrices(targetMatrix, numFreqBand);
    List<double[,]> allSequentialPatchMatrix = new List<double[,]>();
    foreach (double[,] bandMatrix in targetBandMatrices)
    {
        int rows = bandMatrix.GetLength(0);
        int columns = bandMatrix.GetLength(1);
        var sequentialPatches = PatchSampling.GetPatches(bandMatrix, patchWidth, patchHeight, (rows / patchHeight) * (columns / patchWidth), PatchSampling.SamplingMethod.Sequential);
        allSequentialPatchMatrix.Add(sequentialPatches.ToMatrix());
    }

    // +++++++++++++++++++++++++++++++++++ Feature Transformation
    // Centroids and sequential patches are normalized to unit length,
    // then each patch is projected (dot product) onto the centroid matrix of its band.
    List<double[][]> allNormCentroids = new List<double[][]>();
    foreach (double[][] bandCentroids in allBandsCentroids)
    {
        double[][] normCentroids = new double[bandCentroids.Length][];
        for (int j = 0; j < bandCentroids.Length; j++)
        {
            normCentroids[j] = ART_2A.NormaliseVector(bandCentroids[j]);
        }

        allNormCentroids.Add(normCentroids);
    }

    List<double[][]> allFeatureTransVectors = new List<double[][]>();
    for (int i = 0; i < allSequentialPatchMatrix.Count; i++)
    {
        // convert once per band; these conversions were previously repeated for every single patch
        double[][] patchRows = allSequentialPatchMatrix[i].ToJagged();
        double[,] centroidMatrix = allNormCentroids[i].ToMatrix();

        double[][] featureTransVectors = new double[patchRows.Length][];
        for (int j = 0; j < patchRows.Length; j++)
        {
            // normalize each patch to unit length
            var normVector = ART_2A.NormaliseVector(patchRows[j]);
            featureTransVectors[j] = centroidMatrix.Dot(normVector);
        }

        allFeatureTransVectors.Add(featureTransVectors);
    }

    // +++++++++++++++++++++++++++++++++++ Temporal Summarization
    // Each 24 single-frame patches form 1 second, so a window of (24 / patchHeight) * 60 patches spans 60 seconds.
    // For each complete window three vectors are generated: element-wise mean, max and standard deviation.
    // The pre-assumption is that each input spectrogram is 1 minute.
    List<double[,]> allMeanFeatureVectors = new List<double[,]>();
    List<double[,]> allMaxFeatureVectors = new List<double[,]>();
    List<double[,]> allStdFeatureVectors = new List<double[,]>();
    int numFrames = (24 / patchHeight) * 60;

    foreach (var freqBandFeature in allFeatureTransVectors)
    {
        // store features of this band, one entry per window
        List<double[]> meanFeatureVectors = new List<double[]>();
        List<double[]> maxFeatureVectors = new List<double[]>();
        List<double[]> stdFeatureVectors = new List<double[]>();

        // "<=" so that a final window that fits exactly is summarised too;
        // the former "<" silently dropped it.
        int c = 0;
        while (c + numFrames <= freqBandFeature.Length)
        {
            // First, gather the patches that belong to this window
            List<double[]> sequencesOfFramesList = new List<double[]>();
            for (int i = c; i < c + numFrames; i++)
            {
                sequencesOfFramesList.Add(freqBandFeature[i]);
            }

            double[,] sequencesOfFrames = sequencesOfFramesList.ToArray().ToMatrix();
            int frameCount = sequencesOfFrames.GetLength(0);
            int featureCount = sequencesOfFrames.GetLength(1);

            // Second, calculate mean, max, and standard deviation of the window element-wise
            double[] mean = new double[featureCount];
            double[] std = new double[featureCount];
            double[] max = new double[featureCount];
            for (int j = 0; j < featureCount; j++)
            {
                double[] temp = new double[frameCount];
                for (int k = 0; k < frameCount; k++)
                {
                    temp[k] = sequencesOfFrames[k, j];
                }

                mean[j] = AutoAndCrossCorrelation.GetAverage(temp);
                std[j] = AutoAndCrossCorrelation.GetStdev(temp);
                max[j] = temp.GetMaxValue();
            }

            meanFeatureVectors.Add(mean);
            maxFeatureVectors.Add(max);
            stdFeatureVectors.Add(std);
            c += numFrames;
        }

        allMeanFeatureVectors.Add(meanFeatureVectors.ToArray().ToMatrix());
        allMaxFeatureVectors.Add(maxFeatureVectors.ToArray().ToMatrix());
        allStdFeatureVectors.Add(stdFeatureVectors.ToArray().ToMatrix());
    }

    // +++++++++++++++++++++++++++++++++++ Writing features to file
    // The features of each pre-defined frequency band go into a separate CSV file.
    for (int j = 0; j < allMeanFeatureVectors.Count; j++)
    {
        var outputFeatureFile = Path.Combine(resultDir, "FeatureVectors" + j.ToString() + ".csv");

        // creating the header for CSV file: all mean columns, then all max columns, then all std columns
        List<string> header = new List<string>();
        for (int i = 0; i < allMeanFeatureVectors[j].GetLength(1); i++)
        {
            header.Add("mean" + i.ToString());
        }

        for (int i = 0; i < allMaxFeatureVectors[j].GetLength(1); i++)
        {
            header.Add("max" + i.ToString());
        }

        for (int i = 0; i < allStdFeatureVectors[j].GetLength(1); i++)
        {
            header.Add("std" + i.ToString());
        }

        // concatenating mean, max, and std vectors together for each window (same order as the header)
        double[][] meanRows = allMeanFeatureVectors[j].ToJagged();
        double[][] maxRows = allMaxFeatureVectors[j].ToJagged();
        double[][] stdRows = allStdFeatureVectors[j].ToJagged();
        List<double[]> featureVectors = new List<double[]>();
        for (int i = 0; i < meanRows.Length; i++)
        {
            List<double[]> featureList = new List<double[]> { meanRows[i], maxRows[i], stdRows[i] };
            featureVectors.Add(DataTools.ConcatenateVectors(featureList));
        }

        // writing feature vectors to CSV file; every value (including the last of a row) is followed by a comma
        using (StreamWriter file = new StreamWriter(outputFeatureFile))
        {
            foreach (var entry in header)
            {
                file.Write(entry + ",");
            }

            file.Write(Environment.NewLine);

            foreach (var entry in featureVectors)
            {
                foreach (var value in entry)
                {
                    file.Write(value + ",");
                }

                file.Write(Environment.NewLine);
            }
        }
    }
}
/// <summary>
/// Clusters random spectrogram patches with k-means and checks that a target spectrogram rebuilt from the
/// resulting clusters has the same dimensions as the original.
/// Random patches are sampled per frequency band from every non-empty .wav file in the
/// "FeatureLearning/random_audio_segments" asset folder; cluster sizes and cluster images are written to the
/// test output directory, as is the reconstructed spectrogram image.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when the recordings folder (including subfolders) holds no files, or holds no .wav file at its top level.
/// </exception>
public void TestKmeansClustering()
{
    var outputDir = this.outputDirectory;
    var recordingsPath = PathHelper.ResolveAssetPath("FeatureLearning");
    var folderPath = Path.Combine(recordingsPath, "random_audio_segments");
    var outputImagePath = Path.Combine(outputDir.FullName, "ReconstrcutedSpectrogram.png");

    // check whether there is any file in the folder/subfolders
    if (Directory.GetFiles(folderPath, "*", SearchOption.AllDirectories).Length == 0)
    {
        throw new ArgumentException("The folder of recordings is empty. Test will fail!");
    }

    // Only top-level .wav files are processed. List them once and fail with a clear message when there
    // are none, instead of an IndexOutOfRangeException from indexing an empty array.
    string[] wavFiles = Directory.GetFiles(folderPath, "*.wav");
    if (wavFiles.Length == 0)
    {
        throw new ArgumentException("The folder of recordings contains no .wav files. Test will fail!");
    }

    // get the nyquist value from the first wav file in the folder of recordings
    int nyquist = new AudioRecording(wavFiles[0]).Nyquist;
    int frameSize = 1024;
    int finalBinCount = 128;
    int hertzInterval = 1000;
    FreqScaleType scaleType = FreqScaleType.Mel;
    var freqScale = new FrequencyScale(scaleType, nyquist, frameSize, finalBinCount, hertzInterval);
    bool doMelScale = scaleType == FreqScaleType.Mel;

    var sonoConfig = new SonogramConfig
    {
        WindowSize = frameSize,

        // WindowOverlap is set based on the fact that each 24 frames is equal to 1 second
        WindowOverlap = 0.1028,
        DoMelScale = doMelScale,
        MelBinCount = doMelScale ? finalBinCount : frameSize / 2,
        NoiseReductionType = NoiseReductionType.None,
    };

    int numberOfFreqBand = 4;
    int patchWidth = finalBinCount / numberOfFreqBand;
    int patchHeight = 1;
    int numberOfRandomPatches = 20;

    // one list of random-patch matrices per frequency band, keyed "randomPatch0", "randomPatch1", ...
    Dictionary<string, List<double[,]>> randomPatchLists = new Dictionary<string, List<double[,]>>();
    for (int i = 0; i < numberOfFreqBand; i++)
    {
        randomPatchLists.Add(string.Format("randomPatch{0}", i.ToString()), new List<double[,]>());
    }

    foreach (string filePath in wavFiles)
    {
        // process the wav file only if it is not empty
        if (filePath.ToFileInfo().Length == 0)
        {
            continue;
        }

        var recording = new AudioRecording(filePath);
        sonoConfig.SourceFName = recording.BaseName;
        var sonogram = new SpectrogramStandard(sonoConfig, recording.WavReader);

        // DO RMS NORMALIZATION
        sonogram.Data = SNR.RmsNormalization(sonogram.Data);

        // DO NOISE REDUCTION
        sonogram.Data = PcaWhitening.NoiseReduction(sonogram.Data);

        // creating matrices from different freq bands of the source spectrogram
        List<double[,]> bandMatrices = PatchSampling.GetFreqBandMatrices(sonogram.Data, numberOfFreqBand);

        // selecting random patches from each freq band matrix and adding them to the corresponding patch list
        for (int band = 0; band < bandMatrices.Count; band++)
        {
            randomPatchLists[string.Format("randomPatch{0}", band.ToString())].Add(PatchSampling
                .GetPatches(bandMatrices[band], patchWidth, patchHeight, numberOfRandomPatches, PatchSampling.SamplingMethod.Random).ToMatrix());
        }
    }

    // convert each band's list of random-patch matrices to one matrix
    List<double[,]> randomPatches = new List<double[,]>();
    foreach (string key in randomPatchLists.Keys)
    {
        randomPatches.Add(PatchSampling.ListOf2DArrayToOne2DArray(randomPatchLists[key]));
    }

    int numberOfClusters = 32;
    List<KMeansClusterCollection> allClusteringOutput = new List<KMeansClusterCollection>();
    for (int i = 0; i < randomPatches.Count; i++)
    {
        double[,] patchMatrix = randomPatches[i];

        // Do k-means clustering
        var clusteringOutput = KmeansClustering.Clustering(patchMatrix, numberOfClusters);

        // sorting clusters based on size and writing cluster ID and size to a CSV file
        Dictionary<int, double> clusterIdSize = clusteringOutput.ClusterIdSize;
        int[] sortOrder = KmeansClustering.SortClustersBasedOnSize(clusterIdSize);
        string pathToClusterSizeCsvFile = Path.Combine(outputDir.FullName, "ClusterSize" + i.ToString() + ".csv");
        Csv.WriteToCsv(pathToClusterSizeCsvFile.ToFileInfo(), clusterIdSize);

        // centroids in dictionary order; sortOrder is used below to visit them by cluster size
        double[][] centroids = clusteringOutput.ClusterIdCentroid.Values.ToArray();

        // the cluster collection of each band is needed later to reconstruct the target spectrogram
        allClusteringOutput.Add(clusteringOutput.Clusters);

        // Draw cluster image directly from clustering output
        List<double[,]> allCentroids = new List<double[,]>();
        for (int k = 0; k < centroids.Length; k++)
        {
            double[,] centroid = MatrixTools.ArrayToMatrixByColumn(centroids[sortOrder[k]], patchWidth, patchHeight);

            // normalize each centroid
            double[,] normalizedCentroid = DataTools.normalise(centroid);

            // add a row of zero to each centroid
            allCentroids.Add(PatchSampling.AddRow(normalizedCentroid));
        }

        // concatenate all centroids and draw them with frequency grid lines
        double[,] mergedCentroidMatrix = PatchSampling.ListOf2DArrayToOne2DArray(allCentroids);
        var clusterImage = ImageTools.DrawMatrixWithoutNormalisation(mergedCentroidMatrix);
        clusterImage.RotateFlip(RotateFlipType.Rotate270FlipNone);
        var outputClusteringImage = Path.Combine(outputDir.FullName, "ClustersWithGrid" + i.ToString() + ".bmp");
        FrequencyScale.DrawFrequencyLinesOnImage((Bitmap)clusterImage, freqScale, includeLabels: false);
        clusterImage.Save(outputClusteringImage);
    }

    // +++++++++++++++++++++++++++++++++++ Reconstructing a target spectrogram from sequential patches and the cluster centroids
    var recording2Path = PathHelper.ResolveAsset("Recordings", "BAC2_20071008-085040.wav");
    var recording2 = new AudioRecording(recording2Path);
    var sonogram2 = new SpectrogramStandard(sonoConfig, recording2.WavReader);

    // keep the spectrogram as first computed; its dimensions are the expected values asserted at the end
    var targetSpec = sonogram2.Data;

    // Do RMS normalization
    sonogram2.Data = SNR.RmsNormalization(sonogram2.Data);

    // NOISE REDUCTION
    sonogram2.Data = PcaWhitening.NoiseReduction(sonogram2.Data);

    // extracting sequential patches from each frequency band of the target spectrogram
    List<double[,]> targetBandMatrices = PatchSampling.GetFreqBandMatrices(sonogram2.Data, numberOfFreqBand);
    List<double[,]> allSequentialPatchMatrix = new List<double[,]>();
    foreach (double[,] bandMatrix in targetBandMatrices)
    {
        int rows = bandMatrix.GetLength(0);
        int columns = bandMatrix.GetLength(1);
        var sequentialPatches = PatchSampling.GetPatches(bandMatrix, patchWidth, patchHeight, (rows / patchHeight) * (columns / patchWidth), PatchSampling.SamplingMethod.Sequential);
        allSequentialPatchMatrix.Add(sequentialPatches.ToMatrix());
    }

    // rebuild each band from its clusters, then stitch the bands back together
    List<double[,]> convertedSpectrogram = new List<double[,]>();
    int columnPerFreqBand = sonogram2.Data.GetLength(1) / numberOfFreqBand;
    for (int i = 0; i < allSequentialPatchMatrix.Count; i++)
    {
        double[,] reconstructedSpec2 = KmeansClustering.ReconstructSpectrogram(allSequentialPatchMatrix[i], allClusteringOutput[i]);
        convertedSpectrogram.Add(PatchSampling.ConvertPatches(reconstructedSpec2, patchWidth, patchHeight, columnPerFreqBand));
    }

    sonogram2.Data = PatchSampling.ConcatFreqBandMatrices(convertedSpectrogram);

    // DO DRAW SPECTROGRAM
    var reconstructedSpecImage = sonogram2.GetImageFullyAnnotated(sonogram2.GetImage(), "RECONSTRUCTEDSPECTROGRAM: " + freqScale.ScaleType.ToString(), freqScale.GridLineLocations);
    reconstructedSpecImage.Save(outputImagePath, ImageFormat.Png);

    // DO UNIT TESTING
    Assert.AreEqual(targetSpec.GetLength(0), sonogram2.Data.GetLength(0));
    Assert.AreEqual(targetSpec.GetLength(1), sonogram2.Data.GetLength(1));
}
/// <summary>
/// Applies the feature learning process to a set of recordings in an unsupervised manner.
/// For every non-empty "*.wav" file directly inside <paramref name="inputPath"/>, a decibel spectrogram is built,
/// split into <c>config.NumFreqBand</c> frequency bands, max-pooled, and randomly sampled for patches.
/// The patches of each band are then passed through <c>PcaWhitening.Whitening</c> and clustered with k-means.
/// </summary>
/// <param name="config">Feature learning settings (frame size, frequency bins, patch size, number of clusters, ...).</param>
/// <param name="inputPath">Folder that contains the recordings.</param>
/// <returns>One clustering output per frequency band.</returns>
/// <exception cref="ArgumentException">Thrown when the folder and its subfolders contain no files at all.</exception>
public static List<KmeansClustering.Output> UnsupervisedFeatureLearning(FeatureLearningSettings config, string inputPath)
{
    // check whether there is any file in the folder/subfolders
    if (Directory.GetFiles(inputPath, "*", SearchOption.AllDirectories).Length == 0)
    {
        throw new ArgumentException("The folder of recordings is empty...");
    }

    int frameSize = config.FrameSize;
    int finalBinCount = config.FinalBinCount;
    FreqScaleType scaleType = config.FrequencyScaleType;
    bool isMelScale = scaleType == FreqScaleType.Mel;

    var settings = new SpectrogramSettings()
    {
        WindowSize = frameSize,

        // the duration of each frame (according to the default value (i.e., 1024) of frame size) is 0.04644 seconds
        // The question is how many single-frames (i.e., patch height is equal to 1) should be selected to form one second
        // The "WindowOverlap" is calculated to answer this question
        // each 24 single-frames duration is equal to 1 second
        // note that the "WindowOverlap" value should be recalculated if frame size is changed
        // this has not yet been considered in the Config file!
        WindowOverlap = 0.10725204,
        DoMelScale = isMelScale,
        MelBinCount = isMelScale ? finalBinCount : frameSize / 2,
        NoiseReductionType = NoiseReductionType.None,
        NoiseReductionParameter = 0.0,
    };

    double frameStep = frameSize * (1 - settings.WindowOverlap);
    int minFreqBin = config.MinFreqBin;
    int maxFreqBin = config.MaxFreqBin;
    int numFreqBand = config.NumFreqBand;
    int patchWidth = (maxFreqBin - minFreqBin + 1) / numFreqBand;
    int patchHeight = config.PatchHeight;
    int numRandomPatches = config.NumRandomPatches;

    // One list of sampled patch matrices per frequency band, indexed by band number.
    var randomPatchLists = new List<List<double[,]>>();
    for (int band = 0; band < numFreqBand; band++)
    {
        randomPatchLists.Add(new List<double[,]>());
    }

    foreach (string filePath in Directory.GetFiles(inputPath, "*.wav"))
    {
        FileInfo fileInfo = filePath.ToFileInfo();

        // process the wav file only if it is not empty
        if (fileInfo.Length == 0)
        {
            continue;
        }

        var recording = new AudioRecording(filePath);
        settings.SourceFileName = recording.BaseName;

        // The list of recordings to process is rebuilt for every file. Keeping a single list across files
        // (as was done previously) made the non-segmented path re-process all earlier files for each new file.
        List<AudioRecording> recordings;
        if (config.DoSegmentation)
        {
            recordings = PatchSampling.GetSubsegmentsSamples(recording, config.SubsegmentDurationInSeconds, frameStep);
        }
        else
        {
            recordings = new List<AudioRecording> { recording };
        }

        for (int i = 0; i < recordings.Count; i++)
        {
            var amplitudeSpectrogram = new AmplitudeSpectrogram(settings, recordings[i].WavReader);
            var decibelSpectrogram = new DecibelSpectrogram(amplitudeSpectrogram);

            // DO RMS NORMALIZATION
            //sonogram.Data = SNR.RmsNormalization(sonogram.Data);

            if (config.DoNoiseReduction)
            {
                decibelSpectrogram.Data = PcaWhitening.NoiseReduction(decibelSpectrogram.Data);
            }

            // check whether the full band spectrogram is needed or a matrix with arbitrary freq bins
            double[,] inputMatrix;
            if (minFreqBin != 1 || maxFreqBin != finalBinCount)
            {
                inputMatrix = PatchSampling.GetArbitraryFreqBandMatrix(decibelSpectrogram.Data, minFreqBin, maxFreqBin);
            }
            else
            {
                inputMatrix = decibelSpectrogram.Data;
            }

            // creating matrices from different freq bands of the source spectrogram
            List<double[,]> allSubmatrices = PatchSampling.GetFreqBandMatrices(inputMatrix, numFreqBand);

            // selecting random patches from each freq band matrix and add them to the corresponding patch list
            for (int band = 0; band < allSubmatrices.Count; band++)
            {
                // downsampling the input matrix by a factor of n (MaxPoolingFactor) using max pooling
                double[,] downsampledMatrix = MaxPooling(allSubmatrices[band], config.MaxPoolingFactor);

                randomPatchLists[band].Add(
                    PatchSampling
                        .GetPatches(downsampledMatrix, patchWidth, patchHeight, numRandomPatches, PatchSampling.SamplingMethod.Random)
                        .ToMatrix());
            }
        }
    }

    // convert each band's list of random patch matrices to one matrix
    var randomPatches = new List<double[,]>();
    foreach (List<double[,]> bandPatches in randomPatchLists)
    {
        randomPatches.Add(PatchSampling.ListOf2DArrayToOne2DArray(bandPatches));
    }

    int numClusters = config.NumClusters;
    var allClusteringOutput = new List<KmeansClustering.Output>();
    foreach (double[,] patchMatrix in randomPatches)
    {
        // Apply PCA Whitening
        var whitenedSpectrogram = PcaWhitening.Whitening(config.DoWhitening, patchMatrix);

        // Do k-means clustering
        var clusteringOutput = KmeansClustering.Clustering(whitenedSpectrogram.Reversion, numClusters);
        allClusteringOutput.Add(clusteringOutput);
    }

    return allClusteringOutput;
}
/// <summary>
/// This method is called semi-supervised feature learning because one of the clusters is formed using
/// the positive frames manually selected from 1-min recordings.
/// The input to this method is a group of files that contain the call of interest, and
/// a 2D-array that contains the file name, the second number and the corresponding frame numbers in each file.
/// At the moment, this method only handles single-frames as patches (PatchHeight = 1).
/// </summary>
/// <param name="config">Feature learning settings (frame size, frequency bins, patch size, number of clusters, ...).</param>
/// <param name="inputPath">Folder that contains the recordings; only "*.wav" files directly inside it are processed.</param>
/// <param name="frameInfo">
/// One row per annotated second: column 0 = file name, column 1 = second number,
/// column 2 = first positive frame, column 3 = last positive frame (frame numbers are 1-based and inclusive).
/// </param>
/// <returns>
/// One clustering output per frequency band: (NumClusters - 1) clusters learned from random patches plus
/// one cluster (id = NumClusters - 1) built from the positive frames.
/// </returns>
/// <exception cref="ArgumentException">Thrown when the folder and its subfolders contain no files at all.</exception>
public static List<KmeansClustering.Output> SemisupervisedFeatureLearning(FeatureLearningSettings config, string inputPath, string[,] frameInfo)
{
    // The spectrogram settings below are chosen so that 24 single frames span one second.
    const int framesPerSecond = 24;

    // making a dictionary of frame info with (file name, second number) as key, and [start frame, end frame] as value.
    var info = new Dictionary<Tuple<string, int>, int[]>();
    for (int row = 0; row < frameInfo.GetLength(0); row++)
    {
        var key = new Tuple<string, int>(frameInfo[row, 0], Convert.ToInt32(frameInfo[row, 1]));
        var frameBounds = new int[2] { Convert.ToInt32(frameInfo[row, 2]), Convert.ToInt32(frameInfo[row, 3]) };
        info.Add(key, frameBounds);
    }

    // processing the recordings within the input path
    // check whether there is any file in the folder/subfolders
    if (Directory.GetFiles(inputPath, "*", SearchOption.AllDirectories).Length == 0)
    {
        throw new ArgumentException("The folder of recordings is empty...");
    }

    int frameSize = config.FrameSize;
    int finalBinCount = config.FinalBinCount;
    FreqScaleType scaleType = config.FrequencyScaleType;
    bool isMelScale = scaleType == FreqScaleType.Mel;

    var settings = new SpectrogramSettings()
    {
        WindowSize = frameSize,

        // the duration of each frame (according to the default value (i.e., 1024) of frame size) is 0.04644 seconds
        // The question is how many single-frames (i.e., patch height is equal to 1) should be selected to form one second
        // The "WindowOverlap" is calculated to answer this question
        // each 24 single-frames duration is equal to 1 second
        // note that the "WindowOverlap" value should be recalculated if frame size is changed
        // this has not yet been considered in the Config file!
        WindowOverlap = 0.10725204,
        DoMelScale = isMelScale,
        MelBinCount = isMelScale ? finalBinCount : frameSize / 2,
        NoiseReductionType = NoiseReductionType.None,
        NoiseReductionParameter = 0.0,
    };

    double frameStep = frameSize * (1 - settings.WindowOverlap);
    int minFreqBin = config.MinFreqBin;
    int maxFreqBin = config.MaxFreqBin;
    int numFreqBand = config.NumFreqBand;
    int patchWidth = (maxFreqBin - minFreqBin + 1) / numFreqBand;
    int patchHeight = config.PatchHeight;
    int numRandomPatches = config.NumRandomPatches;

    // One list of patch matrices per frequency band, indexed by band number:
    // random patches (segments without the call) and sequential patches (positive frames).
    var randomPatchLists = new List<List<double[,]>>();
    var sequentialPatchLists = new List<List<double[,]>>();
    for (int band = 0; band < numFreqBand; band++)
    {
        randomPatchLists.Add(new List<double[,]>());
        sequentialPatchLists.Add(new List<double[,]>());
    }

    foreach (string filePath in Directory.GetFiles(inputPath, "*.wav"))
    {
        FileInfo fileInfo = filePath.ToFileInfo();

        // process the wav file only if it is not empty
        if (fileInfo.Length == 0)
        {
            continue;
        }

        var recording = new AudioRecording(filePath);
        settings.SourceFileName = recording.BaseName;

        // The list of recordings to process is rebuilt for every file. Keeping a single list across files
        // (as was done previously) made the non-segmented path re-process all earlier files for each new file.
        List<AudioRecording> recordings;
        if (config.DoSegmentation)
        {
            recordings = PatchSampling.GetSubsegmentsSamples(recording, config.SubsegmentDurationInSeconds, frameStep);
        }
        else
        {
            recordings = new List<AudioRecording> { recording };
        }

        for (int i = 0; i < recordings.Count; i++)
        {
            var amplitudeSpectrogram = new AmplitudeSpectrogram(settings, recordings[i].WavReader);
            var decibelSpectrogram = new DecibelSpectrogram(amplitudeSpectrogram);

            if (config.DoNoiseReduction)
            {
                decibelSpectrogram.Data = PcaWhitening.NoiseReduction(decibelSpectrogram.Data);
            }

            // check whether the full band spectrogram is needed or a matrix with arbitrary freq bins
            double[,] inputMatrix;
            if (minFreqBin != 1 || maxFreqBin != finalBinCount)
            {
                inputMatrix = PatchSampling.GetArbitraryFreqBandMatrix(decibelSpectrogram.Data, minFreqBin, maxFreqBin);
            }
            else
            {
                inputMatrix = decibelSpectrogram.Data;
            }

            // creating matrices from different freq bands of the source spectrogram
            List<double[,]> allSubmatrices = PatchSampling.GetFreqBandMatrices(inputMatrix, numFreqBand);

            // check whether the file and the current second (i) has any positive frame;
            // keys are unique, so a direct lookup is equivalent to scanning every entry.
            var positiveFrameNumbers = new List<int>();
            int[] positiveRange;
            if (info.TryGetValue(new Tuple<string, int>(fileInfo.Name, i), out positiveRange))
            {
                for (int j = positiveRange[0]; j <= positiveRange[1]; j++)
                {
                    positiveFrameNumbers.Add(j);
                }
            }

            // every frame of the second that is not positive does not contain a part of the bird call
            var negativeFrameNumbers = new List<int>();
            for (int j = 1; j <= framesPerSecond; j++)
            {
                if (!positiveFrameNumbers.Contains(j))
                {
                    negativeFrameNumbers.Add(j);
                }
            }

            // making two matrices per band, one from positive frames and one from negative frames.
            var allPositiveFramesSubmatrices = new List<double[,]>();
            var allNegativeFramesSubmatrices = new List<double[,]>();

            if (positiveFrameNumbers.Count != 0)
            {
                foreach (var submatrix in allSubmatrices)
                {
                    // convert once per band instead of once per frame
                    var frames = submatrix.ToJagged();

                    var positiveFrames = new List<double[]>();
                    foreach (var number in positiveFrameNumbers)
                    {
                        positiveFrames.Add(frames[number - 1]);
                    }

                    allPositiveFramesSubmatrices.Add(positiveFrames.ToArray().ToMatrix());

                    var negativeFrames = new List<double[]>();
                    foreach (var number in negativeFrameNumbers)
                    {
                        negativeFrames.Add(frames[number - 1]);
                    }

                    // BUG FIX: the negative list used to be filled with the positive frames (copy-paste error).
                    // When every frame is positive there is nothing to convert, so add an empty matrix
                    // to keep exactly one entry per frequency band.
                    allNegativeFramesSubmatrices.Add(
                        negativeFrames.Count != 0
                            ? negativeFrames.ToArray().ToMatrix()
                            : new double[0, submatrix.GetLength(1)]);
                }
            }
            else
            {
                allNegativeFramesSubmatrices = allSubmatrices;
            }

            // selecting patches from each freq band matrix and add them to the corresponding patch list
            for (int band = 0; band < allNegativeFramesSubmatrices.Count; band++)
            {
                if (allPositiveFramesSubmatrices.Count != 0)
                {
                    // the segment contains the call of interest: take every positive frame as a sequential patch
                    // downsampling the input matrix by a factor of n (MaxPoolingFactor) using max pooling
                    double[,] downsampledPositiveMatrix = MaxPooling(allPositiveFramesSubmatrices[band], config.MaxPoolingFactor);
                    int rows = downsampledPositiveMatrix.GetLength(0);
                    int columns = downsampledPositiveMatrix.GetLength(1);

                    sequentialPatchLists[band].Add(
                        PatchSampling
                            .GetPatches(
                                downsampledPositiveMatrix,
                                patchWidth,
                                patchHeight,
                                (rows / patchHeight) * (columns / patchWidth),
                                PatchSampling.SamplingMethod.Sequential)
                            .ToMatrix());
                }
                else
                {
                    // select random patches from those segments that do not contain the call of interest
                    // downsampling the input matrix by a factor of n (MaxPoolingFactor) using max pooling
                    double[,] downsampledNegativeMatrix = MaxPooling(allNegativeFramesSubmatrices[band], config.MaxPoolingFactor);

                    randomPatchLists[band].Add(
                        PatchSampling
                            .GetPatches(downsampledNegativeMatrix, patchWidth, patchHeight, numRandomPatches, PatchSampling.SamplingMethod.Random)
                            .ToMatrix());
                }

                /*
                 * Alternative (not enabled): to also select random patches from the negative frames of the
                 * segments that contain the call of interest, the random sampling above can be replaced with:
                 *
                 * // downsampling the input matrix by a factor of n (MaxPoolingFactor) using max pooling
                 * double[,] downsampledNegativeMatrix = MaxPooling(allNegativeFramesSubmatrices[band], config.MaxPoolingFactor);
                 * if (downsampledNegativeMatrix.GetLength(0) < numRandomPatches)
                 * {
                 *     int numR = downsampledNegativeMatrix.GetLength(0);
                 *     int numC = downsampledNegativeMatrix.GetLength(1);
                 *     randomPatchLists[band].Add(PatchSampling
                 *         .GetPatches(downsampledNegativeMatrix, patchWidth, patchHeight,
                 *         (numR / patchHeight) * (numC / patchWidth),
                 *         PatchSampling.SamplingMethod.Sequential).ToMatrix());
                 * }
                 * else
                 * {
                 *     randomPatchLists[band].Add(PatchSampling
                 *         .GetPatches(downsampledNegativeMatrix, patchWidth, patchHeight, numRandomPatches,
                 *         PatchSampling.SamplingMethod.Random).ToMatrix());
                 * }
                 */
            }
        }
    }

    // convert each band's list of patch matrices to one matrix
    var positivePatches = new List<double[,]>();
    foreach (List<double[,]> bandPatches in sequentialPatchLists)
    {
        positivePatches.Add(PatchSampling.ListOf2DArrayToOne2DArray(bandPatches));
    }

    var randomPatches = new List<double[,]>();
    foreach (List<double[,]> bandPatches in randomPatchLists)
    {
        randomPatches.Add(PatchSampling.ListOf2DArrayToOne2DArray(bandPatches));
    }

    // one cluster is reserved for the positive frames
    int numClusters = config.NumClusters - 1;
    var unsupervisedClusteringOutput = new List<KmeansClustering.Output>();
    var supervisedClusteringOutput = new List<KmeansClustering.Output>();

    // clustering of random patches
    foreach (double[,] patchMatrix in randomPatches)
    {
        // Apply PCA Whitening
        var whitenedSpectrogram = PcaWhitening.Whitening(config.DoWhitening, patchMatrix);

        // Do k-means clustering
        var clusteringOutput = KmeansClustering.Clustering(whitenedSpectrogram.Reversion, numClusters);
        unsupervisedClusteringOutput.Add(clusteringOutput);
    }

    // build one cluster out of positive frames
    foreach (double[,] patchMatrix in positivePatches)
    {
        // Apply PCA Whitening
        var whitenedSpectrogram = PcaWhitening.Whitening(config.DoWhitening, patchMatrix);

        // Do k-means clustering
        // build one cluster from positive patches
        var clusteringOutput = KmeansClustering.Clustering(whitenedSpectrogram.Reversion, 1);
        supervisedClusteringOutput.Add(clusteringOutput);
    }

    // merge the output of two clustering obtained from supervised and unsupervised approaches
    var positiveClusterId = config.NumClusters - 1;
    var positiveCentroids = new List<double[][]>();
    var positiveClusterSize = new List<double[]>();
    foreach (var output in supervisedClusteringOutput)
    {
        positiveCentroids.Add(output.ClusterIdCentroid.Values.ToArray());
        positiveClusterSize.Add(output.ClusterIdSize.Values.ToArray());
    }

    // the unsupervised outputs are extended in place with the positive cluster and returned
    List<KmeansClustering.Output> semisupervisedClusteringOutput = unsupervisedClusteringOutput;
    for (int i = 0; i < semisupervisedClusteringOutput.Count; i++)
    {
        semisupervisedClusteringOutput[i].ClusterIdCentroid.Add(positiveClusterId, positiveCentroids[i][0]);
        semisupervisedClusteringOutput[i].ClusterIdSize.Add(positiveClusterId, positiveClusterSize[i][0]);
    }

    return semisupervisedClusteringOutput;
}
/// <summary>
/// Generic command for testing spectral peak tracking.
/// Reads the config file, loads the recording and builds the amplitude and decibel spectrograms.
/// The peak tracking call and the image output are currently commented out.
/// Input should be only a one-minute wav file.
/// </summary>
/// <param name="arguments">Command arguments; not read by the current body (paths are hard-coded below).</param>
/// <exception cref="FileNotFoundException">Thrown when no config file info could be created.</exception>
/// <exception cref="ArgumentException">Thrown when the config file does not exist.</exception>
public static void Execute(Arguments arguments)
{
    // this is a generic command for testing
    // input should be only one-minute wav file
    // read in the config file
    // pass the config to the algorithm
    // output the results

    var configPath = @"SpectralPeakTrackingConfig.yml";
    var recordingPath = @"";
    var imagePath = @"";

    var configFile = configPath.ToFileInfo();

    if (configFile == null)
    {
        throw new FileNotFoundException("No config file argument provided");
    }
    else if (!configFile.Exists)
    {
        throw new ArgumentException($"Config file {configFile.FullName} not found");
    }

    var configuration = ConfigFile.Deserialize<SpectralPeakTrackingConfig>(configFile);

    var recording = new AudioRecording(recordingPath);

    // get the nyquist value from the recording that is already loaded
    // (previously the file was opened a second time just to read this value)
    int nyquist = recording.Nyquist;
    int frameSize = configuration.FrameWidth;
    double frameOverlap = configuration.FrameOverlap;
    int finalBinCount = 512;
    var hertzPerFreqBin = nyquist / finalBinCount;
    FreqScaleType scaleType = FreqScaleType.Linear;

    var sonoConfig = new SonogramConfig
    {
        WindowSize = frameSize,
        WindowOverlap = frameOverlap,
        DoMelScale = scaleType == FreqScaleType.Mel,
        MelBinCount = (scaleType == FreqScaleType.Mel) ? finalBinCount : frameSize / 2,
        NoiseReductionType = NoiseReductionType.None,
    };

    //var sonogram = new SpectrogramStandard(sonoConfig, recording.WavReader);
    var amplitudeSpectrogram = new AmplitudeSonogram(sonoConfig, recording.WavReader);

    // Broken in merge b7e03070a9cd72ab0632789a3412967a6cc54cd6
    //var energySpectrogram = new EnergySpectrogram(amplitudeSpectrogram);
    var decibelSpectrogram = new SpectrogramStandard(sonoConfig, recording.WavReader);

    // frame step expressed in seconds: step size divided by the sample rate (2 * nyquist)
    double frameStepSize = sonoConfig.GetFrameOffset();
    double stepDuration = frameStepSize / (nyquist * 2);

    // Noise Reduction to be added

    //var output = SpectralPeakTracking2018.SpectralPeakTracking(energySpectrogram.Data, configuration.SptSettings, hertzPerFreqBin);

    // draw the local peaks
    //double[,] hits = SpectralPeakTracking2018.MakeHitMatrix(energySpectrogram.Data, output.TargetPeakBinsIndex, output.BandIndex);
    //var image = SpectralPeakTracking2018.DrawSonogram(decibelSpectrogram, hits);
    //image.Save(imagePath, ImageFormat.Bmp);
}