/// <summary>
/// Computes a log-frequency spectrogram for <paramref name="audioSamples"/> and returns it
/// cut into spectral images.
/// </summary>
/// <remarks>
/// One windowed FFT frame is taken every <c>configuration.Overlap</c> samples. Each frame is
/// handed to <c>ExtractLogBins</c> together with a shared buffer of
/// <c>frameCount * configuration.LogBins</c> floats. When <c>configuration.Verbosity</c> is
/// <c>Verbosity.Verbose</c>, a spectrogram PNG and two CSV dumps are written below
/// <c>SoundFingerprinter.DEBUG_DIRECTORY_PATH</c>, named after <c>audioSamples.Origin</c>.
/// </remarks>
/// <param name="audioSamples">Supplies the sample buffer, the sample rate and the origin name.</param>
/// <param name="configuration">Supplies WdftSize, Overlap, LogBins, Window and Verbosity.</param>
/// <returns>
/// The list produced by <c>CutLogarithmizedSpectrum</c> after it has been passed through
/// <c>ScaleFullSpectrum</c>, or an empty list when fewer than one frame fits into the input.
/// </returns>
public List<SpectralImage> CreateLogSpectrogram(AudioSamples audioSamples, SpectrogramConfig configuration)
{
    using (new DebugTimer("CreateLogSpectrogram()"))
    {
        int wdftSize = configuration.WdftSize;
        int frameCount = (audioSamples.Samples.Length - wdftSize) / configuration.Overlap;

        // Not enough samples for even a single FFT frame.
        if (frameCount < 1)
        {
            return new List<SpectralImage>();
        }

        float[] logFrames = new float[frameCount * configuration.LogBins];
        ushort[] logFrequenciesIndexes = logUtility.GenerateLogFrequenciesRanges(audioSamples.SampleRate, configuration);
        float[] window = configuration.Window.GetWindow(wdftSize);
        float[] samples = audioSamples.Samples;

        // The FFT is the managed lomontFFT; the unsafe variant introduced by the original
        // author was reverted because it caused issues on the maintainer's machines.
        //
        // NOTE: keep this a plain sequential loop. With Parallel.For the result was observed
        // to differ from run to run in Release mode.
        for (int frame = 0; frame < frameCount; frame++)
        {
            int offset = frame * configuration.Overlap;
            var fftBuffer = CopyAndWindow(samples, offset, window);
            lomontFFT.RealFFT(fftBuffer, true);

            // After the Lomont RealFFT the buffer holds the FFT values laid out as
            // r0, r(n/2), r1, i1, r2, i2 ...
            // The log-bin extraction only uses lowBound indexes above 2, so the two "special"
            // leading values (r0 and r(n/2)) can be ignored.
            // See https://github.com/perivar/FindSimilar/blob/6b658b1c54d1504136e25e933f39b7c303da5d9e/Mirage/Fft.cs
            ExtractLogBins(fftBuffer, logFrequenciesIndexes, configuration.LogBins, wdftSize, logFrames, frame);
        }

        // Debug output: the raw log spectrogram as an image and as CSV.
        if (configuration.Verbosity == Verbosity.Verbose)
        {
            string originName = Path.GetFileNameWithoutExtension(audioSamples.Origin);

            var imageService = new FindSimilarImageService();
            using (Image image = imageService.GetSpectrogramImage(logFrames, frameCount, configuration.LogBins, frameCount, configuration.LogBins))
            {
                string imagePath = Path.Combine(SoundFingerprinter.DEBUG_DIRECTORY_PATH, originName + "_spectrogram.png");
                image.Save(imagePath, ImageFormat.Png);
            }

            string framesPath = Path.Combine(SoundFingerprinter.DEBUG_DIRECTORY_PATH, originName + "_frames.csv");
            WriteOutputUtils.WriteCSV(logFrames, framesPath);
        }

        var spectralImages = CutLogarithmizedSpectrum(logFrames, audioSamples.SampleRate, configuration);

        // Debug output: the cut (not yet scaled) spectral images, one entry per image.
        if (configuration.Verbosity == Verbosity.Verbose && spectralImages.Count > 0)
        {
            var imageRows = new float[spectralImages.Count][];
            int row = 0;
            foreach (var spectralImage in spectralImages)
            {
                imageRows[row++] = spectralImage.Image;
            }

            string imagesPath = Path.Combine(
                SoundFingerprinter.DEBUG_DIRECTORY_PATH,
                Path.GetFileNameWithoutExtension(audioSamples.Origin) + "_spectral_images.csv");
            WriteOutputUtils.WriteCSV(imageRows, imagesPath, ";");
        }

        ScaleFullSpectrum(spectralImages, configuration);
        return spectralImages;
    }
}
/// <summary>
/// Runs the fingerprinting pipeline on <paramref name="samples"/>: scale the samples,
/// zero pad if needed, build the log spectrogram, derive fingerprints and hash them.
/// </summary>
/// <remarks>
/// The scaled (and possibly zero-padded) buffer is written back to <c>samples.Samples</c>,
/// so the caller's <c>AudioSamples</c> instance is modified by this call.
/// When <c>configuration.SpectrogramConfig.Verbosity</c> is <c>Verbosity.Verbose</c>, each
/// intermediate stage is dumped below <c>SoundFingerprinter.DEBUG_DIRECTORY_PATH</c>, named
/// after <c>samples.Origin</c>.
/// </remarks>
/// <param name="samples">Audio to fingerprint; its sample buffer is replaced by the scaled one.</param>
/// <param name="configuration">Fingerprint settings, including the nested spectrogram settings.</param>
/// <returns>The hashed fingerprints returned by <c>HashFingerprints</c>.</returns>
public List<HashedFingerprint> CreateFingerprints(AudioSamples samples, FingerprintConfiguration configuration)
{
    // Explode the samples towards the range of 16 bit shorts (-32,768 to 32,767).
    // Matlab multiplies with 2^15 (32768), e.g.
    //   if( max(abs(speech))<=1 ), speech = speech * 2^15; end;
    // but per the original author's note 32768 still makes a lot of mfcc feature
    // computations fail, hence 65536.
    const int SampleGain = 65536;

    // Upper bound on how many items are rendered into each debug image.
    const int MaxPreviewItems = 5;

    var spectrogramConfig = configuration.SpectrogramConfig;

    float[] scaled = samples.Samples;
    MathUtils.Multiply(ref scaled, SampleGain);

    // Zero pad when the audio is too short to perform an FFT.
    int minimumLength = spectrogramConfig.WdftSize + spectrogramConfig.Overlap;
    if (scaled.Length < minimumLength)
    {
        Array.Resize(ref scaled, minimumLength);
    }

    samples.Samples = scaled;

    if (spectrogramConfig.Verbosity == Verbosity.Verbose)
    {
        string audioCsvPath = Path.Combine(
            SoundFingerprinter.DEBUG_DIRECTORY_PATH,
            Path.GetFileNameWithoutExtension(samples.Origin) + "_audiodata.csv");
        WriteOutputUtils.WriteCSV(scaled, audioCsvPath);
    }

    // Stage 1: log spectrogram, cut into spectral images.
    var spectralImages = spectrumService.CreateLogSpectrogram(samples, spectrogramConfig);

    if (spectrogramConfig.Verbosity == Verbosity.Verbose && spectralImages.Count > 0)
    {
        int shownImages = spectralImages.Count;
        if (shownImages > MaxPreviewItems)
        {
            shownImages = MaxPreviewItems;
        }

        var imageService = new FindSimilarImageService();
        using (Image image = imageService.GetLogSpectralImages(spectralImages, shownImages))
        {
            string imagePath = Path.Combine(
                SoundFingerprinter.DEBUG_DIRECTORY_PATH,
                Path.GetFileNameWithoutExtension(samples.Origin) + "_spectral_images.png");
            image.Save(imagePath, ImageFormat.Png);
        }
    }

    // Stage 2: fingerprints from the spectral images.
    var fingerprints = CreateFingerprintsFromLogSpectrum(spectralImages, configuration);

    if (spectrogramConfig.Verbosity == Verbosity.Verbose && fingerprints.Count > 0)
    {
        int shownFingerprints = fingerprints.Count;
        if (shownFingerprints > MaxPreviewItems)
        {
            shownFingerprints = MaxPreviewItems;
        }

        var imageService = new FindSimilarImageService();
        // NOTE(review): 128 and 32 are presumably the fingerprint width and height - confirm.
        using (Image image = imageService.GetImageForFingerprints(fingerprints, 128, 32, shownFingerprints))
        {
            string imagePath = Path.Combine(
                SoundFingerprinter.DEBUG_DIRECTORY_PATH,
                Path.GetFileNameWithoutExtension(samples.Origin) + "_fingerprints.png");
            image.Save(imagePath, ImageFormat.Png);
        }
    }

    // Stage 3: hash the fingerprints.
    var hashedFingerprints = HashFingerprints(fingerprints, configuration);

    if (spectrogramConfig.Verbosity == Verbosity.Verbose && hashedFingerprints.Count > 0)
    {
        // Collect the hash bins of every fingerprint, one entry per fingerprint, for the CSV dump.
        var hashBinRows = new int[hashedFingerprints.Count][];
        int row = 0;
        foreach (var hashedFingerprint in hashedFingerprints)
        {
            hashBinRows[row++] = hashedFingerprint.HashBins;
        }

        string hashCsvPath = Path.Combine(
            SoundFingerprinter.DEBUG_DIRECTORY_PATH,
            Path.GetFileNameWithoutExtension(samples.Origin) + "_hashbins.csv");
        WriteOutputUtils.WriteCSV(hashBinRows, hashCsvPath, ";");
    }

    return hashedFingerprints;
}