/// <summary>
/// Builds a Scms model (mean, covariance and inverse covariance) from the
/// MFCC representation of a song.
/// </summary>
/// <param name="mfccs">Matrix holding the MFCC representation of the song.</param>
/// <param name="name">Prefix for the file names of the optional DEBUG dumps.</param>
/// <returns>
/// The populated model, or null when inverting the covariance matrix throws.
/// </returns>
public static Scms GetScms(Comirva.Audio.Util.Maths.Matrix mfccs, string name)
{
    DbgTimer timer = new DbgTimer();
    timer.Start();

    // Mean (the result is read below as a single column with one row per dimension)
    Comirva.Audio.Util.Maths.Matrix meanMatrix = mfccs.Mean(2);
#if DEBUG
    if (Analyzer.DEBUG_INFO_VERBOSE)
    {
        if (Analyzer.DEBUG_OUTPUT_TEXT)
        {
            meanMatrix.WriteText(name + "_mean.txt");
        }
        meanMatrix.DrawMatrixGraph(name + "_mean.png");
    }
#endif

    // Covariance
    Comirva.Audio.Util.Maths.Matrix covariance = mfccs.Cov(meanMatrix);
#if DEBUG
    if (Analyzer.DEBUG_INFO_VERBOSE)
    {
        if (Analyzer.DEBUG_OUTPUT_TEXT)
        {
            covariance.WriteText(name + "_covariance.txt");
        }
        covariance.DrawMatrixGraph(name + "_covariance.png");
    }
#endif

    // Inverse covariance; any exception raised by the inversion is reported
    // as a singular matrix and aborts the computation.
    Comirva.Audio.Util.Maths.Matrix inverseCovariance;
    try
    {
        inverseCovariance = covariance.InverseGausJordan();
    }
    catch (Exception)
    {
        Dbg.WriteLine("MatrixSingularException - Scms failed!");
        return null;
    }
#if DEBUG
    if (Analyzer.DEBUG_INFO_VERBOSE)
    {
        if (Analyzer.DEBUG_OUTPUT_TEXT)
        {
            inverseCovariance.WriteAscii(name + "_inverse_covariance.ascii");
        }
        inverseCovariance.DrawMatrixGraph(name + "_inverse_covariance.png");
    }
#endif

    // Pack the mean vector and the upper triangles (diagonal included) of
    // both matrices, row by row, into the flat arrays of the model.
    int dim = meanMatrix.Rows;
    Scms model = new Scms(dim);
    int packedIndex = 0;
    for (int row = 0; row < dim; row++)
    {
        model.mean[row] = (float) meanMatrix.MatrixData[row][0];
        for (int col = row; col < dim; col++, packedIndex++)
        {
            model.cov[packedIndex] = (float) covariance.MatrixData[row][col];
            model.icov[packedIndex] = (float) inverseCovariance.MatrixData[row][col];
        }
    }

    Dbg.WriteLine("Compute Scms - Execution Time: {0} ms", timer.Stop().TotalMilliseconds);
    return model;
}
/// <summary>
/// Computes the perceptual hash of an audio file as a bitstring using the mfcc matrix.
/// </summary>
/// <param name="mfcc">mfcc Matrix</param>
/// <returns>
/// A 'binary string' (aka bitstring, e.g. 001010111011100010) on which a
/// hamming distance is easy to compute. It holds one character per matrix
/// cell outside the first row and the first column, in row-major order.
/// </returns>
private static string GetBitString(Comirva.Audio.Util.Maths.Matrix mfcc)
{
    int rows = mfcc.Rows;
    int columns = mfcc.Columns;

    // Compute the average value over the whole matrix, excluding the very
    // first term ([0][0]) since the DC coefficient can be significantly
    // different from the other values and would throw off the average.
    double total = 0;
    for (int x = 0; x < rows; x++)
    {
        for (int y = 0; y < columns; y++)
        {
            total += mfcc.MatrixData[x][y];
        }
    }
    total -= mfcc.MatrixData[0][0];
    double avg = total / (double)((rows * columns) - 1);

    // Reduce every remaining value to a single bit: "1" when it lies above
    // the average, "0" otherwise. The result only captures the very rough
    // relative scale of each value compared to the mean.
    //
    // NOTE(review): the average above drops only the [0][0] term, whereas the
    // loops below skip the entire first row and first column (the original
    // test was "x != 0 && y != 0"). This looks unintended, but it is kept
    // as-is because changing it would alter the length and content of every
    // hash this method produces - confirm before touching.
    //
    // A StringBuilder replaces repeated string concatenation, which copied
    // the whole hash again for every appended bit.
    System.Text.StringBuilder hash = new System.Text.StringBuilder();
    for (int x = 1; x < rows; x++)
    {
        for (int y = 1; y < columns; y++)
        {
            hash.Append(mfcc.MatrixData[x][y] > avg ? '1' : '0');
        }
    }
    return hash.ToString();
}
/// <summary>
/// Compresses the log spectrogram matrix (haar wavelet or DCT), turns the result into a
/// Statistical Cluster Model Similarity (Scms) audio feature and adds it to the database.
/// </summary>
/// <param name="logSpectrogramMatrix">log spectrogram matrix</param>
/// <param name="param">work unit parameters; FileName, DurationInMs and PathToAudioFile are read from it</param>
/// <param name="db">database the audio feature is added to</param>
/// <param name="trackId">track id to insert</param>
/// <param name="doOutputDebugInfo">decide whether to output debug images (default value can be set)</param>
/// <param name="useHaarWavelet">true to use haar wavelet compression, false to use DCT compression</param>
/// <returns>true if the feature was computed and added; false if either step failed</returns>
private static bool AnalyseAndAddScmsUsingLogSpectrogram(Comirva.Audio.Util.Maths.Matrix logSpectrogramMatrix, WorkUnitParameterObject param, Db db, int trackId, bool doOutputDebugInfo=DEFAULT_DEBUG_INFO, bool useHaarWavelet = true)
{
    string baseName = param.FileName;
    Comirva.Audio.Util.Maths.Matrix compressed;

    if (!useHaarWavelet)
    {
        // DCT transform.
        // It seems the Mirage way of applying the DCT is slightly faster than
        // the Comirva way due to less loops.
        compressed = mfccMirage.ApplyDCT(ref logSpectrogramMatrix);

#if DEBUG
        if (DEBUG_INFO_VERBOSE)
        {
            if (DEBUG_OUTPUT_TEXT)
            {
                compressed.WriteAscii(baseName + "_mfccdata.ascii");
            }
        }
#endif

        if (doOutputDebugInfo)
        {
            compressed.DrawMatrixImageLogValues(baseName + "_mfccdata.png", true);
        }

#if DEBUG
        if (DEBUG_DO_INVERSE_TESTS)
        {
            // Round-trip check: try to invert the DCT again.
            Comirva.Audio.Util.Maths.Matrix inverseDct = mfccMirage.InverseDCT(ref compressed);
            if (DEBUG_OUTPUT_TEXT)
            {
                inverseDct.WriteCSV(baseName + "_stftdata_inverse_mfcc.csv", ";");
            }
            inverseDct.DrawMatrixImageLogValues(baseName + "_specgramlog_inverse_mfcc.png", true);
        }
#endif
    }
    else
    {
        // Wavelet transform.
        int lastHeight;
        int lastWidth;
        compressed = mfccMirage.ApplyWaveletCompression(ref logSpectrogramMatrix, out lastHeight, out lastWidth);

#if DEBUG
        if (DEBUG_INFO_VERBOSE)
        {
            if (DEBUG_OUTPUT_TEXT)
            {
                compressed.WriteAscii(baseName + "_waveletdata.ascii");
            }
        }
#endif

        if (doOutputDebugInfo)
        {
            compressed.DrawMatrixImageLogValues(baseName + "_waveletdata.png", true);
        }

#if DEBUG
        if (DEBUG_DO_INVERSE_TESTS)
        {
            // Round-trip check: try to invert the wavelet transform again.
            Comirva.Audio.Util.Maths.Matrix inverseWavelet = mfccMirage.InverseWaveletCompression(ref compressed, lastHeight, lastWidth, logSpectrogramMatrix.Rows, logSpectrogramMatrix.Columns);
            if (DEBUG_OUTPUT_TEXT)
            {
                inverseWavelet.WriteCSV(baseName + "_specgramlog_inverse_wavelet.csv", ";");
            }
            inverseWavelet.DrawMatrixImageLogValues(baseName + "_specgramlog_inverse_wavelet.png", true);
        }
#endif
    }

    // Store in a Statistical Cluster Model Similarity class,
    // i.e. a Gaussian representation of a song.
    Scms audioFeature = Scms.GetScms(compressed, baseName);
    if (audioFeature == null)
    {
        Console.Out.WriteLine("Error! Could not compute the Scms for '{0}'!", baseName);
        return false;
    }

    // Keep the feature image only when debug output was requested.
    if (doOutputDebugInfo)
    {
        audioFeature.Image = compressed.DrawMatrixImageLogValues(baseName + "_featuredata.png", true, false, 0, 0, true);
    }

    // Bitstring hash, duration and (full path) name travel with the feature.
    audioFeature.BitString = GetBitString(compressed);
    audioFeature.Duration = (long) param.DurationInMs;
    audioFeature.Name = param.PathToAudioFile;

    // AddTrack takes the id by reference, so hand it a copy of trackId.
    int id = trackId;
    bool added = db.AddTrack(ref id, audioFeature) != -1;
    if (!added)
    {
        Console.Out.WriteLine("Failed! Could not add audio feature to database ({0})!", baseName);
    }
    return added;
}
/// <summary>
/// Reconstructs one window of audio from a single spectrogram column using
/// the Lomont table FFT and overlap-adds it into <paramref name="signal"/>.
/// </summary>
/// <param name="m">spectrogram matrix</param>
/// <param name="column">index of the column (window) to process</param>
/// <param name="signal">output signal; samples are accumulated starting at hopsize * column</param>
/// <param name="winsize">window size; the result is divided by its square root</param>
/// <param name="hopsize">distance in samples between two consecutive windows</param>
public void ComputeInverseComirvaMatrixUsingLomontTableFFT(Comirva.Audio.Util.Maths.Matrix m, int column, ref double[] signal, int winsize, int hopsize)
{
    double[] halfSpectrum = m.GetColumn(column);

    // Extend the window with its mirror image: the first half is a plain
    // copy, the second half holds the values in reverse order. The slot at
    // index "half" is never written and keeps its default of 0.
    int half = halfSpectrum.Length;
    double[] mirrored = new double[half * 2];
    Array.Copy(halfSpectrum, mirrored, half);
    for (int src = half - 1; src >= 1; src--)
    {
        mirrored[2 * half - src] = halfSpectrum[src];
    }

    double[] complexSignal = FFTUtilsLomont.DoubleToComplexDouble(mirrored);
    lomonFFT.TableFFT(complexSignal, false);

    double[] window = win.GetWindow();

    // Multiply by the window and overlap-add; only the real parts (the even
    // slots of the interleaved array) are used.
    int sampleCount = complexSignal.Length / 2;
    double scale = Math.Sqrt(winsize);
    int offset = hopsize * column;
    for (int n = 0; n < sampleCount; n++)
    {
        double real = complexSignal[2 * n] / scale;

        // Smooth yet another time (this was also done when doing the FFT).
        double windowed = real * window[n];

        // Scale with 2 just because the volume got so much lower when using
        // a second smoothing filter when reconstructing.
        signal[offset + n] = signal[offset + n] + windowed * 2;
    }
}
/// <summary>
/// Reconstructs one window of audio from a single spectrogram column using
/// the Lomont real FFT and overlap-adds it into <paramref name="signal"/>.
/// </summary>
/// <param name="m">spectrogram matrix</param>
/// <param name="column">index of the column (window) to process</param>
/// <param name="signal">output signal; samples are accumulated starting at hopsize * column</param>
/// <param name="winsize">window size; sizes the ifft buffer and scales the result</param>
/// <param name="hopsize">distance in samples between two consecutive windows</param>
public void ComputeInverseComirvaMatrixUsingLomontRealFFT(Comirva.Audio.Util.Maths.Matrix m, int column, ref double[] signal, int winsize, int hopsize)
{
    double[] halfSpectrum = m.GetColumn(column);

    // Extend the window with its mirror image: the first half is a plain
    // copy, the second half holds the values in reverse order. The slot at
    // index "half" is never written and keeps its default of 0.
    int half = halfSpectrum.Length;
    double[] mirrored = new double[half * 2];
    Array.Copy(halfSpectrum, mirrored, half);
    for (int src = half - 1; src >= 1; src--)
    {
        mirrored[2 * half - src] = halfSpectrum[src];
    }

    // The ifft input must contain the FFT values as
    // r0, r(n/2), r1, i1, r2, i2 ...
    // Only the real slots are filled; the imaginary slots stay 0.
    double[] ifft = new double[winsize * 2];
    ifft[0] = mirrored[0];
    ifft[1] = mirrored[winsize / 2];
    for (int k = 1; k < mirrored.Length; k++)
    {
        ifft[2 * k] = mirrored[k];
    }

    lomonFFT.RealFFT(ifft, false);

    double[] window = win.GetWindow();

    // Multiply by the window and overlap-add, taking just the even slots.
    int sampleCount = ifft.Length / 2;
    double scale = Math.Sqrt(winsize);
    int offset = hopsize * column;
    for (int n = 0; n < sampleCount; n++)
    {
        double real = ifft[2 * n] / scale;

        // Smooth yet another time (this was also done when doing the FFT).
        double windowed = real * window[n];

        // Scale with 5 just because the volume got so much lower when using
        // a second smoothing filter when reconstructing.
        signal[offset + n] = signal[offset + n] + windowed * 5;
    }
}
/// <summary>
/// Windows the audio at <paramref name="pos"/>, runs the Lomont table FFT on
/// it and writes the scaled magnitudes into one column of the matrix.
/// </summary>
/// <param name="m">matrix receiving the magnitudes</param>
/// <param name="column">index of the column to fill</param>
/// <param name="audiodata">audio samples</param>
/// <param name="pos">position in <paramref name="audiodata"/> handed to the window function</param>
public void ComputeComirvaMatrixUsingLomontTableFFT(ref Comirva.Audio.Util.Maths.Matrix m, int column, float[] audiodata, int pos)
{
    // apply the window method (e.g HammingWindow, HannWindow etc)
    win.Apply(ref data, audiodata, pos);

    double[] complexSignal = FFTUtilsLomont.FloatToComplexDouble(data);
    lomonFFT.TableFFT(complexSignal, true);

    // Each row takes the (re, im) pair starting at slot 4 * row of the
    // interleaved array, i.e. every second complex value, for as long as
    // 2 * row stays below a quarter of the array length.
    int limit = complexSignal.Length / 4;
    for (int row = 0; 2 * row < limit; row++)
    {
        int slot = 4 * row;
        double re = complexSignal[slot];
        double img = complexSignal[slot + 1];
        m.MatrixData[row][column] = Math.Sqrt((re * re + img * img) * complexSignal.Length / 2);
    }
}
/// <summary>
/// Windows the audio at <paramref name="pos"/>, runs the Lomont real FFT on
/// the first half of the windowed buffer and writes the scaled magnitudes
/// into one column of the matrix.
/// </summary>
/// <param name="m">matrix receiving the magnitudes</param>
/// <param name="column">index of the column to fill</param>
/// <param name="audiodata">audio samples</param>
/// <param name="pos">position in <paramref name="audiodata"/> handed to the window function</param>
public void ComputeComirvaMatrixUsingLomontRealFFT(ref Comirva.Audio.Util.Maths.Matrix m, int column, float[] audiodata, int pos)
{
    // apply the window method (e.g HammingWindow, HannWindow etc)
    win.Apply(ref data, audiodata, pos);

    int sampleCount = data.Length / 2;
    double[] fft = new double[sampleCount];
    Array.Copy(data, fft, sampleCount);
    lomonFFT.RealFFT(fft, true);

    // fft now contains the FFT values as
    // r0, r(n/2), r1, i1, r2, i2 ...
    int halfWindow = winsize / 2;
    m.MatrixData[0][column] = Math.Sqrt(fft[0] * fft[0] * winsize);

    // NOTE(review): whenever halfWindow - 1 >= 1 the last iteration of the
    // loop below writes this same row again, so this value does not survive.
    m.MatrixData[halfWindow - 1][column] = Math.Sqrt(fft[1] * fft[1] * winsize);

    // amplitude (or magnitude) is the square root of the power spectrum
    // the magnitude spectrum is abs(fft), i.e. Math.Sqrt(re*re + img*img)
    // use 20*log10(Y) to get dB from amplitude
    // the power spectrum is the magnitude spectrum squared
    // use 10*log10(Y) to get dB from power spectrum
    for (int row = 1; row < halfWindow; row++)
    {
        double re = fft[2 * row];
        double img = fft[2 * row + 1];
        m.MatrixData[row][column] = Math.Sqrt((re * re + img * img) * winsize);
    }
}
/// <summary>
/// Windows the audio at <paramref name="pos"/>, executes the prepared FFTW
/// plan on it and writes the magnitudes into column <paramref name="j"/>.
/// </summary>
/// <param name="m">matrix receiving the magnitudes</param>
/// <param name="j">index of the column to fill</param>
/// <param name="audiodata">audio samples</param>
/// <param name="pos">position in <paramref name="audiodata"/> handed to the window function</param>
public void ComputeComirvaMatrixUsingFftw(ref Comirva.Audio.Util.Maths.Matrix m, int j, float[] audiodata, int pos)
{
    // apply the window method (e.g HammingWindow, HannWindow etc)
    win.Apply(ref data, audiodata, pos);

    // Hand the windowed samples to FFTW, run the plan, fetch the result.
    Marshal.Copy(data, 0, fftwData, fftsize);
    fftwf_execute(fftwPlan);
    Marshal.Copy(fftwData, fft, 0, fftsize);

    // fft now contains the FFT values in a Half Complex format
    // r0, r1, r2, ..., rn/2, i(n+1)/2-1, ..., i2, i1
    // Here, rk is the real part of the kth output, and ik is the imaginary
    // part. (Division by 2 is rounded down.) For a halfcomplex array hc[n],
    // the kth component thus has its real part in hc[k] and its imaginary
    // part in hc[n-k], with the exception of k == 0 or n/2 (the latter only
    // if n is even) - in these two cases, the imaginary part is zero due to
    // symmetries of the real-input DFT, and is not stored.
    m.MatrixData[0][j] = Math.Sqrt(fft[0] * fft[0]);

    // amplitude (or magnitude) is the square root of the power spectrum
    // the magnitude spectrum is abs(fft), i.e. Math.Sqrt(re*re + img*img)
    // use 20*log10(Y) to get dB from amplitude
    // the power spectrum is the magnitude spectrum squared
    // use 10*log10(Y) to get dB from power spectrum
    int rowCount = winsize / 2;
    for (int row = 1; row < rowCount; row++)
    {
        // Row "row" is built from component 2 * row of the half complex array.
        int realIndex = row * 2;
        int imagIndex = fftsize - realIndex;
        m.MatrixData[row][j] = Math.Sqrt((fft[realIndex] * fft[realIndex] + fft[imagIndex] * fft[imagIndex]));
    }

    // Row winsize/2 is deliberately left unwritten.
}
/// <summary>
/// Builds a Scms model from the MFCC representation of a song without
/// inverting the covariance matrix: the inverse covariance slots are filled
/// from a freshly constructed matrix of the same size instead.
/// </summary>
/// <param name="mfccs">Matrix holding the MFCC representation of the song.</param>
/// <param name="name">Prefix for the file names of the optional DEBUG dumps.</param>
/// <returns>The populated model.</returns>
public static Scms GetScmsNoInverse(Comirva.Audio.Util.Maths.Matrix mfccs, string name)
{
    DbgTimer timer = new DbgTimer();
    timer.Start();

    // Mean (the result is read below as a single column with one row per dimension)
    Comirva.Audio.Util.Maths.Matrix meanMatrix = mfccs.Mean(2);
#if DEBUG
    if (Analyzer.DEBUG_INFO_VERBOSE)
    {
        if (Analyzer.DEBUG_OUTPUT_TEXT)
        {
            meanMatrix.WriteText(name + "_mean.txt");
        }
        meanMatrix.DrawMatrixGraph(name + "_mean.png");
    }
#endif

    // Covariance
    Comirva.Audio.Util.Maths.Matrix covariance = mfccs.Cov(meanMatrix);
#if DEBUG
    if (Analyzer.DEBUG_INFO_VERBOSE)
    {
        if (Analyzer.DEBUG_OUTPUT_TEXT)
        {
            covariance.WriteText(name + "_covariance.txt");
        }
        covariance.DrawMatrixGraph(name + "_covariance.png");
    }
#endif

    // Stand-in for the inverse covariance: a new, untouched matrix
    // (presumably all zeros - confirm against the Matrix constructor).
    Comirva.Audio.Util.Maths.Matrix placeholderInverse = new Comirva.Audio.Util.Maths.Matrix(covariance.Rows, covariance.Columns);

    // Pack the mean vector and the upper triangles (diagonal included) of
    // both matrices, row by row, into the flat arrays of the model.
    int dim = meanMatrix.Rows;
    Scms model = new Scms(dim);
    int packedIndex = 0;
    for (int row = 0; row < dim; row++)
    {
        model.mean[row] = (float) meanMatrix.MatrixData[row][0];
        for (int col = row; col < dim; col++, packedIndex++)
        {
            model.cov[packedIndex] = (float) covariance.MatrixData[row][col];
            model.icov[packedIndex] = (float) placeholderInverse.MatrixData[row][col];
        }
    }

    Dbg.WriteLine("GetScmsNoInverse - Execution Time: {0} ms", timer.Stop().TotalMilliseconds);
    return model;
}