/// <summary>
/// Recognizes the vowel currently carried by <c>playingAudioSpectrum</c> and, if the
/// signal is loud enough, writes the matching vowel string into <paramref name="result"/>.
/// Leaves <paramref name="result"/> untouched when the amplitude is below threshold.
/// </summary>
/// <param name="result">Receives the recognized vowel; unchanged if nothing is recognized.</param>
/// <param name="sampleRate">Sample rate of the audio source, in Hz.</param>
private void Recognize(ref string result, int sampleRate)
{
    amplitudeSum = 0.0f;
    for (int i = 0; i < playingAudioSpectrum.Length; ++i)
    {
        amplitudeSum += playingAudioSpectrum[i];
    }
    // Only attempt recognition when the spectrum carries enough energy to be a vowel.
    if (amplitudeSum >= amplitudeThreshold)
    {
        // Smooth the spectrum before peak picking so spurious local maxima are suppressed.
        MathToolBox.Convolute(playingAudioSpectrum, gaussianFilter, MathToolBox.EPaddleType.Repeat, smoothedAudioSpectrum);
        MathToolBox.FindLocalLargestPeaks(smoothedAudioSpectrum, peakValues, peakPositions);
        // FIX: cast to float — "sampleRate / windowSize" was int/int division, truncating the
        // per-bin frequency resolution (e.g. 44100/1024 -> 43 instead of 43.066...).
        // NOTE(review): RecognizeAllByAudioClip computes frequency / 2 / windowSize here —
        // confirm which bin width is correct for this spectrum source; they should agree.
        frequencyUnit = (float)sampleRate / windowSize;
        // Convert peak bin indices to formant frequencies in Hz.
        for (int i = 0; i < formantArray.Length; ++i)
        {
            formantArray[i] = peakPositions[i] * frequencyUnit;
        }
        // Select the vowel whose threshold is the highest one still below the first formant.
        // (Table is ascending; the last passing entry wins.)
        for (int i = 0; i < currentVowelFormantCeilValues.Length; ++i)
        {
            if (formantArray[0] > currentVowelFormantCeilValues[i])
            {
                result = currentVowels[i];
            }
        }
    }
}
/// <summary>
/// Initializes the recognizer: selects the vowel/formant tables for the given language
/// and allocates the analysis buffers.
/// </summary>
/// <param name="recognizingLanguage">Language whose vowel formant tables should be used.</param>
/// <param name="windowSize">Requested analysis window size; clamped to the closest power of two.</param>
/// <param name="amplitudeThreshold">Minimum summed spectrum amplitude required to attempt recognition.</param>
/// <exception cref="System.ArgumentOutOfRangeException">Thrown for an unsupported language value.</exception>
protected void Init(ERecognizerLanguage recognizingLanguage, int windowSize, float amplitudeThreshold)
{
    switch (recognizingLanguage)
    {
        case ERecognizerLanguage.Japanese:
            currentVowels = vowelsByFormantJP;
            // NOTE(review): field is named "...CeilValues" but is assigned the *floor* table —
            // the comparison in Recognize treats these as lower bounds; confirm intended naming.
            currentVowelFormantCeilValues = vowelFormantFloorJP;
            break;
        case ERecognizerLanguage.Chinese:
            currentVowels = vowelsByFormantCN;
            currentVowelFormantCeilValues = vowelFormantFloorCN;
            break;
        default:
            // FIX: previously an unknown language fell through silently, leaving the vowel
            // tables null and causing a NullReferenceException later during recognition.
            throw new System.ArgumentOutOfRangeException(
                nameof(recognizingLanguage), recognizingLanguage, "Unsupported recognizer language.");
    }
    // FFT-style analysis requires a power-of-two window; honor the closest one.
    this.windowSize = Mathf.ClosestPowerOfTwo(windowSize);
    this.amplitudeThreshold = amplitudeThreshold;
    this.smoothedAudioSpectrum = new float[this.windowSize];
    this.peakValues = new float[FORMANT_COUNT];
    this.peakPositions = new int[FORMANT_COUNT];
    this.formantArray = new float[FORMANT_COUNT];
    this.gaussianFilter = MathToolBox.GenerateGaussianFilter(FILTER_SIZE, FILTER_DEVIATION_SQUARE);
}
/// <summary>
/// Estimates the first two formant frequencies (F1, F2) for each audio segment via
/// LPC root-finding: pre-emphasis, Hamming windowing, LPC estimation, then converting
/// the upper-half-plane roots to frequencies.
/// </summary>
/// <param name="splitting">Audio split into fixed-size segments of <c>window</c> samples each.</param>
/// <returns>One <c>{F1, F2}</c> pair (Hz) per input segment.</returns>
private List<double[]> Formants(List<float[]> splitting)
{
    const float preEmphasisAlpha = 0.67f;
    var ret = new List<double[]>();
    // Hoisted out of the loop: the Hamming window depends only on the window length,
    // not on the segment index, so regenerating it per segment was wasted work.
    float[] w = MathToolBox.GenerateWindow(window, MathToolBox.EWindowType.Hamming);
    for (int i = 0; i < splitting.Count; i++)
    {
        float[] FL = PreEmphasis(splitting[i], preEmphasisAlpha);
        for (int j = 0; j < window; j++)
        {
            FL[j] = FL[j] * w[j];
        }
        // Rule of thumb: LPC order = 2 + sampleRate(kHz).
        var coefficients = Estimate(FL, 2 + fs / 1000);
        // Roots come in conjugate pairs; keep the upper half-plane only so each
        // resonance maps to a single frequency.
        var rts = FindCRoots(coefficients).Where(x => x.imag >= 0.0);
        var frqs = rts.Select(x => x.arg * (fs / (2 * Mathf.PI))).ToList();
        frqs.Sort();
        // frqs[0] is the (near-)DC root; the next two are taken as F1 and F2.
        // NOTE(review): throws if fewer than 3 roots survive the filter — confirm the
        // LPC order above guarantees enough roots for every segment.
        double[] fmts = { frqs[1], frqs[2] };
        ret.Add(fmts);
    }
    return ret;
}
/// <summary>
/// Convolutes <paramref name="data"/> with <paramref name="filter"/>, writing the result
/// to <paramref name="output"/>. Out-of-range reads at the edges are resolved by
/// <paramref name="paddleType"/>. The filter is applied time-reversed (true convolution),
/// centered on each sample.
/// </summary>
/// <param name="data">Source data array.</param>
/// <param name="filter">Filter kernel array.</param>
/// <param name="paddleType">Edge-padding strategy for reads outside the data bounds.</param>
/// <param name="output">Array to store the result. Must not be shorter than data.</param>
public static void Convolute(float[] data, float[] filter, EPaddleType paddleType, float[] output)
{
    // FIX: filter.Length / 2 is integer division and already floors; the previous
    // Mathf.FloorToInt call was a redundant int -> float -> int roundtrip.
    int filterMiddlePoint = filter.Length / 2;
    for (int n = 0; n < data.Length; ++n)
    {
        output[n] = 0.0f;
        for (int m = 0; m < filter.Length; ++m)
        {
            // Reverse the filter (convolution, not correlation); pad via paddleType at the edges.
            output[n] += MathToolBox.GetValueFromArray(data, n - filterMiddlePoint + m, paddleType)
                         * filter[filter.Length - m - 1];
        }
    }
}
/// <summary>
/// Finds the complex roots of the polynomial whose coefficients are given in
/// <paramref name="dpoly"/> (highest degree first; reversed to lowest-first for the solver).
/// </summary>
/// <param name="dpoly">Polynomial coefficients; a polynomial of degree d has d+1 entries.</param>
/// <returns>The d roots as Complex values.</returns>
public Complex[] FindCRoots(IEnumerable<double> dpoly)
{
    // FIX: materialize once — Count() followed by Reverse().ToArray() enumerated the
    // source twice, which is wasteful and unsafe for one-shot sequences.
    double[] coeffs = dpoly.ToArray();
    int len = coeffs.Length;
    // The native solver writes interleaved (re, im) pairs: 2 doubles per root, len-1 roots.
    double[] ret = new double[(len - 1) * 2];
    MathToolBox.poly_roots(len, coeffs.Reverse().ToArray(), ret);
    Complex[] cpx = new Complex[len - 1];
    for (int i = 0; i < len - 1; i++)
    {
        cpx[i] = new Complex(ret[2 * i], ret[2 * i + 1]);
    }
    return cpx;
}
/// <summary>
/// Runs vowel recognition over an entire AudioClip offline, advancing by
/// <c>shiftStepSize</c> samples per frame.
/// </summary>
/// <param name="audioClip">Clip to analyze.</param>
/// <returns>
/// One entry per analysis frame; an entry stays null when that frame's amplitude
/// is below the threshold (no vowel recognized).
/// </returns>
public string[] RecognizeAllByAudioClip(AudioClip audioClip)
{
    int recognizeSampleCount = Mathf.CeilToInt((float)(audioClip.samples) / (float)(shiftStepSize));
    string[] result = new string[recognizeSampleCount];
    float[] currentAudioData = new float[this.windowSize];
    float[] currentAudioSpectrum = new float[this.windowSize];
    for (int i = 0; i < recognizeSampleCount; ++i)
    {
        audioClip.GetData(currentAudioData, i * shiftStepSize);
        // Apply the precomputed (Hamming) window before the transform.
        for (int j = 0; j < windowSize; ++j)
        {
            currentAudioData[j] *= windowArray[j];
        }
        currentAudioSpectrum = MathToolBox.DiscreteCosineTransform(currentAudioData);
        amplitudeSum = 0.0f;
        for (int k = 0; k < windowSize; ++k)
        {
            amplitudeSum += currentAudioSpectrum[k];
        }
        // Skip silent frames; their result entry stays null.
        if (amplitudeSum >= amplitudeThreshold)
        {
            MathToolBox.Convolute(currentAudioSpectrum, gaussianFilter, MathToolBox.EPaddleType.Repeat, smoothedAudioSpectrum);
            MathToolBox.FindLocalLargestPeaks(smoothedAudioSpectrum, peakValues, peakPositions);
            // FIX: use float division — "frequency / 2 / windowSize" was all-integer and
            // truncated the bin width (44100/2/1024 -> 21 instead of 21.53...), skewing
            // every formant frequency below.
            // NOTE(review): the live Recognize path omits the /2 — confirm which bin
            // width matches each spectrum source; they should agree.
            frequencyUnit = audioClip.frequency / 2.0f / windowSize;
            for (int l = 0; l < formantArray.Length; ++l)
            {
                formantArray[l] = peakPositions[l] * frequencyUnit;
            }
            // Highest threshold still below F1 wins (table is ascending).
            for (int m = 0; m < currentVowelFormantCeilValues.Length; ++m)
            {
                if (formantArray[0] > currentVowelFormantCeilValues[m])
                {
                    result[i] = currentVowels[m];
                }
            }
        }
    }
    return result;
}
/// <summary>
/// Estimates formants for each audio segment via LPC (pre-emphasis, Hamming window,
/// coefficient estimation, root-based formant extraction) and appends the results
/// to the info string.
/// </summary>
/// <param name="splitting">Audio split into fixed-size segments of <c>window</c> samples each.</param>
private void Formant(List<float[]> splitting)
{
    const float preEmphasisAlpha = 0.67f;
    var lpc = new LpcModel();
    info = String.Empty;
    // Hoisted: the Hamming window is the same for every segment.
    var w = MathToolBox.GenerateWindow(window, MathToolBox.EWindowType.Hamming);
    for (int i = 0; i < splitting.Count; i++)
    {
        var FL = PreEmphasis(splitting[i], preEmphasisAlpha);
        // FIX: the windowing loop indexed FL[i]/w[i] with the outer segment counter
        // instead of j, so only one sample per segment was (repeatedly) scaled and the
        // rest of the frame was never windowed. Index by j, matching Formants().
        for (int j = 0; j < window; j++)
        {
            FL[j] = FL[j] * w[j];
        }
        // Rule of thumb: LPC order = 2 + sampleRate(kHz).
        var coefficients = lpc.EstimateLpcCoefficients(FL, 2 + fs / 1000);
        var formants = lpc.FindFormants(coefficients, fs);
        AppendInfo(i, formants);
    }
}
/// <summary>
/// Creates an offline lip-sync recognizer for the given language, window size,
/// and hop (shift) size.
/// </summary>
/// <param name="recognizingLanguage">Language whose vowel formant tables should be used.</param>
/// <param name="amplitudeThreshold">Minimum summed spectrum amplitude required to attempt recognition.</param>
/// <param name="windowSize">Requested analysis window size; Init clamps it to the closest power of two.</param>
/// <param name="shiftStepSize">Number of samples to advance between analysis frames.</param>
public LipSyncOfflineRecognizer(ERecognizerLanguage recognizingLanguage, float amplitudeThreshold, int windowSize, int shiftStepSize)
{
    base.Init(recognizingLanguage, windowSize, amplitudeThreshold);
    this.shiftStepSize = shiftStepSize;
    // FIX: generate the window with this.windowSize (already clamped to a power of two
    // by Init), not the raw parameter — otherwise the window array length can disagree
    // with the spectrum buffers and the loops sized by this.windowSize, risking an
    // IndexOutOfRangeException or an unwindowed tail.
    this.windowArray = MathToolBox.GenerateWindow(this.windowSize, MathToolBox.EWindowType.Hamming);
}