public string RecognizeByAudioSource(AudioSource audioSource)
{
    string result = null;
    // Fetch the current waveform and spectrum of channel 0.
    audioSource.GetOutputData(playingAudioData, 0);
    audioSource.GetSpectrumData(playingAudioSpectrum, 0, FFTWindow.BlackmanHarris);
    if (audioSource.isPlaying)
    {
        // Gate out silence: sum the spectrum amplitudes and compare against the threshold.
        amplitudeSum = 0.0f;
        for (int i = 0; i < playingAudioSpectrum.Length; ++i)
        {
            amplitudeSum += playingAudioSpectrum[i];
        }
        if (amplitudeSum >= amplitudeThreshold)
        {
            // Smooth the spectrum with a Gaussian filter, then take the largest local peaks as formants.
            MathToolBox.Convolute(playingAudioSpectrum, gaussianFilter, MathToolBox.EPaddleType.Repeat, smoothedAudioSpectrum);
            MathToolBox.FindLocalLargestPeaks(smoothedAudioSpectrum, peakValues, peakPositions);
            // Each spectrum bin spans Nyquist / windowSize Hz; divide as float so the
            // unit is not truncated by integer division.
            frequencyUnit = audioSource.clip.frequency / 2.0f / windowSize;
            for (int i = 0; i < formantArray.Length; ++i)
            {
                formantArray[i] = peakPositions[i] * frequencyUnit;
            }
            // TODO: Recognition by multiple formants
            switch (recognizingLanguage)
            {
                case ERecognizerLanguage.Japanese:
                    currentVowels = vowelsByFormantJP;
                    currentVowelFormantCeilValues = vowelFormantFloorJP;
                    break;
                case ERecognizerLanguage.Chinese:
                    currentVowels = vowelsByFormantCN;
                    currentVowelFormantCeilValues = vowelFormantFloorCN;
                    break;
            }
            // The per-vowel values act as floors: the highest floor exceeded by the
            // averaged formant wins. (result stays null when nothing matches.)
            for (int i = 0; i < currentVowelFormantCeilValues.Length; ++i)
            {
                if (Average(formantArray) > currentVowelFormantCeilValues[i])
                {
                    result = currentVowels[i];
                    //Debug.Log(Average(formantArray));
                }
            }
        }
    }
    return result;
}
public LipSyncRuntimeRecognizer(ERecognizerLanguage recognizingLanguage, int windowSize, float amplitudeThreshold)
{
    this.recognizingLanguage = recognizingLanguage;
    // Clamp the window size to a power of two, as required by the FFT.
    this.windowSize = Mathf.ClosestPowerOfTwo(windowSize);
    this.playingAudioData = new float[this.windowSize];
    this.playingAudioSpectrum = new float[this.windowSize];
    this.amplitudeThreshold = amplitudeThreshold;
    this.gaussianFilter = MathToolBox.GenerateGaussianFilter(FILTER_SIZE, FILTER_DEVIATION_SQUARE);
    this.smoothedAudioSpectrum = new float[this.windowSize];
    this.peakValues = new float[FORMANT_COUNT];
    this.peakPositions = new int[FORMANT_COUNT];
    this.formantArray = new float[FORMANT_COUNT];
}
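// Usage sketch (not part of the original source): a minimal MonoBehaviour that drives the
// runtime recognizer once per frame. The driver class, its field names, and the tuning
// values below are assumptions for illustration; LipSyncRuntimeRecognizer,
// ERecognizerLanguage and RecognizeByAudioSource are the APIs defined above.
using UnityEngine;

public class LipSyncRuntimeDriver : MonoBehaviour
{
    public AudioSource audioSource;              // source being played back and analyzed
    private LipSyncRuntimeRecognizer recognizer;

    void Start()
    {
        // 1024-sample analysis window, amplitude gate of 0.01 (hypothetical tuning values).
        recognizer = new LipSyncRuntimeRecognizer(ERecognizerLanguage.Japanese, 1024, 0.01f);
    }

    void Update()
    {
        // Returns a vowel string such as "a", or null while silent or not playing.
        string vowel = recognizer.RecognizeByAudioSource(audioSource);
        if (vowel != null)
        {
            // Drive blend shapes / visemes from the recognized vowel here.
        }
    }
}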
/// <summary>
/// Convolutes data with filter. The result is written to output, which must not be shorter than data.
/// </summary>
/// <param name="data">Source data array.</param>
/// <param name="filter">Filter kernel array.</param>
/// <param name="paddleType">Padding strategy for samples outside the bounds of data.</param>
/// <param name="output">Array that receives the result. Must not be shorter than data.</param>
public static void Convolute(float[] data, float[] filter, EPaddleType paddleType, float[] output)
{
    int filterMiddlePoint = filter.Length / 2; // integer division already floors
    for (int n = 0; n < data.Length; ++n)
    {
        output[n] = 0.0f;
        for (int m = 0; m < filter.Length; ++m)
        {
            // True convolution: the filter is applied reversed, centered on sample n;
            // out-of-range samples are supplied by GetValueFromArray according to paddleType.
            output[n] += MathToolBox.GetValueFromArray(data, n - filterMiddlePoint + m, paddleType) * filter[filter.Length - m - 1];
        }
    }
}
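// Usage sketch (not part of the original source): smoothing a spectrum with a Gaussian
// kernel via Convolute. The kernel size and squared deviation below are hypothetical
// tuning values, and GenerateGaussianFilter's exact parameters are assumed from the
// constructor calls above; Convolute and EPaddleType are the APIs defined in this file.
public static float[] SmoothSpectrum(float[] spectrum)
{
    float[] smoothed = new float[spectrum.Length];                // output; same length as input
    float[] kernel = MathToolBox.GenerateGaussianFilter(7, 1.0f); // hypothetical size / deviation²
    MathToolBox.Convolute(spectrum, kernel, MathToolBox.EPaddleType.Repeat, smoothed);
    return smoothed;
}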
public LipSyncOfflineRecognizer(ERecognizerLanguage recognizingLanguage, float amplitudeThreshold, int windowSize, int shiftStepSize)
{
    this.recognizingLanguage = recognizingLanguage;
    // Clamp the window size to a power of two, as required by the transform.
    this.windowSize = Mathf.ClosestPowerOfTwo(windowSize);
    this.shiftStepSize = shiftStepSize;
    this.amplitudeThreshold = amplitudeThreshold;
    this.gaussianFilter = MathToolBox.GenerateGaussianFilter(FILTER_SIZE, FILTER_DEVIATION_SQUARE);
    // Use the clamped this.windowSize (not the raw parameter) so the window array
    // always matches the length of the audio buffers it multiplies.
    this.windowArray = MathToolBox.GenerateWindow(this.windowSize, MathToolBox.EWindowType.Hamming);
    this.smoothedAudioSpectrum = new float[this.windowSize];
    this.peakValues = new float[FORMANT_COUNT];
    this.peakPositions = new int[FORMANT_COUNT];
    this.formantArray = new float[FORMANT_COUNT];
}
/// <summary>
/// Recognizes a vowel for every analysis window of the clip, stepping by shiftStepSize samples.
/// </summary>
/// <param name="audioClip">Clip to analyze.</param>
/// <returns>One recognized vowel per step; entries stay null for silent or unmatched windows.</returns>
public string[] RecognizeAllByAudioClip(AudioClip audioClip)
{
    int recognizeSampleCount = Mathf.CeilToInt((float)audioClip.samples / shiftStepSize);
    string[] result = new string[recognizeSampleCount];
    float[] currentAudioData = new float[this.windowSize];
    for (int i = 0; i < recognizeSampleCount; ++i)
    {
        // Note: the final windows may extend past the end of the clip's sample data.
        audioClip.GetData(currentAudioData, i * shiftStepSize);
        // Apply the Hamming window before the transform to reduce spectral leakage.
        for (int j = 0; j < windowSize; ++j)
        {
            currentAudioData[j] *= windowArray[j];
        }
        float[] currentAudioSpectrum = MathToolBox.DiscreteCosineTransform(currentAudioData);
        // Gate out silence: sum the spectrum amplitudes and compare against the threshold.
        amplitudeSum = 0.0f;
        for (int k = 0; k < windowSize; ++k)
        {
            amplitudeSum += currentAudioSpectrum[k];
        }
        if (amplitudeSum >= amplitudeThreshold)
        {
            MathToolBox.Convolute(currentAudioSpectrum, gaussianFilter, MathToolBox.EPaddleType.Repeat, smoothedAudioSpectrum);
            MathToolBox.FindLocalLargestPeaks(smoothedAudioSpectrum, peakValues, peakPositions);
            // Divide as float so the per-bin frequency unit is not truncated by integer division.
            frequencyUnit = audioClip.frequency / 2.0f / windowSize;
            for (int l = 0; l < formantArray.Length; ++l)
            {
                formantArray[l] = peakPositions[l] * frequencyUnit;
            }
            switch (recognizingLanguage)
            {
                case ERecognizerLanguage.Japanese:
                    currentVowels = vowelsByFormantJP;
                    currentVowelFormantCeilValues = vowelFormantFloorJP;
                    break;
                case ERecognizerLanguage.Chinese:
                    currentVowels = vowelsByFormantCN;
                    currentVowelFormantCeilValues = vowelFormantFloorCN;
                    break;
            }
            // The per-vowel values act as floors: the highest floor exceeded by the
            // first formant wins.
            for (int m = 0; m < currentVowelFormantCeilValues.Length; ++m)
            {
                if (formantArray[0] > currentVowelFormantCeilValues[m])
                {
                    result[i] = currentVowels[m];
                }
            }
        }
    }
    return result;
}
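// Usage sketch (not part of the original source): baking per-frame vowels for a whole clip
// offline. The baker class, clip reference and tuning values are illustrative assumptions;
// LipSyncOfflineRecognizer and RecognizeAllByAudioClip are the APIs defined above.
using UnityEngine;

public class LipSyncBaker : MonoBehaviour
{
    public AudioClip clip;

    void Start()
    {
        // 1024-sample window shifted by 512 samples (50% overlap), gate of 0.01 (hypothetical).
        var recognizer = new LipSyncOfflineRecognizer(ERecognizerLanguage.Chinese, 0.01f, 1024, 512);
        string[] vowelFrames = recognizer.RecognizeAllByAudioClip(clip);

        // Each entry covers shiftStepSize / clip.frequency seconds, so the frame for
        // playback time t is roughly vowelFrames[t * clip.frequency / shiftStepSize].
        Debug.Log("Baked " + vowelFrames.Length + " lip-sync frames.");
    }
}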