/// <summary>
/// Recognizes the vowel currently being played by an AudioSource.
/// Reads the source's output/spectrum into the shared buffers, and if the summed
/// spectrum amplitude passes <c>amplitudeThreshold</c>, smooths the spectrum,
/// extracts formant peak frequencies, and maps the average formant onto the
/// per-language formant threshold table.
/// </summary>
/// <param name="audioSource">The playing source to analyze. Must have a clip assigned.</param>
/// <returns>The recognized vowel string, or null when the source is silent, below
/// the amplitude threshold, or no threshold matched.</returns>
public string RecognizeByAudioSource(AudioSource audioSource)
{
    string result = null;
    // Side effect: these fill the shared playingAudioData / playingAudioSpectrum buffers
    // even when the source is not playing (kept for any external readers of those fields).
    audioSource.GetOutputData(playingAudioData, 0);
    audioSource.GetSpectrumData(playingAudioSpectrum, 0, FFTWindow.BlackmanHarris);
    if (audioSource.isPlaying)
    {
        amplitudeSum = 0.0f;
        for (int i = 0; i < playingAudioSpectrum.Length; ++i)
        {
            amplitudeSum += playingAudioSpectrum[i];
        }
        if (amplitudeSum >= amplitudeThreshold)
        {
            MathToolBox.Convolute(playingAudioSpectrum, gaussianFilter, MathToolBox.EPaddleType.Repeat, smoothedAudioSpectrum);
            MathToolBox.FindLocalLargestPeaks(smoothedAudioSpectrum, peakValues, peakPositions);
            // NOTE(review): if frequencyUnit/windowSize are integral this divides with
            // integer truncation before any float conversion — confirm the field types.
            frequencyUnit = audioSource.clip.frequency / 2 / windowSize;
            for (int i = 0; i < formantArray.Length; ++i)
            {
                formantArray[i] = peakPositions[i] * frequencyUnit;
            }
            // TODO: Recognition by multiple formants instead of the average of all peaks.
            switch (recognizingLanguage)
            {
                case ERecognizerLanguage.Japanese:
                    currentVowels = vowelsByFormantJP;
                    // NOTE(review): "CeilValues" is assigned from a field named "Floor";
                    // the loop below treats them as lower bounds — confirm intended naming.
                    currentVowelFormantCeilValues = vowelFormantFloorJP;
                    break;
                case ERecognizerLanguage.Chinese:
                    currentVowels = vowelsByFormantCN;
                    currentVowelFormantCeilValues = vowelFormantFloorCN;
                    break;
            }
            // Hoisted out of the loop: the average of formantArray does not change
            // between iterations, so compute it once.
            float averageFormant = Average(formantArray);
            // Picks the LAST threshold the average exceeds (thresholds act as floors).
            for (int i = 0; i < currentVowelFormantCeilValues.Length; ++i)
            {
                if (averageFormant > currentVowelFormantCeilValues[i])
                {
                    result = currentVowels[i];
                }
            }
        }
        // else: result stays null — below the amplitude threshold.
    }
    // else: result stays null — source not playing.
    return result;
}
/// <summary>
/// Recognizes a vowel for every analysis window of an entire AudioClip.
/// The clip is read in windows of <c>windowSize</c> samples advanced by
/// <c>shiftStepSize</c>; each window is multiplied by <c>windowArray</c>,
/// transformed with a discrete cosine transform, and mapped onto the
/// per-language formant threshold table.
/// </summary>
/// <param name="audioClip">The clip to analyze offline.</param>
/// <returns>One entry per analysis window: the recognized vowel, or null for
/// windows below <c>amplitudeThreshold</c> (or with no threshold match).</returns>
public string[] RecognizeAllByAudioClip(AudioClip audioClip)
{
    int recognizeSampleCount = Mathf.CeilToInt((float)(audioClip.samples) / (float)(shiftStepSize));
    string[] result = new string[recognizeSampleCount];
    float[] currentAudioData = new float[this.windowSize];
    // DiscreteCosineTransform returns a fresh array each call, so no buffer is
    // pre-allocated here (the original allocated one that was immediately discarded).
    float[] currentAudioSpectrum;
    for (int i = 0; i < recognizeSampleCount; ++i)
    {
        // NOTE(review): for the final window fewer than windowSize samples remain in
        // the clip; confirm GetData's behavior for that partial read.
        audioClip.GetData(currentAudioData, i * shiftStepSize);
        // Apply the analysis window before the transform.
        for (int j = 0; j < windowSize; ++j)
        {
            currentAudioData[j] *= windowArray[j];
        }
        currentAudioSpectrum = MathToolBox.DiscreteCosineTransform(currentAudioData);
        amplitudeSum = 0.0f;
        for (int k = 0; k < windowSize; ++k)
        {
            amplitudeSum += currentAudioSpectrum[k];
        }
        if (amplitudeSum >= amplitudeThreshold)
        {
            MathToolBox.Convolute(currentAudioSpectrum, gaussianFilter, MathToolBox.EPaddleType.Repeat, smoothedAudioSpectrum);
            MathToolBox.FindLocalLargestPeaks(smoothedAudioSpectrum, peakValues, peakPositions);
            // NOTE(review): possible integer truncation if the operands are integral —
            // confirm the frequencyUnit/windowSize field types.
            frequencyUnit = audioClip.frequency / 2 / windowSize;
            for (int l = 0; l < formantArray.Length; ++l)
            {
                formantArray[l] = peakPositions[l] * frequencyUnit;
            }
            switch (recognizingLanguage)
            {
                case ERecognizerLanguage.Japanese:
                    currentVowels = vowelsByFormantJP;
                    currentVowelFormantCeilValues = vowelFormantFloorJP;
                    break;
                case ERecognizerLanguage.Chinese:
                    currentVowels = vowelsByFormantCN;
                    currentVowelFormantCeilValues = vowelFormantFloorCN;
                    break;
            }
            // NOTE(review): this compares only formantArray[0] while the AudioSource
            // overload averages all formants — confirm which is intended; behavior kept as-is.
            // Picks the LAST threshold exceeded (thresholds act as floors).
            for (int m = 0; m < currentVowelFormantCeilValues.Length; ++m)
            {
                if (formantArray[0] > currentVowelFormantCeilValues[m])
                {
                    result[i] = currentVowels[m];
                }
            }
        }
        // else: result[i] stays null (the array default), matching the original's
        // explicit null assignment.
    }
    return result;
}