        /// <summary>
        /// Recognizes the vowel currently being produced by the given
        /// AudioSource. Returns the vowel string, or null when the source is
        /// not playing or the frame is below the amplitude threshold.
        /// </summary>
        public string RecognizeByAudioSource(AudioSource audioSource)
        {
            string result = null;

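            // Capture the current waveform and FFT spectrum from the source.
            // Only the spectrum is used in this method; playingAudioData is
            // filled but not read here.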
            audioSource.GetOutputData(playingAudioData, 0);
            audioSource.GetSpectrumData(playingAudioSpectrum, 0, FFTWindow.BlackmanHarris);

            if (audioSource.isPlaying)
            {
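                // Cheap loudness gate: sum the spectrum magnitudes and skip
                // recognition when the frame is too quiet to contain a vowel.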
                amplitudeSum = 0.0f;
                for (int i = 0; i < playingAudioSpectrum.Length; ++i)
                {
                    amplitudeSum += playingAudioSpectrum[i];
                }

                if (amplitudeSum >= amplitudeThreshold)
                {
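                    // Smooth the spectrum with a Gaussian kernel so that
                    // FindLocalLargestPeaks picks real formant peaks instead of
                    // noise ripples, then convert peak bin indices to Hz.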
                    MathToolBox.Convolute(playingAudioSpectrum, gaussianFilter, MathToolBox.EPaddleType.Repeat, smoothedAudioSpectrum);
                    MathToolBox.FindLocalLargestPeaks(smoothedAudioSpectrum, peakValues, peakPositions);
                    // Hz per spectrum bin: the spectrum spans 0..Nyquist
                    // (frequency / 2) across windowSize bins; use float division
                    // so the unit is not truncated to an integer.
                    frequencyUnit = audioSource.clip.frequency / 2.0f / windowSize;
                    for (int i = 0; i < formantArray.Length; ++i)
                    {
                        formantArray[i] = peakPositions[i] * frequencyUnit;
                    }

                    // TODO: recognition using multiple formants
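                    // Note: despite the "Ceil" in the name, these arrays hold the
                    // per-vowel formant *floor* (lower-bound) frequencies; the
                    // matching loop below keeps the last vowel whose floor the
                    // measured formant exceeds.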
                    switch (recognizingLanguage)
                    {
                    case ERecognizerLanguage.Japanese:
                        currentVowels = vowelsByFormantJP;
                        currentVowelFormantCeilValues = vowelFormantFloorJP;
                        break;

                    case ERecognizerLanguage.Chinese:
                        currentVowels = vowelsByFormantCN;
                        currentVowelFormantCeilValues = vowelFormantFloorCN;
                        break;
                    }
                    float averageFormant = Average(formantArray);
                    for (int i = 0; i < currentVowelFormantCeilValues.Length; ++i)
                    {
                        if (averageFormant > currentVowelFormantCeilValues[i])
                        {
                            result = currentVowels[i];
                        }
                    }
                }
                // result stays null when the frame is below amplitudeThreshold.
            }
            // result stays null when the source is not playing.

            return result;
        }
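        /// <summary>
        /// Creates a runtime recognizer. The window size is rounded to the
        /// closest power of two so the buffers are valid sizes for
        /// AudioSource.GetSpectrumData.
        /// </summary>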
        public LipSyncRuntimeRecognizer(ERecognizerLanguage recognizingLanguage, int windowSize, float amplitudeThreshold)
        {
            this.recognizingLanguage  = recognizingLanguage;
            this.windowSize           = Mathf.ClosestPowerOfTwo(windowSize);
            this.playingAudioData     = new float[this.windowSize];
            this.playingAudioSpectrum = new float[this.windowSize];
            this.amplitudeThreshold   = amplitudeThreshold;
            this.gaussianFilter       = MathToolBox.GenerateGaussianFilter(FILTER_SIZE, FILTER_DEVIATION_SQUARE);

            this.smoothedAudioSpectrum = new float[this.windowSize];
            this.peakValues            = new float[FORMANT_COUNT];
            this.peakPositions         = new int[FORMANT_COUNT];
            this.formantArray          = new float[FORMANT_COUNT];
        }
        /// <summary>
        /// Convolutes data with filter and writes the result to output.
        /// </summary>
        /// <param name="data">Source data array.</param>
        /// <param name="filter">Filter (kernel) array.</param>
        /// <param name="paddleType">Padding strategy for indices that fall outside data.</param>
        /// <param name="output">Receives the result. Must not be shorter than data.</param>
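        /// <example>
        /// A minimal usage sketch; the array length, kernel size, and deviation
        /// are illustrative values, not values mandated by the library:
        /// <code>
        /// float[] spectrum = new float[1024];
        /// float[] kernel   = MathToolBox.GenerateGaussianFilter(7, 2.0f);
        /// float[] smoothed = new float[spectrum.Length];
        /// MathToolBox.Convolute(spectrum, kernel, MathToolBox.EPaddleType.Repeat, smoothed);
        /// </code>
        /// </example>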
        public static void Convolute(float[] data, float[] filter, EPaddleType paddleType, float[] output)
        {
            int filterMiddlePoint = filter.Length / 2;   // integer division already floors

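            // Slide the flipped filter across the data; GetValueFromArray supplies
            // values for indices that fall outside the array, according to
            // paddleType. Flipping (filter[filter.Length - m - 1]) makes this a
            // true convolution rather than a correlation; for the symmetric
            // Gaussian kernel used by the recognizers the two coincide.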
            for (int n = 0; n < data.Length; ++n)
            {
                output[n] = 0.0f;
                for (int m = 0; m < filter.Length; ++m)
                {
                    output[n] +=
                        MathToolBox.GetValueFromArray(data, n - filterMiddlePoint + m, paddleType) *
                        filter[filter.Length - m - 1];
                }
            }
        }
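        /// <summary>
        /// Creates an offline recognizer that analyzes a clip in (possibly
        /// overlapping) windows. The window size is rounded to the closest power
        /// of two; shiftStepSize is the hop, in samples, between windows.
        /// </summary>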
        public LipSyncOfflineRecognizer(ERecognizerLanguage recognizingLanguage, float amplitudeThreshold, int windowSize, int shiftStepSize)
        {
            this.recognizingLanguage = recognizingLanguage;
            this.windowSize          = Mathf.ClosestPowerOfTwo(windowSize);
            this.shiftStepSize       = shiftStepSize;

            this.amplitudeThreshold = amplitudeThreshold;
            this.gaussianFilter     = MathToolBox.GenerateGaussianFilter(FILTER_SIZE, FILTER_DEVIATION_SQUARE);
            // Use the rounded this.windowSize so the window array matches the analysis buffers.
            this.windowArray        = MathToolBox.GenerateWindow(this.windowSize, MathToolBox.EWindowType.Hamming);

            this.smoothedAudioSpectrum = new float[this.windowSize];
            this.peakValues            = new float[FORMANT_COUNT];
            this.peakPositions         = new int[FORMANT_COUNT];
            this.formantArray          = new float[FORMANT_COUNT];
        }
        /// <summary>
        /// Recognizes a vowel for every analysis window of the given clip,
        /// advancing shiftStepSize samples between windows.
        /// </summary>
        /// <param name="audioClip">The clip to analyze offline.</param>
        /// <returns>One recognized vowel (or null) per analysis window.</returns>
        public string[] RecognizeAllByAudioClip(AudioClip audioClip)
        {
            int recognizeSampleCount = Mathf.CeilToInt((float)(audioClip.samples) / (float)(shiftStepSize));

            string[] result = new string[recognizeSampleCount];

            float[] currentAudioData     = new float[this.windowSize];
            float[] currentAudioSpectrum;   // assigned from DiscreteCosineTransform each iteration

            for (int i = 0; i < recognizeSampleCount; ++i)
            {
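                // Read the next analysis window; consecutive windows overlap
                // whenever shiftStepSize < windowSize. Near the end of the clip
                // the window may extend past the last sample.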
                audioClip.GetData(currentAudioData, i * shiftStepSize);
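                // Taper the samples with the precomputed Hamming window to reduce
                // spectral leakage at the window edges.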
                for (int j = 0; j < windowSize; ++j)
                {
                    currentAudioData[j] *= windowArray[j];
                }
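                // The offline path uses a discrete cosine transform instead of
                // the runtime path's GetSpectrumData FFT; the coefficients are
                // treated as a spectrum below.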
                currentAudioSpectrum = MathToolBox.DiscreteCosineTransform(currentAudioData);

                amplitudeSum = 0.0f;
                for (int k = 0; k < windowSize; ++k)
                {
                    amplitudeSum += currentAudioSpectrum[k];
                }

                if (amplitudeSum >= amplitudeThreshold)
                {
                    MathToolBox.Convolute(currentAudioSpectrum, gaussianFilter, MathToolBox.EPaddleType.Repeat, smoothedAudioSpectrum);
                    MathToolBox.FindLocalLargestPeaks(smoothedAudioSpectrum, peakValues, peakPositions);
                    // Float division, as in RecognizeByAudioSource, to avoid
                    // truncating the per-bin frequency unit.
                    frequencyUnit = audioClip.frequency / 2.0f / windowSize;
                    for (int l = 0; l < formantArray.Length; ++l)
                    {
                        formantArray[l] = peakPositions[l] * frequencyUnit;
                    }

                    switch (recognizingLanguage)
                    {
                    case ERecognizerLanguage.Japanese:
                        currentVowels = vowelsByFormantJP;
                        currentVowelFormantCeilValues = vowelFormantFloorJP;
                        break;

                    case ERecognizerLanguage.Chinese:
                        currentVowels = vowelsByFormantCN;
                        currentVowelFormantCeilValues = vowelFormantFloorCN;
                        break;
                    }
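                    // Unlike RecognizeByAudioSource, which averages all candidate
                    // formants, the offline path compares only the first formant
                    // against the per-vowel floors; the last vowel whose floor is
                    // exceeded wins.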
                    for (int m = 0; m < currentVowelFormantCeilValues.Length; ++m)
                    {
                        if (formantArray[0] > currentVowelFormantCeilValues[m])
                        {
                            result[i] = currentVowels[m];
                        }
                    }
                }
                else
                {
                    result[i] = null;
                }
            }

            return result;
        }