예제 #1
0
        /// <summary>
        /// Feeds the given text to the engine and lazily yields the utterances produced by the
        /// processing stages that are enabled in the current process mode.
        /// </summary>
        /// <remarks>
        /// The <paramref name="content"/> argument is validated immediately when this method is
        /// called. Everything that touches the engine (the engine null check, loading the text,
        /// resetting and running the stages) is deferred until the returned sequence is enumerated.
        /// </remarks>
        /// <param name="content">Content to be spoken. Must not be null or empty.</param>
        /// <param name="sayas">Sayas used by ESP. When null or empty, the text is loaded without it.</param>
        /// <returns>A lazily evaluated sequence of utterances.</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown at call time when <paramref name="content"/> is null or empty, and during
        /// enumeration when the engine has not been set.
        /// </exception>
        public IEnumerable<SP.TtsUtterance> EspUtterances(string content, string sayas)
        {
            // An iterator body does not run until the first MoveNext, so argument validation
            // lives in this non-iterator wrapper to fail at the call site.
            if (string.IsNullOrEmpty(content))
            {
                throw new ArgumentNullException("content");
            }

            return EspUtterancesIterator(content, sayas);
        }

        /// <summary>
        /// Iterator behind <see cref="EspUtterances"/>: loads the text, resets the enabled stages
        /// and yields one utterance per pass in which every enabled stage reports success.
        /// </summary>
        /// <param name="content">Content to be spoken; already validated by the caller.</param>
        /// <param name="sayas">Sayas used by ESP, may be null or empty.</param>
        /// <returns>The utterances produced by the enabled stages.</returns>
        private IEnumerable<SP.TtsUtterance> EspUtterancesIterator(string content, string sayas)
        {
            // Engine state is checked here, at enumeration time, because that is when it is used.
            if (_engine == null)
            {
                throw new ArgumentNullException("_engine");
            }

            if (string.IsNullOrEmpty(sayas))
            {
                _engine.SetSpeakText(content);
            }
            else
            {
                _engine.SetSpeakText(content, sayas);
            }

            // Reset every stage selected by the mode flags before processing starts.
            if ((_mode & ProcessMode.TextProcess) != 0)
            {
                _engine.TextProcessor.Reset();
            }

            if ((_mode & ProcessMode.ProsodyTag) != 0)
            {
                _engine.LinguisticProsodyTagger.Reset();
            }

            if ((_mode & ProcessMode.UnitGenerate) != 0)
            {
                _engine.UnitGenerator.Reset();
            }

            if ((_mode & ProcessMode.UnitLatticeGenerate) != 0)
            {
                _engine.UnitLatticeGenerator.Reset();
            }

            if ((_mode & ProcessMode.UnitSelect) != 0)
            {
                _engine.UnitSelector.Reset();
            }

            if ((_mode & ProcessMode.WaveGenerate) != 0)
            {
                _engine.WaveGenerator.Reset();
            }

            // Each pass pushes a fresh utterance through the enabled stages in order.
            // The first stage whose Process call returns false ends the sequence, and the
            // utterance of that pass is not yielded.
            while (true)
            {
                SP.TtsUtterance utterance = new SP.TtsUtterance();

                if ((_mode & ProcessMode.TextProcess) != 0 &&
                    !_engine.TextProcessor.Process(utterance))
                {
                    break;
                }

                if ((_mode & ProcessMode.ProsodyTag) != 0 &&
                    !_engine.LinguisticProsodyTagger.Process(utterance))
                {
                    break;
                }

                if ((_mode & ProcessMode.UnitGenerate) != 0 &&
                    !_engine.UnitGenerator.Process(utterance))
                {
                    break;
                }

                if ((_mode & ProcessMode.UnitLatticeGenerate) != 0 &&
                    !_engine.UnitLatticeGenerator.Process(utterance))
                {
                    break;
                }

                if ((_mode & ProcessMode.UnitSelect) != 0 &&
                    !_engine.UnitSelector.Process(utterance))
                {
                    break;
                }

                if ((_mode & ProcessMode.WaveGenerate) != 0 &&
                    !_engine.WaveGenerator.Process(utterance))
                {
                    break;
                }

                yield return utterance;
            }
        }
        /// <summary>
        /// Adds a new word at the tail of the utterance, fills it from the script word and
        /// marks it as a normal word.
        /// </summary>
        /// <param name="utterance">
        /// The utterance that receives the new word.
        /// </param>
        /// <param name="scriptWord">
        /// The script word whose properties are copied onto the new word.
        /// </param>
        /// <returns>
        /// The number of normal phones of the script word.
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// Thrown when POS is required but the script word has none, or when collecting the
        /// normal phone names reports any error.
        /// </exception>
        private int AppendNormalWord(TtsUtterance utterance, ScriptWord scriptWord)
        {
            TtsWord appended = utterance.AppendNewWord();

            // Copy the word attributes in the same order as they are defined on the script word.
            appended.LangId = (ushort)scriptWord.Language;
            appended.BreakLevel = (TtsBreakLevel)scriptWord.Break;
            appended.Emphasis = (TtsEmphasis)scriptWord.Emphasis;
            appended.WordText = scriptWord.Grapheme;
            appended.NETypeText = scriptWord.NETypeText;
            appended.WordRegularText = scriptWord.RegularText;
            appended.WordType = TtsWordType.WT_NORMAL;
            appended.AcousticDomain = DomainExtension.MapToEnum(scriptWord.AcousticDomainTag);
            appended.WordExpansion = scriptWord.Expansion;
            appended.ReadablePronunciation = scriptWord.Pronunciation;

            // Phone ids are only derived when the script word carries a pronunciation.
            if (!string.IsNullOrEmpty(scriptWord.Pronunciation))
            {
                var withoutBoundaries = Pronunciation.RemoveUnitBoundary(scriptWord.Pronunciation);
                appended.PhoneIds = Phoneme.PronunciationToPhoneIds(withoutBoundaries);
            }

            if (NeedPos)
            {
                string posString = scriptWord.PosString;

                // A missing POS is a data error when POS is required.
                if (string.IsNullOrEmpty(posString))
                {
                    string missingPos = Helper.NeutralFormat(
                        "No POS found in sentence \"{0}\" for word \"{1}\"",
                        scriptWord.Sentence.ScriptItem.Id,
                        scriptWord.Grapheme);
                    throw new InvalidDataException(missingPos);
                }

                // The word POS comes straight from the POS set; the tagger POS goes through
                // the category-tagging map first.
                appended.Pos = (ushort)PosSet.Items[posString];
                string taggerPos = PosSet.CategoryTaggingPOS[posString];
                appended.POSTaggerPos = (ushort)PosSet.Items[taggerPos];
            }

            // Count the normal phones; any collected error invalidates the word.
            ErrorSet phoneErrors = new ErrorSet();
            int phoneCount = scriptWord.GetNormalPhoneNames(PhoneSet, phoneErrors).Count;
            if (phoneErrors.Count > 0)
            {
                string invalidPhone = Helper.NeutralFormat(
                    "Invalid phone found in sentence \"{0}\" for word \"{1}\"",
                    scriptWord.Sentence.ScriptItem.Id,
                    scriptWord.Grapheme);
                throw new InvalidDataException(invalidPhone);
            }

            appended.TextOffset = (uint)scriptWord.OffsetInString;
            appended.TextLength = (uint)scriptWord.LengthInString;

            return phoneCount;
        }
        /// <summary>
        /// Adds a new word at the tail of the utterance and marks it as punctuation.
        /// </summary>
        /// <param name="utterance">
        /// The utterance that receives the new word.
        /// </param>
        /// <param name="scriptWord">
        /// The script word supplying language, break, emphasis, text and NE type.
        /// </param>
        /// <returns>
        /// Always zero, because a punctuation word contributes no phonemes.
        /// </returns>
        private int AppendPunctuationWord(TtsUtterance utterance, ScriptWord scriptWord)
        {
            // Punctuation carries no phonemes, so it never advances the phone index.
            const int PunctuationPhoneCount = 0;

            TtsWord punctuation = utterance.AppendNewWord();
            punctuation.LangId = (ushort)scriptWord.Language;
            punctuation.BreakLevel = (TtsBreakLevel)scriptWord.Break;
            punctuation.Emphasis = (TtsEmphasis)scriptWord.Emphasis;
            punctuation.WordText = scriptWord.Grapheme;
            punctuation.NETypeText = scriptWord.NETypeText;
            punctuation.WordType = TtsWordType.WT_PUNCTUATION;

            return PunctuationPhoneCount;
        }
        /// <summary>
        /// Adds a silence word at the end of the given utterance.
        /// </summary>
        /// <param name="utterance">
        /// The utterance that receives the silence word.
        /// </param>
        /// <param name="phone">
        /// The phone name; asserted (debug builds) to be a silence feature.
        /// </param>
        /// <returns>
        /// Always one: a silence word accounts for a single phoneme.
        /// </returns>
        private int AppendSilenceWord(TtsUtterance utterance, string phone)
        {
            Debug.Assert(Offline.Phoneme.IsSilenceFeature(phone), "Silence word should have phone: short pause or silence");

            TtsWord silence = utterance.AppendNewWord();
            silence.PhoneIds = Phoneme.PronunciationToPhoneIds(Offline.Phoneme.ToRuntime(phone));
            silence.LangId = (ushort)PhoneSet.Language;
            silence.WordType = TtsWordType.WT_SILENCE;
            silence.Pos = 0;

            // To stay consistent with the runtime engine, the silence inherits the break level
            // of the word before it; a leading silence gets the sentence break level.
            silence.BreakLevel = silence.Previous != null
                ? silence.Previous.BreakLevel
                : TtsBreakLevel.BK_IDX_SENTENCE;

            return 1;
        }
        /// <summary>
        /// Builds an utterance from a script item, optionally inserting silence words
        /// according to the segments of a segment file.
        /// </summary>
        /// <param name="item">
        /// The script item to convert. Checked with <c>Helper.ThrowIfNull</c>.
        /// </param>
        /// <param name="segmentFile">
        /// Segment file used to locate silences and the end of the utterance; may be null,
        /// in which case no silence words are inserted.
        /// </param>
        /// <param name="buildAllWords">
        /// When true, every script word is appended as a normal word. When false, only words
        /// flagged as pronounceable normal words are appended as normal words, and punctuation
        /// words are appended only when <c>NeedPunctuation</c> is set.
        /// </param>
        /// <param name="subSentenceIndex">
        /// Zero-based index of the single sentence to build, or -1 to build all sentences.
        /// </param>
        /// <returns>
        /// The built utterance, with phone list, ToBI information, phrase list and context
        /// characters populated.
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// Thrown when an <c>EspException</c> is raised while building (wrapped as the inner
        /// exception), or propagated from appending a normal word.
        /// </exception>
        public TtsUtterance Build(ScriptItem item, SegmentFile segmentFile, bool buildAllWords, int subSentenceIndex)
        {
            Helper.ThrowIfNull(item);

            TtsUtterance utterance = new TtsUtterance();
            int phoneIndex = 0;
            try
            {
                // Silence indicates a silence word.
                if (segmentFile != null &&
                    segmentFile.WaveSegments[phoneIndex].IsSilenceFeature)
                {
                    phoneIndex += AppendSilenceWord(utterance, segmentFile.WaveSegments[phoneIndex].Label);
                }

                // Creates a words map for ToBI accent.
                Dictionary<ScriptWord, TtsWord> mapWords = new Dictionary<ScriptWord, TtsWord>();

                int sentenceIndex = 0;
                foreach (ScriptSentence scriptSentence in item.Sentences)
                {
                    // Only add certain sentence in the scriptItem.
                    // The counter only advances when a specific sentence was requested.
                    if (subSentenceIndex != -1 && sentenceIndex++ != subSentenceIndex)
                    {
                        continue;
                    }

                    // Treats unknown sentence type as declarative.
                    // When several sentences are built, the last one processed determines the
                    // sentence type and emotion stored on the utterance.
                    if (scriptSentence.SentenceType != SentenceType.Unknown)
                    {
                        utterance.SentenceType = (TtsSentenceType)scriptSentence.SentenceType;
                    }
                    else
                    {
                        utterance.SentenceType = (TtsSentenceType)SentenceType.Declarative;
                    }

                    utterance.SentenceEmotionType = (EmotionmlCategory)scriptSentence.Emotion;

                    // Converts each word in script sentence.
                    foreach (ScriptWord scriptWord in scriptSentence.Words)
                    {
                        if (buildAllWords || scriptWord.IsPronouncableNormalWord)
                        {
                            phoneIndex += AppendNormalWord(utterance, scriptWord);

                            // Adds into words map.
                            mapWords.Add(scriptWord, utterance.Words[utterance.Words.Count - 1]);

                            // Breaks if meets the end of the utterance.
                            // NOTE(review): this only leaves the word loop; the sentence loop
                            // keeps running. Confirm that is intended once the segments are
                            // exhausted.
                            if (segmentFile != null &&
                                phoneIndex >= segmentFile.WaveSegments.Count)
                            {
                                break;
                            }

                            if (segmentFile != null &&
                                segmentFile.WaveSegments[phoneIndex].IsSilenceFeature)
                            {
                                phoneIndex += AppendSilenceWord(utterance, segmentFile.WaveSegments[phoneIndex].Label);
                            }
                        }
                        else if (NeedPunctuation && scriptWord.WordType == WordType.Punctuation)
                        {
                            // buildAllWords is necessarily false here, so only the punctuation
                            // test decides this branch.
                            phoneIndex += AppendPunctuationWord(utterance, scriptWord);
                        }
                    }
                }

                // Builds phone list.
                // A newly allocated int array is already zero-filled, so every pause
                // duration starts at zero without an explicit clear.
                int[] pauseDurations = new int[(int)TtsPauseLevel.PAU_IDX_SENTENCE + 1];
                utterance.BuildPhoneList(Phoneme, pauseDurations, 0, 0);

                // Builds ToBI accent, which must happen after the phone list is built.
                BuildToBIInformation(mapWords);

                // Builds phrase list.
                utterance.BuildPhraseList();

                // Builds character list.
                utterance.BuildContextCharacters();

                return utterance;
            }
            catch (EspException e)
            {
                throw new InvalidDataException(
                    Helper.NeutralFormat("Build utterance error on sentence \"{0}\"", item.Id), e);
            }
        }
        /// <summary>
        /// Runs feature extraction on the utterance and packs the result into a sentence
        /// holding one phone segment (features plus label) per phone.
        /// </summary>
        /// <param name="sentId">
        /// Sentence id, used as the id of the returned sentence and in error messages.
        /// </param>
        /// <param name="utterance">
        /// Service Provider utterance object to extract features from.
        /// </param>
        /// <returns>
        /// The sentence containing all the extracted features.
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// Thrown when extraction raises an <c>EspException</c>, when the number of vectors
        /// differs from the number of feature metas, or when a vector's length differs from
        /// the utterance's phone count.
        /// </exception>
        public Sentence Extract(string sentId, TtsUtterance utterance)
        {
            List<FeatureVector> featureVectors;

            try
            {
                featureVectors = ExtractionEngine.Extract(utterance, FeatureMetas);
            }
            catch (EspException e)
            {
                string extractError = Helper.NeutralFormat("Extract feature error on sentence \"{0}\"",
                    sentId);
                throw new InvalidDataException(extractError, e);
            }

            // One vector is expected per feature meta.
            if (featureVectors.Count != FeatureMetas.Count)
            {
                string resultMismatch =
                    Helper.NeutralFormat("Length of result is mismatch on sentence \"{0}\"", sentId);
                throw new InvalidDataException(resultMismatch);
            }

            // Every vector must hold exactly one value per phone of the utterance.
            foreach (FeatureVector featureVector in featureVectors)
            {
                if (featureVector.Count != utterance.Phones.Count)
                {
                    string vectorMismatch =
                        Helper.NeutralFormat("Length of vector is mismatch on sentence \"{0}\"", sentId);
                    throw new InvalidDataException(vectorMismatch);
                }
            }

            Sentence sentence = new Sentence();
            sentence.Id = sentId;

            int phoneCount = featureVectors[0].Count;
            for (int phoneIndex = 0; phoneIndex < phoneCount; ++phoneIndex)
            {
                // The segment keeps this phone's value from every vector, minus the leading
                // mandatory features.
                int mandatoryCount = LabelFeatureNameSet.MandatoryFeatureNames.Length;
                PhoneSegment segment = new PhoneSegment();
                segment.Sentence = sentence;
                segment.Index = phoneIndex;
                segment.Features = featureVectors
                    .Select(vector => vector[phoneIndex])
                    .Skip(mandatoryCount)
                    .ToArray();

                // The label stores every feature of this phone in textual form.
                Label label = new Label(FeatureNameSet);
                for (int featureIndex = 0; featureIndex < featureVectors.Count; ++featureIndex)
                {
                    var featureValue = featureVectors[featureIndex][phoneIndex];
                    var featureName = FeatureNameSet.FeatureNames[featureIndex];

                    if (featureValue.ValueType == FeatureValueType.FEATURE_VALUE_TYPE_UNKOWN)
                    {
                        // Unknown values are written as "not applicable".
                        label.SetFeatureValue(featureName, Label.NotApplicableFeatureValue);
                    }
                    else if (FeatureMetas[featureIndex].Property == TtsFeatureProperty.TTS_FEATURE_PROPERTY_PHONE_ID)
                    {
                        // Phone ids are written as the HTK form of the phone name.
                        Phone phone = PhoneSet.GetPhone(featureValue.IntValue);
                        label.SetFeatureValue(featureName, Offline.Phoneme.ToHtk(phone.Name));
                    }
                    else
                    {
                        // Everything else is written as a culture-invariant integer.
                        label.SetFeatureValue(featureName,
                            featureValue.IntValue.ToString(CultureInfo.InvariantCulture));
                    }

                    // Keep the per-feature value records up to date.
                    FeatureValueRecords[featureIndex].Update(featureValue);
                }

                segment.Label = label;
                sentence.PhoneSegments.Add(segment);
            }

            return sentence;
        }