/// <summary>
/// Dump the data of every syllable in a word into the script word.
/// </summary>
/// <param name="scriptWord">The script word to store the data dumped from the syllables.</param>
/// <param name="utt">The utterance.</param>
/// <param name="word">The word which contains these syllables.</param>
/// <param name="phoneIndex">Phone index to mark the phone in the Utt.Phones.</param>
/// <param name="unitIndex">Unit index to mark the unit in the Utt.Units.</param>
/// <param name="f0StartIndex">F0 index to mark the start position in the F0s.</param>
/// <param name="ttsEngine">The object ttsEngine to help to convert the Pos and get sentence id.</param>
private static void DumpSyllables(ScriptWord scriptWord, SP.TtsUtterance utt, SP.TtsWord word,
    ref int phoneIndex, ref int unitIndex, ref int f0StartIndex, SP.TtsEngine ttsEngine)
{
    Debug.Assert(scriptWord != null, "ScriptWord should not be null");
    Debug.Assert(utt != null, "Utt should not be null");
    Debug.Assert(word != null, "Word should not be null");
    Debug.Assert(phoneIndex >= 0, "PhoneIndex should not be less than 0");
    Debug.Assert(f0StartIndex >= 0, "f0StartIndex should not be less than 0");
    Debug.Assert(ttsEngine != null, "ttsEngine should not be null");

    // The accent set is used here only to look up a label through IdItems, so a single
    // instance is shared by all syllables and is created only if an accented syllable
    // is actually encountered.
    TtsTobiAccentSet tobiAccentSet = null;

    // Go through each syllable in the word.
    SP.TtsSyllable syllable = word.FirstSyllable;
    while (syllable != null)
    {
        ScriptSyllable scriptSyllable = new ScriptSyllable();

        if (syllable.ToBIAccent != SP.TtsTobiAccent.K_NOACC)
        {
            if (tobiAccentSet == null)
            {
                tobiAccentSet = new TtsTobiAccentSet();
            }

            scriptSyllable.TobiPitchAccent =
                TobiLabel.Create(tobiAccentSet.IdItems[(uint)syllable.ToBIAccent]);
        }

        scriptSyllable.Stress = (TtsStress)syllable.Stress;

        // The three indices are advanced by DumpPhones as the phones are consumed.
        DumpPhones(scriptSyllable, utt, syllable, ref phoneIndex, ref unitIndex, ref f0StartIndex, ttsEngine);
        scriptWord.Syllables.Add(scriptSyllable);

        // Stop at the word's last syllable instead of following Next any further.
        if (syllable == word.LastSyllable)
        {
            break;
        }

        syllable = syllable.Next;
    }
}
/// <summary>
/// Dump the data of every phone in a syllable into the script syllable.
/// </summary>
/// <param name="scriptSyllable">The script syllable to store the data dumped from the phones.</param>
/// <param name="utt">The utterance.</param>
/// <param name="syllable">The syllable which contains these phones.</param>
/// <param name="phoneIndex">Phone index to mark the phone in the Utt.Phones.</param>
/// <param name="unitIndex">Unit index to mark the unit in the Utt.Units.</param>
/// <param name="f0StartIndex">F0 index to mark the start position in the F0s.</param>
/// <param name="ttsEngine">The object ttsEngine to help to convert the Pos and get sentence id.</param>
private static void DumpPhones(ScriptSyllable scriptSyllable, SP.TtsUtterance utt, SP.TtsSyllable syllable,
    ref int phoneIndex, ref int unitIndex, ref int f0StartIndex, SP.TtsEngine ttsEngine)
{
    Debug.Assert(scriptSyllable != null, "ScriptSyllable should not be null");
    Debug.Assert(utt != null, "Utt should not be null");
    Debug.Assert(syllable != null, "Syllable should not be null");
    Debug.Assert(phoneIndex >= 0, "PhoneIndex should not be less than 0");
    Debug.Assert(f0StartIndex >= 0, "f0StartIndex should not be less than 0");
    Debug.Assert(ttsEngine != null, "ttsEngine should not be null");

    // The manager is resolved once, from the best lattice node of the unit at the incoming
    // unitIndex, and is then reused for every phone of this syllable.
    WuiManager wuiManager = null;
    if (utt.Segments.Count > 0)
    {
        int bestNode = (int)utt.UnitLattice.WucList[unitIndex].BestNodeIndex;
        wuiManager = ttsEngine.RUSVoiceDataManager.GetWuiManagerByUnitCostNode(
            utt.UnitLattice.WucList[unitIndex].WucNodeList[bestNode]);
    }

    // Walk the phone chain, stopping after the syllable's last phone.
    for (SP.TtsPhone phone = syllable.FirstPhone; phone != null; phone = phone.Next)
    {
        // Phone text without the stress mark and, for tonal phones, without the tone.
        string phoneText = Pronunciation.RemoveStress(phone.Pronunciation.ToLowerInvariant()).Trim();
        if (phone.Tone != 0)
        {
            phoneText = Pronunciation.RemoveTone(phoneText).Trim();
        }

        ScriptPhone scriptPhone = new ScriptPhone(phoneText);
        scriptPhone.Tone = phone.Tone.ToString();
        scriptPhone.Stress = (TtsStress)phone.Stress;

        // Sentence id and unit index are only recorded for non-silence phones.
        if (phone.Pronunciation != PronOfSilence)
        {
            if (wuiManager != null)
            {
                scriptPhone.SentenceId = wuiManager.GetSentenceId(utt.Segments[unitIndex].WaveUnitInfo);
            }

            if (phone.Unit != null)
            {
                scriptPhone.UnitIndex = (int)phone.Unit.UnitIndex;
            }
        }

        scriptPhone.Acoustics = new ScriptAcoustics();

        // Dump the segments: a phone that is neither silence nor short pause takes the
        // segment at unitIndex and the one right after it.
        // NOTE(review): unitIndex + 1 is not range-checked here - confirm the segment list
        // always holds a second segment for such phones.
        if (utt.Segments.Count > 0 &&
            !utt.Segments[unitIndex].Unit.UnitText.Equals(PronOfSilence) &&
            !utt.Segments[unitIndex].Unit.UnitText.Equals(PronOfShortPause))
        {
            int firstLength = (int)utt.Segments[unitIndex].WaveUnitInfo.WaveLength;
            int secondLength = (int)utt.Segments[unitIndex + 1].WaveUnitInfo.WaveLength;
            scriptPhone.Acoustics.Duration = firstLength + secondLength;

            int firstStart = (int)utt.Segments[unitIndex].WaveUnitInfo.RecordingWaveStartPosition;
            int firstEnd = firstStart + (int)utt.Segments[unitIndex].WaveUnitInfo.WaveLength;
            scriptPhone.Acoustics.SegmentIntervals.Add(new SegmentInterval(firstStart, firstEnd));

            int secondStart = (int)utt.Segments[unitIndex + 1].WaveUnitInfo.RecordingWaveStartPosition;
            int secondEnd = secondStart + (int)utt.Segments[unitIndex + 1].WaveUnitInfo.WaveLength;
            scriptPhone.Acoustics.SegmentIntervals.Add(new SegmentInterval(secondStart, secondEnd));
        }

        // State shared across the GetF0Contour calls made for this phone:
        // relative begin/end position of the uvsegment interval,
        int uvBegin = 0;
        int uvEnd = 0;

        // whether the first voiced segment has been met while going through the F0 values,
        bool uvBeginFound = false;

        // and whether the state holds any F0 value; if not, no uvseg is written for it.
        bool hasF0Value = false;

        // Dump the durations and F0s in each state.
        if (utt.Acoustic.Durations != null)
        {
            for (int stateIndex = 0; stateIndex < utt.Acoustic.Durations[phoneIndex].Length; ++stateIndex)
            {
                ScriptState scriptState = new ScriptState();

                // Duration is stored in frames and dumped in milliseconds.
                int framesInState = (int)utt.Acoustic.Durations[phoneIndex][stateIndex];
                scriptState.Acoustics = new ScriptAcoustics(framesInState * MillisecondsPerFrame);

                if (utt.Acoustic.F0s != null)
                {
                    ScriptUvSeg stateUvSeg = GetF0Contour(utt, f0StartIndex, framesInState,
                        ScriptAcousticChunkEncoding.Text, ref uvBegin, ref uvEnd,
                        ref uvBeginFound, ref hasF0Value);
                    if (hasF0Value)
                    {
                        scriptState.Acoustics.UvSegs.Add(stateUvSeg);
                    }

                    // The next state starts where this one ends in the F0 stream.
                    f0StartIndex += framesInState;
                }

                scriptPhone.States.Add(scriptState);
            }
        }

        bool isPausePhone = phone.Pronunciation.Equals(PronOfSilence) ||
            phone.Pronunciation.Equals(PronOfShortPause);

        // Dump the uvsegment relative interval.
        // NOTE(review): the factor 5 presumably converts frames to milliseconds
        // (compare MillisecondsPerFrame above) - confirm.
        if (utt.Acoustic.F0s != null && !isPausePhone)
        {
            ScriptUvSeg relativeUvSeg = new ScriptUvSeg(ScriptUvSegType.Mixed);
            relativeUvSeg.Interval = new ScriptUvSegInterval(uvBegin * 5, uvEnd * 5);
            scriptPhone.Acoustics.UvSegs.Add(relativeUvSeg);
        }

        phoneIndex++;
        unitIndex++;
        if (wuiManager != null && !isPausePhone)
        {
            // If it is not a silence phone, the corresponding unit must be a half-phone unit,
            // so the right half phone is skipped to reach the next phone's unit.
            unitIndex++;
        }

        scriptSyllable.Phones.Add(scriptPhone);

        if (phone == syllable.LastPhone)
        {
            break;
        }
    }
}
/// <summary>
/// Build the unit collection for a given syllable, one unit per non-empty slice.
/// </summary>
/// <param name="syllable">Syllable to process.</param>
/// <returns>Unit collection.</returns>
/// <exception cref="InvalidDataException">
/// The syllable contains more vowel phones than MaxVowelCountInSyllable.
/// </exception>
private Collection<TtsUnit> BuildUnits(ScriptSyllable syllable)
{
    string cleanText = Core.Pronunciation.CleanDecorate(syllable.Text.Trim());
    string[] slices = PronunciationSeparator.SplitSlices(cleanText);
    PosInSyllable[] positions = EstimatePosInSyllable(slices);

    Collection<TtsUnit> result = new Collection<TtsUnit>();
    int vowelTotal = 0;

    for (int i = 0; i < slices.Length; i++)
    {
        string sliceText = slices[i].Trim();
        if (!string.IsNullOrEmpty(sliceText))
        {
            TtsUnit unit = new TtsUnit(Language);

            // Break level: only the final slice inherits the syllable's break.
            if (i == slices.Length - 1)
            {
                unit.TtsBreak = syllable.TtsBreak;
            }
            else
            {
                unit.TtsBreak = TtsBreak.Phone;
            }

            // Position, emphasis and stress features.
            unit.Feature.PosInSyllable = positions[i];
            unit.Feature.TtsEmphasis = syllable.TtsEmphasis;
            unit.Feature.TtsStress = syllable.Stress;

            // Unit name: runs of white space in the slice are replaced with '+'.
            unit.MetaUnit.Name = Regex.Replace(sliceText, " +", @"+");
            unit.MetaUnit.Language = unit.Language;

            // Count the vowel phones contributed by this unit.
            Phoneme phoneme = Localor.GetPhoneme(unit.Language);
            foreach (TtsMetaPhone metaPhone in unit.MetaUnit.Phones)
            {
                bool isVowel = phoneme.TtsVowelPhones.IndexOf(metaPhone.Name) >= 0;
                if (isVowel)
                {
                    vowelTotal++;
                }
            }

            result.Add(unit);
        }
    }

    if (vowelTotal <= MaxVowelCountInSyllable)
    {
        return result;
    }

    string message = string.Format(CultureInfo.InvariantCulture,
        "There are more than {0} vowel phone in this syllable [{1}], which is supposed to contain no more than one vowel phone",
        MaxVowelCountInSyllable, syllable.Text);
    throw new InvalidDataException(message);
}
/// <summary>
/// Insert a silence word, made of one syllable holding one phone, into a script sentence.
/// </summary>
/// <param name="scriptSentence">Script sentence.</param>
/// <param name="wordIndex">Position at which the word is inserted.</param>
/// <param name="phoneme">The phoneme string.</param>
/// <exception cref="ArgumentNullException"><paramref name="scriptSentence"/> is null.</exception>
public static void InsertSilenceWord(ScriptSentence scriptSentence, int wordIndex, string phoneme)
{
    // Public entry point: fail with a named argument instead of a NullReferenceException
    // raised only after the whole word has been built.
    if (scriptSentence == null)
    {
        throw new ArgumentNullException("scriptSentence");
    }

    Debug.Assert(Phoneme.IsSilenceFeature(phoneme), "The phoneme should have silence feature");

    ScriptWord silenceWord = new ScriptWord();
    silenceWord.WordType = WordType.Silence;
    silenceWord.Pronunciation = Phoneme.ToRuntime(phoneme);
    silenceWord.Sentence = scriptSentence;

    ScriptSyllable silenceSyllable = new ScriptSyllable();
    silenceSyllable.Word = silenceWord;
    silenceWord.Syllables.Add(silenceSyllable);

    // The phone is created from the phoneme string as given, while the word's
    // pronunciation above uses the result of Phoneme.ToRuntime.
    ScriptPhone silencePhone = new ScriptPhone(phoneme);
    silencePhone.Syllable = silenceSyllable;
    silenceWord.Syllables[0].Phones.Add(silencePhone);

    scriptSentence.Words.Insert(wordIndex, silenceWord);
}
/// <summary>
/// Calculate PosInWord feature for a given syllable.
/// Change it to public for code re-use in script sentence.
/// </summary>
/// <param name="preSyllable">Previous syllable of target syllable to calculate; may be null.</param>
/// <param name="syllable">Target syllable to calculate.</param>
/// <returns>PosInWord feature.</returns>
/// <exception cref="ArgumentNullException"><paramref name="syllable"/> is null.</exception>
public static PosInWord CalculatePosInWord(ScriptSyllable preSyllable, ScriptSyllable syllable)
{
    // Public entry point: report the offending argument instead of a NullReferenceException.
    if (syllable == null)
    {
        throw new ArgumentNullException("syllable");
    }

    // Row is 1 when there is no previous syllable or its break is above syllable level.
    int row = (preSyllable == null || (int)preSyllable.TtsBreak > (int)TtsBreak.Syllable) ? 1 : 0;

    // Column is 1 when the target syllable's own break is above syllable level.
    int column = ((int)syllable.TtsBreak > (int)TtsBreak.Syllable) ? 1 : 0;

    return _posInWordTrans[row][column];
}
/// <summary>
/// Rebuild the syllable collection of a word from its pronunciation.
/// </summary>
/// <param name="word">Word to process.</param>
/// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
/// <exception cref="ArgumentException">The word has no pronunciation.</exception>
private static void BuildSyllables(ScriptWord word)
{
    if (word == null)
    {
        throw new ArgumentNullException("word");
    }

    if (word.Pronunciation == null)
    {
        throw new ArgumentException("word.Pronunciation is null");
    }

    // Existing syllables are dropped before the pronunciation is split again.
    word.Syllables.Clear();

    string[] texts = Core.Pronunciation.SplitIntoSyllables(word.Pronunciation);
    for (int i = 0; i < texts.Length; i++)
    {
        ScriptSyllable item = new ScriptSyllable();
        item.Text = texts[i];

        // Only the final syllable carries the word's break level.
        if (i == texts.Length - 1)
        {
            item.TtsBreak = word.Break;
        }
        else
        {
            item.TtsBreak = TtsBreak.Syllable;
        }

        item.Stress = Core.Pronunciation.GetStress(item.Text);

        // The word's emphasis is applied to stressed syllables only.
        if (item.Stress != TtsStress.None)
        {
            item.TtsEmphasis = word.Emphasis;
        }
        else
        {
            item.TtsEmphasis = TtsEmphasis.None;
        }

        word.Syllables.Add(item);
    }
}
/// <summary>
/// Build units from a syllable, one unit per non-empty slice of its pronunciation.
/// </summary>
/// <param name="syllable">Syllable.</param>
/// <param name="sliceData">Slice data; also supplies the language of the created units.</param>
/// <param name="pronunciationSeparator">Pronunciation separator used to split the syllable into slices.</param>
/// <returns>Units.</returns>
private static Collection<TtsUnit> BuildUnitsForSyllable(ScriptSyllable syllable,
    SliceData sliceData, PronunciationSeparator pronunciationSeparator)
{
    Debug.Assert(syllable != null);
    Debug.Assert(sliceData != null);

    // The separator is dereferenced below just like the other two arguments.
    Debug.Assert(pronunciationSeparator != null);

    // The stress mark is removed before the text is split into slices.
    string syllableText = Core.Pronunciation.RemoveStress(syllable.Text.Trim());
    string[] slices = pronunciationSeparator.SplitSlices(syllableText);

    PosInSyllable[] pis = EstimatePosInSyllable(slices, sliceData);

    Collection<TtsUnit> units = new Collection<TtsUnit>();

    for (int sliceIndex = 0; sliceIndex < slices.Length; sliceIndex++)
    {
        string slice = slices[sliceIndex].Trim();
        if (string.IsNullOrEmpty(slice))
        {
            continue;
        }

        TtsUnit unit = new TtsUnit(sliceData.Language);

        // Break level: only the final slice inherits the syllable's break.
        unit.TtsBreak = (sliceIndex == slices.Length - 1) ? syllable.TtsBreak : TtsBreak.Phone;

        // Position in syllable.
        unit.Feature.PosInSyllable = pis[sliceIndex];

        // NONE: punctuation type

        // Emphasis.
        unit.Feature.TtsEmphasis = syllable.TtsEmphasis;

        // Stress mark.
        unit.Feature.TtsStress = syllable.Stress;

        // Unit name: runs of white space in the slice are replaced with '+'.
        unit.MetaUnit.Name = Regex.Replace(slice, " +", @"+");
        unit.MetaUnit.Language = unit.Language;

        units.Add(unit);
    }

    return units;
}
/// <summary>
/// Rebuild UnitSyllables from the word's pronunciation.
/// </summary>
/// <exception cref="InvalidDataException">The word has no pronunciation.</exception>
private void UpdateUnitSyllables()
{
    if (Pronunciation == null)
    {
        string message = Helper.NeutralFormat("word {0}'s has no pronunciation", Grapheme);
        throw new InvalidDataException(message);
    }

    // The pronunciation is split first; the old syllables are dropped only afterwards.
    string[] texts = Core.Pronunciation.SplitIntoSyllables(Pronunciation);
    UnitSyllables.Clear();

    for (int i = 0; i < texts.Length; i++)
    {
        ScriptSyllable item = new ScriptSyllable();
        item.Text = texts[i];

        // Only the final syllable carries the word's break level.
        if (i == texts.Length - 1)
        {
            item.TtsBreak = Break;
        }
        else
        {
            item.TtsBreak = TtsBreak.Syllable;
        }

        item.Stress = Core.Pronunciation.GetStress(item.Text);

        // The word's emphasis is applied to stressed syllables only.
        if (item.Stress != TtsStress.None)
        {
            item.TtsEmphasis = Emphasis;
        }
        else
        {
            item.TtsEmphasis = TtsEmphasis.None;
        }

        UnitSyllables.Add(item);
    }
}
/// <summary>
/// Load a syllable from an XmlTextReader positioned on a "syl" element.
/// </summary>
/// <param name="reader">XmlTextReader.</param>
/// <param name="language">The language of the script.</param>
/// <returns>ScriptSyllable.</returns>
private static ScriptSyllable LoadSyllable(XmlTextReader reader, Language language)
{
    Debug.Assert(reader != null);

    ScriptSyllable syllable = new ScriptSyllable(language);

    // Load attributes. The stress is only overwritten when the attribute is present.
    string stress = reader.GetAttribute("stress");
    if (!string.IsNullOrEmpty(stress))
    {
        syllable.Stress = ScriptSyllable.StringToStress(stress);
    }

    string tobipa = reader.GetAttribute("tobipa");
    syllable.TobiPitchAccent = TobiLabel.Create(tobipa);

    // Load child elements; an empty syllable element has none.
    if (!reader.IsEmptyElement)
    {
        while (reader.Read())
        {
            if (reader.NodeType == XmlNodeType.Element && reader.Name == "phs")
            {
                // An empty <phs/> element is never followed by a matching end element.
                // Scanning for one would read past this syllable and attach the phones
                // of following syllables to it, so the scan is skipped in that case.
                if (!reader.IsEmptyElement)
                {
                    while (reader.Read())
                    {
                        if (reader.NodeType == XmlNodeType.Element && reader.Name == "ph")
                        {
                            ScriptPhone phone = LoadPhone(reader);
                            phone.Syllable = syllable;
                            syllable.Phones.Add(phone);
                        }
                        else if (reader.NodeType == XmlNodeType.EndElement && reader.Name == "phs")
                        {
                            break;
                        }
                    }
                }
            }
            else if (reader.NodeType == XmlNodeType.Element && reader.Name == "acoustics")
            {
                syllable.Acoustics = new ScriptAcoustics();
                syllable.Acoustics.ParseFromXml(reader);
            }
            else if (reader.NodeType == XmlNodeType.EndElement && reader.Name == "syl")
            {
                break;
            }
        }
    }

    return syllable;
}
/// <summary>
/// Parse a syllable string into a script syllable.
/// Here we suppose syllable is a valid pronunciation string.
/// </summary>
/// <param name="syllable">Syllable string, doesn't include unit boundary.</param>
/// <param name="phoneSet">TtsPhoneSet.</param>
/// <returns>The constructed script syllable.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="syllable"/> is null or empty, or <paramref name="phoneSet"/> is null.
/// </exception>
/// <exception cref="InvalidDataException">
/// The syllable string cannot be split into phones, or it contains a tone
/// that has no preceding phone to attach to.
/// </exception>
public static ScriptSyllable ParseStringToSyllable(string syllable, TtsPhoneSet phoneSet)
{
    if (string.IsNullOrEmpty(syllable))
    {
        throw new ArgumentNullException("syllable");
    }

    if (phoneSet == null)
    {
        throw new ArgumentNullException("phoneSet");
    }

    ScriptSyllable scriptSyllable = new ScriptSyllable(phoneSet.Language);
    ErrorSet errors = new ErrorSet();
    Phone[] phones = Pronunciation.SplitIntoPhones(syllable, phoneSet, errors);
    if (errors.Count > 0)
    {
        string message = Helper.NeutralFormat(
            "The syllable string [{0}] isn't valid : {1}{2}",
            syllable, Environment.NewLine, errors.ErrorsString());
        throw new InvalidDataException(message);
    }

    Collection<ScriptPhone> scriptPhones = new Collection<ScriptPhone>();
    foreach (Phone phone in phones)
    {
        if (phone.HasFeature(PhoneFeature.MainStress) ||
            phone.HasFeature(PhoneFeature.SubStress))
        {
            // A stress phone sets the syllable stress; other names leave it untouched.
            switch (phone.Name)
            {
                case "1":
                    scriptSyllable.Stress = TtsStress.Primary;
                    break;
                case "2":
                    scriptSyllable.Stress = TtsStress.Secondary;
                    break;
                case "3":
                    scriptSyllable.Stress = TtsStress.Tertiary;
                    break;
            }
        }
        else if (phone.HasFeature(PhoneFeature.Tone))
        {
            // A tone is attached to the phone right before it. Without such a phone the
            // index below would be -1, so the input is reported as invalid data instead.
            if (scriptPhones.Count == 0)
            {
                string message = Helper.NeutralFormat(
                    "The syllable string [{0}] isn't valid : tone [{1}] has no preceding phone.",
                    syllable, phone.Name);
                throw new InvalidDataException(message);
            }

            scriptPhones[scriptPhones.Count - 1].Tone = phone.Name;
        }
        else
        {
            ScriptPhone scriptPhone = new ScriptPhone(phone.Name);
            scriptPhone.Syllable = scriptSyllable;
            scriptPhones.Add(scriptPhone);
        }
    }

    scriptSyllable.Phones.Clear();
    Helper.AppendCollection(scriptSyllable.Phones, scriptPhones);
    return scriptSyllable;
}
/// <summary>
/// Convert the phones of a TTS syllable to Speech Recognition phones.
/// </summary>
/// <param name="syllable">Syllable to be processed.</param>
/// <returns>SR phone array.</returns>
/// <exception cref="InvalidDataException">The syllable cannot be mapped to SR phones.</exception>
private string[] ConvertToSrPhone(ScriptSyllable syllable)
{
    string cleanText = Pronunciation.CleanDecorate(syllable.Text.Trim());

    // Map phone to Speech Recognition phone(s); a null result means no mapping exists.
    string[] mapped = Phoneme.Tts2SrPhones(cleanText.Trim());
    if (mapped != null)
    {
        return mapped;
    }

    throw new InvalidDataException(string.Format(CultureInfo.InvariantCulture,
        "Invalid TTS syllable[{0}], which can not be converted to Speech Recognition Phone.",
        cleanText));
}
/// <summary>
/// Write the SR phones of one syllable to a mono MLF file, one phone per line.
/// </summary>
/// <param name="writer">Text writer to save MLF file; nothing is written when it is null.</param>
/// <param name="syllable">Syllable.</param>
private void BuildMonoMlf(TextWriter writer, ScriptSyllable syllable)
{
    // The conversion always runs, even without a writer, so an invalid syllable still throws.
    string[] phones = ConvertToSrPhone(syllable);

    if (writer == null)
    {
        return;
    }

    for (int i = 0; i < phones.Length; i++)
    {
        writer.WriteLine(phones[i]);
    }
}
/// <summary>
/// Build the mono MLF lines of a syllable, collecting an error when it cannot be converted.
/// </summary>
/// <param name="syllable">Syllable.</param>
/// <param name="item">Script item; its Id is attached to any reported error.</param>
/// <param name="sw">Text writer.</param>
/// <param name="writeToFile">Whether to write the phones to the writer.</param>
/// <param name="phoneme">Phoneme used to map the TTS syllable to SR phones.</param>
/// <returns>Errors.</returns>
private static ErrorSet BuildMonoMlf(ScriptSyllable syllable, ScriptItem item,
    StreamWriter sw, bool writeToFile, Phoneme phoneme)
{
    Debug.Assert(syllable != null);
    Debug.Assert(item != null);

    ErrorSet result = new ErrorSet();

    string strippedText = Pronunciation.RemoveStress(syllable.Text.Trim());
    string[] mapped = phoneme.Tts2SrPhones(strippedText.Trim());

    if (mapped == null)
    {
        // No SR mapping: report the error against the script item instead of throwing.
        result.Add(ScriptError.OtherErrors, item.Id,
            string.Format(CultureInfo.InvariantCulture,
                "Invalid TTS syllable[{0}], which can not be converted to Speech Recognition Phone.",
                strippedText));
    }
    else if (writeToFile)
    {
        for (int i = 0; i < mapped.Length; i++)
        {
            sw.WriteLine(mapped[i]);
        }
    }

    return result;
}