/// <summary>
/// Read all tts unit from Unit Linguistic FeatureVector file.
/// Reading stops at the end of the file or at the first empty line, whichever comes first.
/// </summary>
/// <remarks>
/// Each line is split on spaces. The first two fields form the dictionary key,
/// the last field is used as the unit name, and the feature parser is started at field index 2.
/// </remarks>
/// <param name="filePath">Unit Linguistic FeatureVector file.</param>
/// <param name="language">Language of the unit file.</param>
/// <returns>Unit dictionary, indexing by (sentence id + index in sentence).</returns>
/// <exception cref="InvalidDataException">
/// A non-empty line contains fewer than three space-separated fields.
/// </exception>
/// <exception cref="ArgumentException">
/// Two lines produce the same key (raised by the dictionary on insertion).
/// </exception>
public static Dictionary<string, TtsUnit> ReadAllData(string filePath, Language language)
{
    // A usable record needs the two key fields plus the trailing unit name.
    const int MinFieldCount = 3;

    Dictionary<string, TtsUnit> units = new Dictionary<string, TtsUnit>();

    using (StreamReader sr = new StreamReader(filePath))
    {
        int lineNumber = 0;
        string line = null;
        while (string.IsNullOrEmpty(line = sr.ReadLine()) != true)
        {
            lineNumber++;

            string[] items = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Without this guard a short line either fails with a context-free
            // IndexOutOfRangeException or (with exactly two fields) silently uses the
            // in-sentence index as the unit name.
            if (items.Length < MinFieldCount)
            {
                string message = string.Format(
                    System.Globalization.CultureInfo.InvariantCulture,
                    "Malformed line {0} in unit feature file [{1}]: expected at least {2} space-separated fields but found {3}.",
                    lineNumber,
                    filePath,
                    MinFieldCount,
                    items.Length);
                throw new InvalidDataException(message);
            }

            TtsUnit unit = new TtsUnit(language);

            unit.Feature = new TtsUnitFeature();
            unit.Feature.Parse(items, 2);

            unit.MetaUnit = new TtsMetaUnit(language);
            unit.MetaUnit.Name = items[items.Length - 1];

            string key = items[0] + " " + items[1];
            units.Add(key, unit);
        }
    }

    return units;
}
/// <summary>
/// Derive the PosInSyllable feature of a unit from its current PosInSyllable value,
/// its own break level and the break level of the unit before it.
/// Public so that script sentence code can reuse it.
/// </summary>
/// <param name="preUnit">Unit preceding the target unit; may be null.</param>
/// <param name="unit">Target unit to calculate.</param>
/// <returns>PosInSyllable feature.</returns>
public static PosInSyllable CalculatePosInSyllable(TtsUnit preUnit, TtsUnit unit)
{
    PosInSyllable result;

    switch (unit.Feature.PosInSyllable)
    {
        case PosInSyllable.Onset:
            // Previous unit exists and its break level is at or below TtsBreak.Phone.
            result = (preUnit != null && (int)preUnit.TtsBreak <= (int)TtsBreak.Phone)
                ? PosInSyllable.OnsetNext
                : PosInSyllable.Onset;
            break;

        case PosInSyllable.OnsetNext:
            result = PosInSyllable.OnsetNext;
            break;

        case PosInSyllable.Coda:
            // This unit's own break level is at or below TtsBreak.Phone.
            result = ((int)unit.TtsBreak <= (int)TtsBreak.Phone)
                ? PosInSyllable.CodaNext
                : PosInSyllable.Coda;
            break;

        case PosInSyllable.CodaNext:
            result = PosInSyllable.CodaNext;
            break;

        case PosInSyllable.NucleusInV:
        case PosInSyllable.NucleusInVC:
        case PosInSyllable.NucleusInCV:
        case PosInSyllable.NucleusInCVC:
        {
            // The nucleus variant is chosen purely from the two neighbouring break levels.
            bool phoneBreakBefore = preUnit != null && (int)preUnit.TtsBreak <= (int)TtsBreak.Phone;
            bool phoneBreakAfter = (int)unit.TtsBreak <= (int)TtsBreak.Phone;

            if (phoneBreakBefore)
            {
                result = phoneBreakAfter ? PosInSyllable.NucleusInCVC : PosInSyllable.NucleusInCV;
            }
            else
            {
                result = phoneBreakAfter ? PosInSyllable.NucleusInVC : PosInSyllable.NucleusInV;
            }

            break;
        }

        default:
            // Any other value falls back to Coda.
            result = PosInSyllable.Coda;
            break;
    }

    // Special meta units always report Onset, whatever was computed above.
    if (unit.MetaUnit.Special)
    {
        result = PosInSyllable.Onset;
    }

    return result;
}
/// <summary>
/// Split a syllable into slices and create one unit per non-empty slice.
/// </summary>
/// <param name="syllable">Syllable to process.</param>
/// <returns>Unit collection, in slice order.</returns>
/// <exception cref="InvalidDataException">
/// The vowel phones counted across the created units exceed MaxVowelCountInSyllable.
/// </exception>
private Collection<TtsUnit> BuildUnits(ScriptSyllable syllable)
{
    string cleanedText = Core.Pronunciation.CleanDecorate(syllable.Text.Trim());
    string[] slices = PronunciationSeparator.SplitSlices(cleanedText);
    PosInSyllable[] positions = EstimatePosInSyllable(slices);

    Collection<TtsUnit> result = new Collection<TtsUnit>();
    int vowelTotal = 0;

    for (int i = 0; i < slices.Length; i++)
    {
        string sliceText = slices[i].Trim();
        if (string.IsNullOrEmpty(sliceText))
        {
            continue;
        }

        TtsUnit unit = new TtsUnit(Language);

        // Only the final slice carries the syllable's break level; the others get Phone.
        if (i == slices.Length - 1)
        {
            unit.TtsBreak = syllable.TtsBreak;
        }
        else
        {
            unit.TtsBreak = TtsBreak.Phone;
        }

        unit.Feature.PosInSyllable = positions[i];

        // Emphasis and stress are copied from the syllable to every unit.
        unit.Feature.TtsEmphasis = syllable.TtsEmphasis;
        unit.Feature.TtsStress = syllable.Stress;

        // Unit name: each run of spaces in the slice becomes a single '+'.
        unit.MetaUnit.Name = Regex.Replace(sliceText, " +", @"+");
        unit.MetaUnit.Language = unit.Language;

        Phoneme phoneme = Localor.GetPhoneme(unit.Language);
        foreach (TtsMetaPhone phone in unit.MetaUnit.Phones)
        {
            if (phoneme.TtsVowelPhones.IndexOf(phone.Name) < 0)
            {
                continue;
            }

            vowelTotal++;
        }

        result.Add(unit);
    }

    if (vowelTotal <= MaxVowelCountInSyllable)
    {
        return result;
    }

    string message = string.Format(CultureInfo.InvariantCulture,
        "There are more than {0} vowel phone in this syllable [{1}], which is supposed to contain no more than one vowel phone",
        MaxVowelCountInSyllable, syllable.Text);
    throw new InvalidDataException(message);
}
/// <summary>
/// Write one log line describing a unit: its index, the next index, the expected
/// meta unit id, the selected wave unit's sample offset and length and, when the
/// selected wave unit has features, ten "selected/expected" feature pairs.
/// </summary>
/// <param name="writer">Stream writer to save the information.</param>
/// <param name="index">Index of the unit to save.</param>
/// <param name="expectedUnit">Expected unit from front-end.</param>
/// <param name="selectedNode">Selected node through unit selection.</param>
/// <returns>Next unit index (index + 1).</returns>
private static int SaveUnit(StreamWriter writer, int index, TtsUnit expectedUnit, CostNode selectedNode)
{
    CultureInfo invariant = CultureInfo.InvariantCulture;
    int nextIndex = index + 1;

    writer.Write(index.ToString(invariant) + " ");
    writer.Write(nextIndex.ToString(invariant) + " ");
    writer.Write(expectedUnit.MetaUnit.Id.ToString(invariant) + " ");
    writer.Write(selectedNode.WaveUnit.SampleOffset.ToString(invariant) + " ");
    writer.Write(selectedNode.WaveUnit.SampleLength.ToString(invariant) + " ");

    TtsUnitFeature selected = selectedNode.WaveUnit.Features;
    if (selected != null)
    {
        TtsUnitFeature expected = expectedUnit.Feature;

        // NOTE(review): the PosInSentence and TtsStress formats have a space after the
        // slash while the other eight do not. Kept verbatim because the log layout may
        // be consumed elsewhere - confirm before normalising.
        StringBuilder pairs = new StringBuilder();
        pairs
            .AppendFormat(invariant, "{0}/ {1} ", (int)selected.PosInSentence, (int)expected.PosInSentence)
            .AppendFormat(invariant, "{0}/{1} ", (int)selected.PosInWord, (int)expected.PosInWord)
            .AppendFormat(invariant, "{0}/{1} ", (int)selected.PosInSyllable, (int)expected.PosInSyllable)
            .AppendFormat(invariant, "{0}/{1} ", (int)selected.LeftContextPhone, (int)expected.LeftContextPhone)
            .AppendFormat(invariant, "{0}/{1} ", (int)selected.RightContextPhone, (int)expected.RightContextPhone)
            .AppendFormat(invariant, "{0}/{1} ", (int)selected.LeftContextTone, (int)expected.LeftContextTone)
            .AppendFormat(invariant, "{0}/{1} ", (int)selected.RightContextTone, (int)expected.RightContextTone)
            .AppendFormat(invariant, "{0}/ {1} ", (int)selected.TtsStress, (int)expected.TtsStress)
            .AppendFormat(invariant, "{0}/{1} ", (int)selected.TtsEmphasis, (int)expected.TtsEmphasis)
            .AppendFormat(invariant, "{0}/{1} ", (int)selected.TtsWordTone, (int)expected.TtsWordTone);

        writer.Write(pairs.ToString());
    }

    // Explicit CRLF terminator, independent of the writer's NewLine setting.
    writer.Write("\r\n");

    return nextIndex;
}
/// <summary>
/// Build a TTS unit from one text line: the space-separated fields are handed to the
/// feature parser starting at field index 0, and the meta unit is obtained from
/// ParseMetaUnitForViterbi using the whole line.
/// </summary>
/// <param name="line">Text line describing the unit.</param>
/// <param name="language">Language used to construct the unit.</param>
/// <returns>The parsed unit.</returns>
private static TtsUnit ParseTtsUnit(string line, Language language)
{
    TtsUnit parsed = new TtsUnit(language);

    char[] separators = new char[] { ' ' };
    string[] fields = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    parsed.Feature = new TtsUnitFeature();
    parsed.Feature.Parse(fields, 0);

    parsed.MetaUnit = ParseMetaUnitForViterbi(line);

    return parsed;
}
/// <summary>
/// Split a syllable into slices and create one unit per non-empty slice.
/// </summary>
/// <param name="syllable">Syllable.</param>
/// <param name="sliceData">Slice data; supplies the unit language and is passed to the position estimator.</param>
/// <param name="pronunciationSeparator">Pronunciation separator used to split the syllable text.</param>
/// <returns>Units, in slice order.</returns>
private static Collection<TtsUnit> BuildUnitsForSyllable(ScriptSyllable syllable,
    SliceData sliceData, PronunciationSeparator pronunciationSeparator)
{
    Debug.Assert(syllable != null);
    Debug.Assert(sliceData != null);

    string unstressedText = Core.Pronunciation.RemoveStress(syllable.Text.Trim());
    string[] slices = pronunciationSeparator.SplitSlices(unstressedText);
    PosInSyllable[] positions = EstimatePosInSyllable(slices, sliceData);

    Collection<TtsUnit> result = new Collection<TtsUnit>();

    for (int i = 0; i < slices.Length; i++)
    {
        string sliceText = slices[i].Trim();
        if (string.IsNullOrEmpty(sliceText))
        {
            continue;
        }

        TtsUnit unit = new TtsUnit(sliceData.Language);

        // Only the final slice carries the syllable's break level; the others get Phone.
        if (i == slices.Length - 1)
        {
            unit.TtsBreak = syllable.TtsBreak;
        }
        else
        {
            unit.TtsBreak = TtsBreak.Phone;
        }

        unit.Feature.PosInSyllable = positions[i];

        // Emphasis and stress are copied from the syllable to every unit.
        unit.Feature.TtsEmphasis = syllable.TtsEmphasis;
        unit.Feature.TtsStress = syllable.Stress;

        // Unit name: each run of spaces in the slice becomes a single '+'.
        unit.MetaUnit.Name = Regex.Replace(sliceText, " +", @"+");
        unit.MetaUnit.Language = unit.Language;

        result.Add(unit);
    }

    return result;
}
/// <summary>
/// Create a unit from the attributes of the reader's current node and append it to
/// the utterance's script units.
/// </summary>
/// <remarks>
/// "val" becomes the unit name. "iSyll", "iWord", "iSent", "em" and "st" are optional
/// and parsed as enum values when present. "lPh" and "rPh" are always passed to the
/// phoneme's phone-to-id conversion.
/// </remarks>
/// <param name="reader">XML text reader to read data from.</param>
/// <param name="utterance">Target utterance to save result units.</param>
private static void ParseUnit(XmlTextReader reader, TtsUtterance utterance)
{
    TtsUnit unit = new TtsUnit(utterance.Script.Language);
    unit.MetaUnit.Name = reader.GetAttribute("val");

    string posInSyllableText = reader.GetAttribute("iSyll");
    if (posInSyllableText != null)
    {
        unit.Feature.PosInSyllable =
            (PosInSyllable)Enum.Parse(typeof(PosInSyllable), posInSyllableText);
    }

    string posInWordText = reader.GetAttribute("iWord");
    if (posInWordText != null)
    {
        unit.Feature.PosInWord =
            (PosInWord)Enum.Parse(typeof(PosInWord), posInWordText);
    }

    string posInSentenceText = reader.GetAttribute("iSent");
    if (posInSentenceText != null)
    {
        unit.Feature.PosInSentence =
            (PosInSentence)Enum.Parse(typeof(PosInSentence), posInSentenceText);
    }

    // Context phones are converted unconditionally, even when the attribute is absent.
    Phoneme phoneme = Localor.GetPhoneme(utterance.Script.Language, utterance.Script.Engine);
    unit.Feature.LeftContextPhone = phoneme.TtsPhone2Id(reader.GetAttribute("lPh"));
    unit.Feature.RightContextPhone = phoneme.TtsPhone2Id(reader.GetAttribute("rPh"));

    string emphasisText = reader.GetAttribute("em");
    if (emphasisText != null)
    {
        unit.Feature.TtsEmphasis =
            (TtsEmphasis)Enum.Parse(typeof(TtsEmphasis), emphasisText);
    }

    string stressText = reader.GetAttribute("st");
    if (stressText != null)
    {
        unit.Feature.TtsStress =
            (TtsStress)Enum.Parse(typeof(TtsStress), stressText);
    }

    utterance.Script.Units.Add(unit);
}
/// <summary>
/// Map every phone of a TTS unit to its Speech Recognition phone(s) and return them
/// as one flat array, in phone order.
/// </summary>
/// <param name="unit">TtsUnit to be processed.</param>
/// <returns>SR phone array.</returns>
/// <exception cref="InvalidDataException">
/// A phone of the unit has no Speech Recognition mapping (the mapping returned null).
/// </exception>
private string[] ConvertToSrPhone(TtsUnit unit)
{
    List<string> converted = new List<string>();

    foreach (TtsMetaPhone ttsPhone in unit.MetaUnit.Phones)
    {
        string[] mapped = Phoneme.Tts2SrPhones(ttsPhone.Name);
        if (mapped != null)
        {
            // One TTS phone may expand to several SR phones.
            converted.AddRange(mapped);
            continue;
        }

        string message = string.Format(CultureInfo.InvariantCulture,
            "Invalid TTS phone[{0}], which can not be converted to Speech Recognition Phone.",
            ttsPhone.Name);
        throw new InvalidDataException(message);
    }

    return converted.ToArray();
}
/// <summary>
/// Write the Speech Recognition phones of one unit to the mono MLF writer, one phone per line.
/// </summary>
/// <param name="writer">Text writer to save MLF file; when null nothing is written.</param>
/// <param name="unit">Unit.</param>
private void BuildMonoMlf(TextWriter writer, TtsUnit unit)
{
    // The conversion always runs, even when there is no writer to receive the result.
    string[] phones = ConvertToSrPhone(unit);

    for (int i = 0; i < phones.Length; i++)
    {
        if (writer == null)
        {
            break;
        }

        writer.WriteLine(phones[i]);
    }
}
/// <summary>
/// Convert the phones of a unit to Speech Recognition phones and optionally write
/// them to the MLF writer, one phone per line.
/// </summary>
/// <param name="unit">Unit.</param>
/// <param name="item">Script item; its id is attached to any reported error.</param>
/// <param name="sw">Text writer.</param>
/// <param name="writeToFile">Whether to write the converted phones to the writer.</param>
/// <param name="phoneme">Phoneme used for the TTS-to-SR phone conversion.</param>
/// <returns>Errors; one entry per phone that could not be converted.</returns>
private static ErrorSet BuildMonoMlf(TtsUnit unit, ScriptItem item, StreamWriter sw,
    bool writeToFile, Phoneme phoneme)
{
    Debug.Assert(unit != null);
    Debug.Assert(item != null);

    ErrorSet errors = new ErrorSet();
    List<string> convertedPhones = new List<string>();

    foreach (TtsMetaPhone ttsPhone in unit.MetaUnit.Phones)
    {
        string[] mapped = phoneme.Tts2SrPhones(ttsPhone.Name);
        if (mapped != null)
        {
            convertedPhones.AddRange(mapped);
        }
        else
        {
            // An unmappable phone is recorded and skipped; the remaining phones are still processed.
            string message = string.Format(CultureInfo.InvariantCulture,
                "Invalid TTS phone[{0}], which can not be converted to Speech Recognition Phone.",
                ttsPhone.Name);
            errors.Add(ScriptError.OtherErrors, item.Id, message);
        }
    }

    if (!writeToFile)
    {
        return errors;
    }

    foreach (string srPhone in convertedPhones)
    {
        sw.WriteLine(srPhone);
    }

    return errors;
}