/// <summary>
/// Validates this lexicon against a phone set and a POS set.
/// Both inputs must be supplied and must declare the same language as the lexicon;
/// the actual validation is then delegated to the three-argument overload with no
/// lexical attribute schema.
/// </summary>
/// <param name="ttsPhoneSet">TTS phone set.</param>
/// <param name="ttsPosSet">TTS POS set.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
/// <exception cref="InvalidDataException">
/// The language of either argument differs from the lexicon's language.
/// </exception>
public void Validate(TtsPhoneSet ttsPhoneSet, TtsPosSet ttsPosSet)
{
    // The POS set is examined first, so it is the one reported when both arguments are null.
    if (ttsPosSet == null)
    {
        throw new ArgumentNullException("ttsPosSet");
    }

    if (ttsPhoneSet == null)
    {
        throw new ArgumentNullException("ttsPhoneSet");
    }

    bool posSetLanguageMatches = ttsPosSet.Language.Equals(Language);
    if (!posSetLanguageMatches)
    {
        string posSetMessage = Error.BuildMessage(
            CommonError.NotConsistentLanguage,
            Language.ToString(),
            "lexicon",
            ttsPosSet.Language.ToString(),
            "pos set");
        throw new InvalidDataException(posSetMessage);
    }

    bool phoneSetLanguageMatches = ttsPhoneSet.Language.Equals(Language);
    if (!phoneSetLanguageMatches)
    {
        string phoneSetMessage = Error.BuildMessage(
            CommonError.NotConsistentLanguage,
            Language.ToString(),
            "lexicon",
            ttsPhoneSet.Language.ToString(),
            "phone set");
        throw new InvalidDataException(phoneSetMessage);
    }

    Validate(ttsPhoneSet, ttsPosSet, null);
}
/// <summary>
/// Validates this lexicon against a phone set and a lexical attribute schema.
/// Both inputs must be supplied and must declare the same language as the lexicon;
/// the actual validation is then delegated to the three-argument overload with no
/// POS set.
/// </summary>
/// <param name="ttsPhoneSet">TTS phone set.</param>
/// <param name="attributeSchema">TTS attribute schema.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
/// <exception cref="InvalidDataException">
/// The language of either argument differs from the lexicon's language.
/// </exception>
public void Validate(TtsPhoneSet ttsPhoneSet, LexicalAttributeSchema attributeSchema)
{
    // The schema is examined first, so it is the one reported when both arguments are null.
    if (attributeSchema == null)
    {
        throw new ArgumentNullException("attributeSchema");
    }

    if (ttsPhoneSet == null)
    {
        throw new ArgumentNullException("ttsPhoneSet");
    }

    bool schemaLanguageMatches = attributeSchema.Language.Equals(Language);
    if (!schemaLanguageMatches)
    {
        string schemaMessage = Error.BuildMessage(
            CommonError.NotConsistentLanguage,
            Language.ToString(),
            "lexicon",
            attributeSchema.Language.ToString(),
            "lexical attribute Schema");
        throw new InvalidDataException(schemaMessage);
    }

    bool phoneSetLanguageMatches = ttsPhoneSet.Language.Equals(Language);
    if (!phoneSetLanguageMatches)
    {
        string phoneSetMessage = Error.BuildMessage(
            CommonError.NotConsistentLanguage,
            Language.ToString(),
            "lexicon",
            ttsPhoneSet.Language.ToString(),
            "phone set");
        throw new InvalidDataException(phoneSetMessage);
    }

    Validate(ttsPhoneSet, null, attributeSchema);
}
/// <summary>
/// Parses <paramref name="line"/> as a condition line when it looks like one.
/// </summary>
/// <param name="line">Line to inspect.</param>
/// <param name="phoneSet">Phone set forwarded to the condition-line parser.</param>
/// <param name="polyphonyWord">Rule the parsed pronunciation belongs to; may be null.</param>
/// <param name="errorSet">Receives every error found while parsing the line.</param>
/// <returns>True when the line is a condition line (even if it contained errors); otherwise false.</returns>
private bool TryParseConditionLine(string line, TtsPhoneSet phoneSet,
    PolyphonyRule polyphonyWord, ErrorSet errorSet)
{
    if (!IsConditionLine(line))
    {
        // Not a condition line: nothing is parsed and nothing is reported.
        return false;
    }

    // A condition line without a current rule has no key line to attach to.
    // NOTE(review): ParseConditionLine also reports MissKeyValueLine when the rule is null
    // and parsing produced no MustFix error, so this error may be recorded twice for one
    // line - confirm whether that is intended.
    if (polyphonyWord == null)
    {
        errorSet.Add(PolyRuleError.MissKeyValueLine, line);
    }

    ErrorSet conditionErrors = ParseConditionLine(line, phoneSet, polyphonyWord);
    errorSet.AddRange(conditionErrors);

    return true;
}
/// <summary>
/// Validates the lexicon against the phone set plus a POS set and/or a lexical attribute schema.
/// The dependent data is validated first; lexical items are only checked when none of the
/// dependent data carries a MustFix error. The lexicon is flagged as validated at the end
/// either way.
/// </summary>
/// <param name="ttsPhoneSet">TTS phone set; must not be null.</param>
/// <param name="ttsPosSet">TTS POS set; may be null when a schema is given.</param>
/// <param name="attributeSchema">Lexical attribute schema; may be null when a POS set is given.</param>
private void Validate(TtsPhoneSet ttsPhoneSet, TtsPosSet ttsPosSet,
    LexicalAttributeSchema attributeSchema)
{
    Debug.Assert(ttsPhoneSet != null);
    Debug.Assert(ttsPosSet != null || attributeSchema != null);

    bool dependenciesUsable = true;

    ttsPhoneSet.Validate();
    if (ttsPhoneSet.ErrorSet.Contains(ErrorSeverity.MustFix))
    {
        ErrorSet.Add(LexiconError.InvalidDependentData, "Phone set");
        dependenciesUsable = false;
    }

    if (ttsPosSet != null)
    {
        ttsPosSet.Validate();
        if (ttsPosSet.ErrorSet.Contains(ErrorSeverity.MustFix))
        {
            ErrorSet.Add(LexiconError.InvalidDependentData, "POS set");
            dependenciesUsable = false;
        }
    }

    if (attributeSchema != null)
    {
        attributeSchema.Validate();
        if (attributeSchema.ErrorSet.Contains(ErrorSeverity.MustFix))
        {
            ErrorSet.Add(LexiconError.InvalidDependentData, "Lexical Attribute Schema");
            dependenciesUsable = false;
        }
    }

    if (dependenciesUsable)
    {
        // Every item is validated (and its errors merged) even after a valid one was found.
        bool anyItemValid = false;
        foreach (LexicalItem item in Items.Values)
        {
            ErrorSet itemErrors = item.Validate(ttsPhoneSet, ttsPosSet, attributeSchema);
            ErrorSet.Merge(itemErrors);

            if (!anyItemValid && item.Valid)
            {
                anyItemValid = true;
            }
        }

        // A lexicon without a single valid item is reported as empty.
        if (!anyItemValid)
        {
            ErrorSet.Add(LexiconError.EmptyLexicon);
        }
    }

    validated = true;
}
/// <summary>
/// Parses one condition line into a pronunciation with its conditions and, when the line
/// is free of MustFix errors, appends it to <paramref name="polyphonyWord"/>.
/// </summary>
/// <param name="line">Condition line to parse.</param>
/// <param name="phoneSet">Phone set used to validate the pronunciation; validation is skipped when null.</param>
/// <param name="polyphonyWord">Rule that receives the parsed pronunciation; may be null.</param>
/// <returns>The errors found on this line (empty when the line parsed cleanly).</returns>
private ErrorSet ParseConditionLine(string line, TtsPhoneSet phoneSet, PolyphonyRule polyphonyWord)
{
    ErrorSet lineErrors = new ErrorSet();

    Match lineMatch = Regex.Match(line, ConditionLineRegex);
    if (lineMatch.Groups.Count < 3)
    {
        lineErrors.Add(PolyRuleError.InvalidConditionFormat, line);
        return lineErrors;
    }

    PolyphonyPron parsedPron = new PolyphonyPron();
    parsedPron.Pron = lineMatch.Groups[2].Value.Trim();

    // Allow empty pronunciation for polyphony rule.
    bool shouldCheckPron = !string.IsNullOrEmpty(parsedPron.Pron) && phoneSet != null;
    if (shouldCheckPron)
    {
        lineErrors.AddRange(Pronunciation.Validate(parsedPron.Pron, phoneSet));
    }

    string conditionText = lineMatch.Groups[1].Value.Trim();
    MatchCollection conditionMatches = Regex.Matches(conditionText, ConditionRegex);
    foreach (Match conditionMatch in conditionMatches)
    {
        PolyphonyCondition parsedCondition = new PolyphonyCondition();
        ParsePolyCondition(conditionMatch.Value.Trim(), parsedCondition, lineErrors);
        parsedPron.Conditions.Add(parsedCondition);
    }

    // A condition line must contain at least one recognizable condition.
    if (conditionMatches.Count == 0)
    {
        lineErrors.Add(PolyRuleError.InvalidConditionFormat, line);
        return lineErrors;
    }

    // Pronunciations that carry MustFix errors are never attached to the rule.
    if (lineErrors.GetSeverityCount(ErrorSeverity.MustFix) != 0)
    {
        return lineErrors;
    }

    if (polyphonyWord == null)
    {
        lineErrors.Add(PolyRuleError.MissKeyValueLine, line);
    }
    else
    {
        polyphonyWord.PolyphonyProns.Add(parsedPron);
    }

    return lineErrors;
}
/// <summary>
/// Loads a polyphony rule file line by line.
/// Blank lines and comment lines are skipped, domain tags update the domain passed to the
/// next key line, leading key declarations form the head of the file, and every remaining
/// line is tried as a key line and then as a condition line. Duplicate word definitions
/// and duplicate rule conditions are checked once the whole file has been read.
/// </summary>
/// <param name="filePath">Path of the polyphony rule file; must exist and be a Unicode file.</param>
/// <param name="phoneSet">Phone set forwarded to condition-line parsing; may be null.</param>
/// <returns>All errors collected while loading the file.</returns>
/// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null or empty.</exception>
/// <exception cref="InvalidDataException">The file is not a Unicode file.</exception>
public ErrorSet Load(string filePath, TtsPhoneSet phoneSet)
{
    // This validation is needed by Fxcop checking parameters.
    if (phoneSet == null)
    {
        phoneSet = null;
    }

    if (string.IsNullOrEmpty(filePath))
    {
        throw new ArgumentNullException("filePath");
    }

    if (!File.Exists(filePath))
    {
        throw Helper.CreateException(typeof(FileNotFoundException), filePath);
    }

    if (!Helper.IsUnicodeFile(filePath))
    {
        string notUnicodeMessage = Helper.NeutralFormat(
            "Polyphony rule file [{0}] is not unicode.", filePath);
        throw new InvalidDataException(notUnicodeMessage);
    }

    ErrorSet errorSet = new ErrorSet();
    _keyTypes.Clear();

    bool headerDone = false;
    bool expectFirstKey = true;
    PolyphonyRule currentRule = null;
    string domain = DomainItem.GeneralDomain;
    int lineNumber = 0;

    foreach (string rawLine in Helper.FileLines(filePath, Encoding.Unicode, false))
    {
        // Line numbers count every physical line, including skipped ones.
        lineNumber++;

        string content = rawLine.Trim();
        if (string.IsNullOrEmpty(content) || IsComment(content))
        {
            continue;
        }

        if (IsDomainTag(content))
        {
            ParseDomainKey(content, ref domain);
            continue;
        }

        ErrorSet lineErrors = new ErrorSet();

        // Head of the file: key declarations, ended by the first line that is not one.
        if (!headerDone)
        {
            bool isKeyDeclaration = TryParseKeyDeclear(content, ref expectFirstKey, lineErrors);
            AddParseError(errorSet, lineNumber, lineErrors);
            if (isKeyDeclaration)
            {
                continue;
            }

            headerDone = true;
        }

        PolyruleKeys.Instance.KeyTypes = _keyTypes;

        // Key line attempt. The domain falls back to the general domain after every
        // attempt, whether or not the line turned out to be a key line.
        lineErrors.Clear();
        bool isKeyLine = TryParseKeyLine(content, ref currentRule, lineErrors, domain);
        domain = DomainItem.GeneralDomain;
        AddParseError(errorSet, lineNumber, lineErrors);
        if (isKeyLine)
        {
            continue;
        }

        // Condition line attempt.
        lineErrors.Clear();
        bool isConditionLine = TryParseConditionLine(content, phoneSet, currentRule, lineErrors);
        AddParseError(errorSet, lineNumber, lineErrors);
        if (isConditionLine)
        {
            continue;
        }

        // The line matched none of the known line kinds.
        errorSet.Add(
            PolyRuleError.InvalidLineFormat,
            lineNumber.ToString(CultureInfo.InvariantCulture),
            content);
    }

    // The rule still held when the file ends is stored here.
    if (currentRule != null)
    {
        _polyphonyWords.Add(currentRule);
    }

    if (string.IsNullOrEmpty(_keyString))
    {
        errorSet.Add(PolyRuleError.MissPrimaryKey, filePath);
    }

    errorSet.AddRange(CheckDupWordDefinitions());

    foreach (PolyphonyRule loadedRule in _polyphonyWords)
    {
        errorSet.AddRange(loadedRule.CheckDupRuleConditions());
    }

    return errorSet;
}
/// <summary>
/// Validates the symbolic pronunciation of a word against the phone set.
/// Every pronunciation error is reported as a <c>LexiconError.PronunciationError</c>;
/// the pronunciation is marked invalid only for MustFix errors other than the four
/// vowel / sonorant count violations.
/// </summary>
/// <param name="word">Word the pronunciation belongs to (used in error reports).</param>
/// <param name="lexPron">Lexicon pronunciation to validate; its Valid flag may be cleared.</param>
/// <param name="ttsPhoneSet">TTS phone set.</param>
/// <param name="errorSet">Receives the reported errors.</param>
private static void ValidatePronunciation(string word, LexiconPronunciation lexPron,
    TtsPhoneSet ttsPhoneSet, ErrorSet errorSet)
{
    bool hasBlockingError = false;

    ErrorSet pronunciationErrors = Pronunciation.Validate(lexPron.Symbolic, ttsPhoneSet);
    foreach (Error pronunciationError in pronunciationErrors.Errors)
    {
        errorSet.Add(LexiconError.PronunciationError, pronunciationError, word);

        if (pronunciationError.Severity != ErrorSeverity.MustFix)
        {
            continue;
        }

        // Count violations are reported but do not invalidate the pronunciation.
        bool isCountViolation =
            pronunciationError.Enum.Equals(PronunciationError.VowelAndSonorantCountLessThanMinimum) ||
            pronunciationError.Enum.Equals(PronunciationError.VowelAndSonorantCountGreaterThanMaximum) ||
            pronunciationError.Enum.Equals(PronunciationError.VowelCountLessThanMinimum) ||
            pronunciationError.Enum.Equals(PronunciationError.VowelCountGreaterThanMaximum);

        if (!isCountViolation)
        {
            hasBlockingError = true;
        }
    }

    // An already-invalid pronunciation stays invalid.
    lexPron.Valid = lexPron.Valid && !hasBlockingError;
}
/// <summary>
/// Validates this lexicon item.
/// Duplicate pronunciation nodes are first merged into the first node with the same
/// symbolic pronunciation; each remaining pronunciation and its properties are then
/// validated. The item is marked invalid when none of its pronunciations is valid.
/// </summary>
/// <param name="ttsPhoneSet">Phone set to validate lexicon item's pronunciation.</param>
/// <param name="ttsPosSet">Pos set of the lexicon item.</param>
/// <param name="attributeSchema">Attribute schema.</param>
/// <returns>Error set of the validation.</returns>
public ErrorSet Validate(TtsPhoneSet ttsPhoneSet, TtsPosSet ttsPosSet,
    LexicalAttributeSchema attributeSchema)
{
    Debug.Assert(ttsPhoneSet != null);
    Debug.Assert(ttsPosSet != null || attributeSchema != null);

    ErrorSet errorSet = new ErrorSet();

    // Pass 1: merge duplicate pronunciation nodes, keeping the first occurrence of each.
    Collection<LexiconPronunciation> uniquePronunciations = new Collection<LexiconPronunciation>();
    Dictionary<string, int> indexBySymbolic = new Dictionary<string, int>();

    foreach (LexiconPronunciation candidate in Pronunciations)
    {
        int firstIndex;
        if (!indexBySymbolic.TryGetValue(candidate.Symbolic, out firstIndex))
        {
            // First time this pronunciation is seen: remember where it is stored.
            indexBySymbolic[candidate.Symbolic] = uniquePronunciations.Count;
            uniquePronunciations.Add(candidate);
            continue;
        }

        errorSet.Add(LexiconError.DuplicatePronunciationNode, Grapheme, candidate.Symbolic);
        candidate.Valid = false;

        // Move the duplicate's properties onto the node that was kept.
        LexiconPronunciation keptNode = uniquePronunciations[firstIndex];
        foreach (LexiconItemProperty duplicateProperty in candidate.Properties)
        {
            Collection<LexiconItemProperty> keptProperties = keptNode.Properties;
            if (keptProperties.Contains(duplicateProperty))
            {
                errorSet.Add(LexiconError.DuplicateProperty, Grapheme, candidate.Symbolic);
            }
            else
            {
                keptProperties.Add(duplicateProperty);
            }
        }
    }

    _pronunciations = uniquePronunciations;

    // Pass 2: validate every remaining pronunciation together with its properties.
    int invalidPronunciationCount = 0;
    foreach (LexiconPronunciation pronunciation in Pronunciations)
    {
        // pronunciation.Valid will be false if the pronunciation contains a blocking error.
        ValidatePronunciation(Grapheme, pronunciation, ttsPhoneSet, errorSet);

        int invalidPropertyCount = 0;
        foreach (LexiconItemProperty property in pronunciation.Properties)
        {
            // Lexicon schema ensures that the POS property is existed
            Debug.Assert(property.PartOfSpeech != null);

            ErrorSet posErrors = PosItem.Validate(property.PartOfSpeech.Value, ttsPosSet, attributeSchema);
            if (posErrors.Count > 0)
            {
                errorSet.Add(LexiconError.UnrecognizedPos, Grapheme, pronunciation.Symbolic,
                    property.PartOfSpeech.Value);
                property.Valid = false;
            }

            if (attributeSchema != null)
            {
                ErrorSet attributeErrors = ValidateAttributeSet(property, attributeSchema);
                foreach (Error attributeError in attributeErrors.Errors)
                {
                    errorSet.Add(LexiconError.AttributeError, attributeError, Grapheme,
                        pronunciation.Symbolic);
                }

                if (attributeErrors.Contains(ErrorSeverity.MustFix))
                {
                    property.Valid = false;
                }
            }

            // A property may use either the attribute set or the case/gender/number
            // fields, but not both at once.
            bool mixesDefinitions = property.AttributeSet.Count > 0 &&
                (property.Case != null || property.Gender != null || property.Number != null);
            if (mixesDefinitions)
            {
                errorSet.Add(LexiconError.MixedPropertyDefinition, Grapheme, pronunciation.Symbolic);
                property.Valid = false;
            }
            else
            {
                ValidateCase(Grapheme, property, errorSet);
                ValidateGender(Grapheme, property, errorSet);
                ValidateNumber(Grapheme, property, errorSet);
            }

            if (!property.Valid)
            {
                invalidPropertyCount++;
            }
        }

        // A pronunciation whose properties are all invalid (or that has none) is invalid.
        if (invalidPropertyCount == pronunciation.Properties.Count)
        {
            pronunciation.Valid = false;
        }

        if (!pronunciation.Valid)
        {
            invalidPronunciationCount++;
        }
    }

    // The item is invalid when no pronunciation survived validation.
    if (invalidPronunciationCount == Pronunciations.Count)
    {
        Valid = false;
    }

    return errorSet;
}
/// <summary>
/// Rebuilds the phone table of this forest: the table is cleared and then receives one
/// entry per distinct phone label found in the tree list.
/// </summary>
/// <param name="phoneSet">Phone set used to convert phone string label to phone instance.</param>
public void BuildPhones(TtsPhoneSet phoneSet)
{
    Helper.ThrowIfNull(phoneSet);

    _phones.Clear();

    foreach (DecisionTree tree in TreeList)
    {
        // Several trees may share a phone; only the first one creates the entry.
        if (_phones.ContainsKey(tree.Phone))
        {
            continue;
        }

        _phones.Add(tree.Phone, phoneSet.ToPhone(tree.Phone));
    }
}
/// <summary>
/// Extracts a domain lexicon from the script files named in a domain list file.
/// Each listed script is loaded and converted into a lexicon; when both a phone set and an
/// attribute schema are supplied the lexicon is validated, and lexicons with errors are
/// logged and skipped. The accepted lexicons are merged into a single result.
/// </summary>
/// <param name="scriptFolder">Script Folder.</param>
/// <param name="domainListFile">Domain List File; each line names a script file inside the script folder.</param>
/// <param name="inMainLex">Input Main Lexicon.</param>
/// <param name="defaultPartOfSpeech">Default Part of Speech.</param>
/// <param name="mergeMode">Merging Mode for Lexicon.</param>
/// <param name="phoneSet">Phone set; validation is skipped when null.</param>
/// <param name="attribSchema">Lexical attribute schema; validation is skipped when null.</param>
/// <returns>
/// The merged lexicon, or null when the default part of speech is not recognized by the
/// schema or when no domain lexicon was accepted (empty list, or every script skipped).
/// </returns>
/// <exception cref="InvalidDataException">
/// A script file declares a language different from the lexicon built so far.
/// </exception>
private Lexicon ExtractDomainLexicon(string scriptFolder, string domainListFile,
    Lexicon inMainLex, string defaultPartOfSpeech, MergeMode mergeMode,
    TtsPhoneSet phoneSet, LexicalAttributeSchema attribSchema)
{
    if (attribSchema != null &&
        PosItem.Validate(defaultPartOfSpeech, null, attribSchema).Count > 0)
    {
        Log("Default Part of speech {0} is unrecognized according to attribute schema, extraction breaks",
            defaultPartOfSpeech);
        return null;
    }

    Lexicon outLex = null;
    foreach (string domainName in Helper.FileLines(domainListFile))
    {
        string domainFilePath = Path.Combine(scriptFolder, domainName);
        XmlScriptFile scriptFile = new XmlScriptFile();
        scriptFile.Load(domainFilePath);

        if (outLex != null && outLex.Language != scriptFile.Language)
        {
            throw new InvalidDataException(Helper.NeutralFormat(
                "Found inconsistent language \"{0}\" against previous one \"{1}\" in the file of \"{2}\"",
                scriptFile.Language.ToString(), outLex.Language.ToString(), domainFilePath));
        }

        Lexicon lexicon = Lexicon.CreateFromXmlScriptFile(scriptFile, defaultPartOfSpeech, inMainLex);

        // Validation needs both the phone set and the schema; otherwise the lexicon is taken as is.
        if (phoneSet != null && attribSchema != null)
        {
            lexicon.Validate(phoneSet, attribSchema);
            if (lexicon.ErrorSet.Count > 0)
            {
                Console.Error.WriteLine("The script file {0} contains {1} errors, skip!",
                    domainFilePath, lexicon.ErrorSet.Count);
                Log("The script file {0} contains {1} errors:", domainFilePath, lexicon.ErrorSet.Count);
                foreach (Error error in lexicon.ErrorSet.Errors)
                {
                    Log(error.ToString());
                }

                // Skip this domain lexicon
                continue;
            }
        }

        if (outLex == null)
        {
            outLex = lexicon;
        }
        else
        {
            MergeLexicon(outLex, lexicon, mergeMode);
        }
    }

    // outLex is still null when the domain list was empty or every domain lexicon was
    // skipped; treat that as an empty result instead of dereferencing null.
    if (outLex == null || outLex.Items.Count == 0)
    {
        Log("The final lexicon is empty.");
    }

    return outLex;
}
/// <summary>
/// Builds a map between the phone name and its type id.
/// Names are converted with <c>Phoneme.ToHtk</c>; every id between 0 and the largest phone
/// id that no phone uses is filled with a generated null-phoneme placeholder so that the
/// ids in the map are continuous and start from 0.
/// </summary>
/// <param name="phoneSet">The given phoneset; must not be null.</param>
/// <returns>The Dictionary which key is unit name and the value is unit index id.</returns>
public static Dictionary<string, int> BuildPhoneNameIdMap(TtsPhoneSet phoneSet)
{
    // Reject a null phone set up front instead of failing on the first member access.
    Helper.ThrowIfNull(phoneSet);

    int maxId = 0;
    Dictionary<string, int> result = new Dictionary<string, int>();

    // Ids taken by real phones. Looking ids up here keeps the padding pass linear;
    // Dictionary.ContainsValue would rescan the whole map for every id.
    HashSet<int> usedIds = new HashSet<int>();

    // Adds the phone one by one.
    foreach (Phone phone in phoneSet.Phones)
    {
        // Please notice here, the phone set file contains the runtime silence, but not the silence.
        string name = Phoneme.ToHtk(phone.Name);
        result.Add(name, phone.Id);
        usedIds.Add(phone.Id);

        if (phone.Id > maxId)
        {
            maxId = phone.Id;
        }
    }

    // Ensures there is continuous and starts from 0.
    for (int i = 0; i <= maxId; ++i)
    {
        if (!usedIds.Contains(i))
        {
            // Adds a null phoneme here for padding.
            result.Add(Helper.NeutralFormat("_{0}_{1}_", Phoneme.Null, i), i);
        }
    }

    return result;
}