/// <summary>
/// Validates this configuration against the POS tagging POS set loaded from
/// the given lexical attribute schema.
/// </summary>
/// <param name="attrSchema">Lexical attribute schema the POS tagging POS set is loaded from.</param>
/// <exception cref="InvalidDataException">
/// Thrown when the languages mismatch, when the default POS or a first-char rule's
/// target POS is missing from the POS set, or when a first-char list contains white space.
/// </exception>
public void Validate(LexicalAttributeSchema attrSchema)
{
    TtsPosSet taggingPosSet = TtsPosSet.LoadPosTaggingPosFromSchema(attrSchema);

    // NOTE(review): the POS set is loaded from attrSchema itself, while the message
    // reports this object's Language - confirm which operand was intended here.
    if (taggingPosSet.Language != attrSchema.Language)
    {
        string languageMessage = Helper.NeutralFormat(
            "Mismatch language between attribute schema [{0}] and common rule [{1}]",
            Localor.LanguageToString(attrSchema.Language),
            Localor.LanguageToString(Language));
        throw new InvalidDataException(languageMessage);
    }

    if (!taggingPosSet.Items.ContainsKey(_defaultPos))
    {
        string defaultPosMessage = Helper.NeutralFormat(
            "Can't find default pos [{0}] in pos tagging pos set",
            _defaultPos);
        throw new InvalidDataException(defaultPosMessage);
    }

    foreach (CharFirstRuleConfig rule in _charFirstRules)
    {
        // The target POS of every first-char rule must be known to the POS set.
        if (!taggingPosSet.Items.ContainsKey(rule.TargetPos))
        {
            string rulePosMessage = Helper.NeutralFormat(
                "Can't find first char rule [charList={0}]'s pos [{1}] in pos tagging pos set",
                rule.FirstCharList,
                rule.TargetPos);
            throw new InvalidDataException(rulePosMessage);
        }

        // White space is rejected anywhere in the first-char list.
        foreach (char candidate in rule.FirstCharList)
        {
            if (!char.IsWhiteSpace(candidate))
            {
                continue;
            }

            string whiteSpaceMessage = Helper.NeutralFormat(
                "Can't contain white space in first char list [{0}]",
                rule.FirstCharList);
            throw new InvalidDataException(whiteSpaceMessage);
        }
    }
}
/// <summary>
/// Reads the lexicon schema file path from the XML configuration, checks that the
/// schema loads and validates without must-fix errors, then loads the POS set from it.
/// </summary>
/// <param name="dom">XML configuration document.</param>
/// <param name="nsmgr">Namespace manager used to resolve the configuration item.</param>
/// <exception cref="FileNotFoundException">Thrown when the schema file is not found.</exception>
/// <exception cref="InvalidDataException">Thrown when the schema contains must-fix errors.</exception>
private void ParseLexiconSchema(XmlDocument dom, XmlNamespaceManager nsmgr)
{
    LexiconSchemaFile = ParseFilePath(dom, nsmgr, LexiconSchemaFileItem);

    if (!Helper.FileValidExists(LexiconSchemaFile))
    {
        string notFoundMessage = string.Format(
            CultureInfo.InvariantCulture,
            "Lexicon schema file \"{0}\" not found",
            LexiconSchemaFile);
        throw new FileNotFoundException(notFoundMessage);
    }

    LexicalAttributeSchema schema = new LexicalAttributeSchema(FontLanguage);
    schema.Load(LexiconSchemaFile);
    schema.Validate();

    bool hasMustFixErrors = schema.ErrorSet.Contains(ErrorSeverity.MustFix);
    if (hasMustFixErrors)
    {
        // Dump the collected errors to stderr before failing.
        schema.ErrorSet.Export(Console.Error);

        string invalidMessage = string.Format(
            CultureInfo.InvariantCulture,
            "Please fix the error of lexicon schema file \"{0}\"",
            LexiconSchemaFile);
        throw new InvalidDataException(invalidMessage);
    }

    PosSet = TtsPosSet.LoadFromSchema(LexiconSchemaFile);
}
/// <summary>
/// Validates the lexicon against the TTS phone set and either the POS set or the
/// lexical attribute schema. Items are only validated when all supplied
/// dependent data is itself free of must-fix errors.
/// </summary>
/// <param name="ttsPhoneSet">TTS phone set; must not be null.</param>
/// <param name="ttsPosSet">TTS POS set; may be null when a schema is given.</param>
/// <param name="attributeSchema">Lexical attribute schema; may be null when a POS set is given.</param>
private void Validate(TtsPhoneSet ttsPhoneSet, TtsPosSet ttsPosSet,
    LexicalAttributeSchema attributeSchema)
{
    Debug.Assert(ttsPhoneSet != null);
    Debug.Assert(ttsPosSet != null || attributeSchema != null);

    int invalidDependencyCount = 0;

    ttsPhoneSet.Validate();
    if (ttsPhoneSet.ErrorSet.Contains(ErrorSeverity.MustFix))
    {
        ErrorSet.Add(LexiconError.InvalidDependentData, "Phone set");
        invalidDependencyCount++;
    }

    if (ttsPosSet != null)
    {
        ttsPosSet.Validate();
        if (ttsPosSet.ErrorSet.Contains(ErrorSeverity.MustFix))
        {
            ErrorSet.Add(LexiconError.InvalidDependentData, "POS set");
            invalidDependencyCount++;
        }
    }

    if (attributeSchema != null)
    {
        attributeSchema.Validate();
        if (attributeSchema.ErrorSet.Contains(ErrorSeverity.MustFix))
        {
            ErrorSet.Add(LexiconError.InvalidDependentData, "Lexical Attribute Schema");
            invalidDependencyCount++;
        }
    }

    if (invalidDependencyCount == 0)
    {
        bool anyValidItem = false;
        foreach (LexicalItem item in Items.Values)
        {
            ErrorSet.Merge(item.Validate(ttsPhoneSet, ttsPosSet, attributeSchema));
            if (!anyValidItem && item.Valid)
            {
                anyValidItem = true;
            }
        }

        // A lexicon without a single valid item is reported as empty.
        if (!anyValidItem)
        {
            ErrorSet.Add(LexiconError.EmptyLexicon);
        }
    }

    validated = true;
}
/// <summary>
/// Validates the lexicon against a phone set and a lexical attribute schema,
/// after checking that both use the same language as the lexicon.
/// </summary>
/// <param name="ttsPhoneSet">TTS phone set.</param>
/// <param name="attributeSchema">TTS lexical attribute schema.</param>
/// <exception cref="ArgumentNullException">Thrown when either argument is null.</exception>
/// <exception cref="InvalidDataException">
/// Thrown when the schema or phone set language differs from the lexicon language.
/// </exception>
public void Validate(TtsPhoneSet ttsPhoneSet, LexicalAttributeSchema attributeSchema)
{
    if (attributeSchema == null)
    {
        throw new ArgumentNullException("attributeSchema");
    }

    if (ttsPhoneSet == null)
    {
        throw new ArgumentNullException("ttsPhoneSet");
    }

    bool schemaLanguageMatches = attributeSchema.Language.Equals(Language);
    if (!schemaLanguageMatches)
    {
        string schemaMessage = Error.BuildMessage(
            CommonError.NotConsistentLanguage,
            Language.ToString(),
            "lexicon",
            attributeSchema.Language.ToString(),
            "lexical attribute Schema");
        throw new InvalidDataException(schemaMessage);
    }

    bool phoneSetLanguageMatches = ttsPhoneSet.Language.Equals(Language);
    if (!phoneSetLanguageMatches)
    {
        string phoneSetMessage = Error.BuildMessage(
            CommonError.NotConsistentLanguage,
            Language.ToString(),
            "lexicon",
            ttsPhoneSet.Language.ToString(),
            "phone set");
        throw new InvalidDataException(phoneSetMessage);
    }

    // No POS set is passed to the shared validation routine; only the schema is used.
    Validate(ttsPhoneSet, null, attributeSchema);
}
/// <summary>
/// Parses one line into words, replacing the current word list. Only words that
/// parse without any error are kept; all word errors are returned.
/// </summary>
/// <param name="line">Line to parse; must not be null or empty.</param>
/// <param name="attributeSchema">Lexical attribute schema handed to each word parser.</param>
/// <returns>Errors collected from all words of the line.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="line"/> is null or empty.</exception>
public ErrorSet Parse(string line, LexicalAttributeSchema attributeSchema)
{
    if (string.IsNullOrEmpty(line))
    {
        throw new ArgumentNullException("line");
    }

    ErrorSet lineErrors = new ErrorSet();
    _words.Clear();

    string[] tokens = line.Split(_wordDelimeters, StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < tokens.Length; i++)
    {
        PosCorpusWord parsedWord = new PosCorpusWord();
        ErrorSet tokenErrors = parsedWord.Parse(tokens[i], attributeSchema);
        lineErrors.AddRange(tokenErrors);

        if (tokenErrors.Count != 0)
        {
            // Words with errors are reported but not stored.
            continue;
        }

        _words.Add(parsedWord);
    }

    return lineErrors;
}
/// <summary>
/// Parses a word/POS pair, splitting at the last occurrence of the delimiter,
/// and stores the word text and POS on this instance.
/// </summary>
/// <param name="wordPosPair">Text holding the word, the delimiter and the POS.</param>
/// <param name="attributeSchema">
/// Optional schema; when given, the POS is mapped to its POS tagging POS,
/// otherwise the POS is stored as written.
/// </param>
/// <returns>Errors found while parsing; empty on success.</returns>
public ErrorSet Parse(string wordPosPair, LexicalAttributeSchema attributeSchema)
{
    ErrorSet errors = new ErrorSet();

    int delimiterIndex = wordPosPair.LastIndexOf(WordPosDelimeter);
    int lastIndex = wordPosPair.Length - 1;

    if (delimiterIndex < 0 || delimiterIndex > lastIndex)
    {
        errors.Add(PosCorpusError.InvalidFormat, wordPosPair);
        return errors;
    }

    if (delimiterIndex == 0)
    {
        errors.Add(PosCorpusError.EmptyWord, wordPosPair);
        return errors;
    }

    if (delimiterIndex == lastIndex)
    {
        errors.Add(PosCorpusError.EmptyPos, wordPosPair);
        return errors;
    }

    WordText = wordPosPair.Substring(0, delimiterIndex);
    string rawPos = wordPosPair.Substring(delimiterIndex + 1);

    if (attributeSchema == null)
    {
        Pos = rawPos;
        return errors;
    }

    string taggingPos = attributeSchema.GetPosTaggingPos(rawPos);
    if (string.IsNullOrEmpty(taggingPos))
    {
        errors.Add(PosCorpusError.NoPosTaggingPos, rawPos);
    }
    else
    {
        Pos = taggingPos;
    }

    return errors;
}
/// <summary>
/// Saves the schema (the feature set and, per category, its mean, inverse
/// standard deviation and value group).
/// </summary>
/// <remarks>
/// Category and value names are lower-cased with the invariant culture and matched
/// ordinally, so the "state" / "phoneidentity" detection does not depend on the
/// current thread culture.
/// </remarks>
/// <param name="language">The language.</param>
/// <param name="schemaFile">The schema file to load.</param>
/// <param name="phoneToIdIndexes">Phone name to id map, consulted for "phoneidentity" categories.</param>
/// <param name="writer">Writer receiving the binary data.</param>
/// <param name="stringPool">String pool receiving the feature names.</param>
/// <returns>Size of bytes written out.</returns>
public uint WriteSchema(Language language, string schemaFile,
    Dictionary<string, string> phoneToIdIndexes, DataWriter writer, StringPool stringPool)
{
    Helper.ThrowIfFileNotExist(schemaFile);
    Helper.ThrowIfNull(phoneToIdIndexes);
    Helper.ThrowIfNull(writer);
    Helper.ThrowIfNull(stringPool);
    Helper.ThrowIfNull(language);

    uint size = 0;
    LexicalAttributeSchema schema = new LexicalAttributeSchema(language);
    schema.Load(schemaFile);

    // Distinct feature names in first-seen order; featureIndex doubles as the
    // membership test so de-duplication is O(1) per category.
    List<string> featureList = new List<string>();
    Dictionary<string, uint> featureIndex = new Dictionary<string, uint>();
    HashSet<string> stateFeatures = new HashSet<string>();
    int stateFeatureCount = 0;

    for (int i = 0; i < schema.Categories.Count; i++)
    {
        string name = schema.Categories[i].Name.ToLowerInvariant();

        // string.Contains(string) is an ordinal search.
        if (name.Contains("state"))
        {
            stateFeatureCount++;
            stateFeatures.Add(name);
        }

        if (!featureIndex.ContainsKey(name))
        {
            featureIndex.Add(name, (uint)featureList.Count);
            featureList.Add(name);
        }
    }

    // write state feature count.
    size += writer.Write((uint)stateFeatures.Count);
    size += writer.Write((uint)stateFeatureCount);

    // write total feature count.
    size += writer.Write((uint)featureList.Count);

    foreach (string feature in featureList)
    {
        // The pool length is written before the feature name is put into the pool.
        size += writer.Write((uint)stringPool.Length);
        stringPool.PutString(feature);
    }

    // write feature category
    size += writer.Write((uint)schema.Categories.Count);
    for (int i = 0; i < schema.Categories.Count; i++)
    {
        var category = schema.Categories[i];
        string featureName = category.Name.ToLowerInvariant();
        bool isPhoneIdentity = featureName.Contains("phoneidentity");

        // feature index
        size += writer.Write(featureIndex[featureName]);

        // mean
        size += writer.Write(category.Mean);

        // invStdDev
        size += writer.Write(category.InvStdDev);

        // value count
        size += writer.Write((uint)category.Values.Count);

        for (int k = 0; k < category.Values.Count; k++)
        {
            string valueName = category.Values[k].Name.ToLowerInvariant();

            // Phone identity values are translated to their phone id when known;
            // everything else is expected to already be a numeric string.
            string id;
            if (!isPhoneIdentity || !phoneToIdIndexes.TryGetValue(valueName, out id))
            {
                id = valueName;
            }

            try
            {
                size += writer.Write(uint.Parse(id));
            }
            catch (System.FormatException)
            {
                // NOTE(review): non-numeric values are skipped although the value count
                // written above still includes them - confirm readers tolerate this.
                continue;
            }
        }
    }

    Debug.Assert(size % sizeof(uint) == 0, "Data must be 4-byte aligned.");

    return size;
}
/// <summary>
/// Loads a POS corpus file, replacing the current paragraphs. Each non-blank line
/// is parsed as one paragraph; only paragraphs without errors are kept.
/// </summary>
/// <param name="filePath">Path of the corpus file; the file must be a UNICODE file.</param>
/// <param name="attributeSchema">Lexical attribute schema handed to the paragraph parser.</param>
/// <returns>Parse errors, each wrapped together with its line number.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="filePath"/> is null or empty.</exception>
/// <exception cref="InvalidDataException">Thrown when the file is not a UNICODE file.</exception>
public ErrorSet Load(string filePath, LexicalAttributeSchema attributeSchema)
{
    if (string.IsNullOrEmpty(filePath))
    {
        throw new ArgumentNullException("filePath");
    }

    if (!File.Exists(filePath))
    {
        throw Helper.CreateException(typeof(FileNotFoundException), filePath);
    }

    if (!Helper.IsUnicodeFile(filePath))
    {
        string formatMessage = Helper.NeutralFormat(
            "Invalid corpus file format(not UNICODE), should be UNICODE.");
        throw new InvalidDataException(formatMessage);
    }

    _paragraphs.Clear();

    ErrorSet errorsWithLine = new ErrorSet();
    int currentLine = 0;
    foreach (string rawLine in Helper.FileLines(filePath, Encoding.Unicode, false))
    {
        // Blank lines are skipped but still counted.
        currentLine++;
        if (rawLine.Trim().Length == 0)
        {
            continue;
        }

        PosCorpusParagraph parsedParagraph = new PosCorpusParagraph();
        ErrorSet paragraphErrors = parsedParagraph.Parse(rawLine, attributeSchema);

        if (paragraphErrors.Errors.Count != 0)
        {
            foreach (Error paragraphError in paragraphErrors.Errors)
            {
                errorsWithLine.Add(PosCorpusError.ErrorWithLine,
                    paragraphError,
                    currentLine.ToString(CultureInfo.InvariantCulture));
            }

            continue;
        }

        Debug.Assert(parsedParagraph.Words.Count > 0);
        _paragraphs.Add(parsedParagraph);
    }

    _filePath = filePath;
    return errorsWithLine;
}
/// <summary>
/// Validates every attribute of a lexicon item property against the schema:
/// the category must exist, must not be the POS category, and must contain
/// the attribute's value.
/// </summary>
/// <param name="property">Lexicon item property whose attribute set is checked.</param>
/// <param name="attributeSchema">Lexical attribute schema; must not be null.</param>
/// <returns>Errors found; empty when all attributes are valid.</returns>
private static ErrorSet ValidateAttributeSet(LexiconItemProperty property,
    LexicalAttributeSchema attributeSchema)
{
    Debug.Assert(attributeSchema != null);

    ErrorSet errors = new ErrorSet();
    foreach (KeyValuePair<string, List<AttributeItem>> entry in property.AttributeSet)
    {
        string categoryName = entry.Key;
        foreach (AttributeItem item in entry.Value)
        {
            AttributeCategory category = attributeSchema.GetRootCategory(categoryName);
            if (category == null)
            {
                errors.Add(LexicalAttributeError.InvalidCategory, categoryName);
                continue;
            }

            // The POS category is reported as an invalid definition in an attribute set.
            if (category.Name.Equals(LexicalAttributeSchema.PosCategoryName, StringComparison.Ordinal))
            {
                errors.Add(LexicalAttributeError.InvalidDefinitionForPos, item.Value);
                continue;
            }

            bool isKnownValue = false;
            foreach (AttributeValue candidate in category.Values)
            {
                if (candidate.Name.Equals(item.Value, StringComparison.Ordinal))
                {
                    isKnownValue = true;
                    break;
                }
            }

            if (!isKnownValue)
            {
                errors.Add(LexicalAttributeError.InvalidValue, item.Value, categoryName);
            }
        }
    }

    return errors;
}
/// <summary>
/// Loads and validates the lexical attribute schema found at this object's path.
/// </summary>
/// <param name="errorSet">Receives the schema's validation errors; must not be null.</param>
/// <returns>The loaded schema, or null when it contains must-fix errors.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="errorSet"/> is null.</exception>
internal override object LoadDataObject(ErrorSet errorSet)
{
    if (errorSet == null)
    {
        throw new ArgumentNullException("errorSet");
    }

    LexicalAttributeSchema loadedSchema = new LexicalAttributeSchema();
    loadedSchema.Load(this.Path);
    loadedSchema.Validate();

    // Errors are always reported to the caller, even when the schema is rejected.
    errorSet.Merge(loadedSchema.ErrorSet);

    return loadedSchema.ErrorSet.Contains(ErrorSeverity.MustFix) ? null : loadedSchema;
}
/// <summary>
/// Validates this lexicon item: merges duplicate pronunciation nodes, then checks
/// every pronunciation and each of its properties (POS, attribute set, case,
/// gender, number), updating the Valid flags along the way.
/// </summary>
/// <param name="ttsPhoneSet">Phone set used to validate the pronunciations; must not be null.</param>
/// <param name="ttsPosSet">POS set; may be null when a schema is given.</param>
/// <param name="attributeSchema">Attribute schema; may be null when a POS set is given.</param>
/// <returns>Error set of the validation.</returns>
public ErrorSet Validate(TtsPhoneSet ttsPhoneSet, TtsPosSet ttsPosSet,
    LexicalAttributeSchema attributeSchema)
{
    Debug.Assert(ttsPhoneSet != null);
    Debug.Assert(ttsPosSet != null || attributeSchema != null);

    ErrorSet errors = new ErrorSet();

    // Pass 1: keep the first node of every symbolic pronunciation and fold the
    // properties of later duplicates into it.
    Collection<LexiconPronunciation> uniquePronunciations = new Collection<LexiconPronunciation>();
    Dictionary<string, int> indexBySymbolic = new Dictionary<string, int>();
    foreach (LexiconPronunciation pronunciation in Pronunciations)
    {
        int firstIndex;
        if (!indexBySymbolic.TryGetValue(pronunciation.Symbolic, out firstIndex))
        {
            indexBySymbolic[pronunciation.Symbolic] = uniquePronunciations.Count;
            uniquePronunciations.Add(pronunciation);
            continue;
        }

        errors.Add(LexiconError.DuplicatePronunciationNode, Grapheme, pronunciation.Symbolic);
        pronunciation.Valid = false;

        foreach (LexiconItemProperty duplicateProperty in pronunciation.Properties)
        {
            Collection<LexiconItemProperty> keptProperties =
                uniquePronunciations[firstIndex].Properties;
            if (keptProperties.Contains(duplicateProperty))
            {
                errors.Add(LexiconError.DuplicateProperty, Grapheme, pronunciation.Symbolic);
            }
            else
            {
                keptProperties.Add(duplicateProperty);
            }
        }
    }

    _pronunciations = uniquePronunciations;

    // Pass 2: validate the remaining pronunciations and their properties.
    int invalidPronunciationCount = 0;
    foreach (LexiconPronunciation pronunciation in Pronunciations)
    {
        // pronunciation.Valid will be false if it contains an error.
        ValidatePronunciation(Grapheme, pronunciation, ttsPhoneSet, errors);

        int invalidPropertyCount = 0;
        foreach (LexiconItemProperty property in pronunciation.Properties)
        {
            // Lexicon schema ensures that the POS property exists.
            Debug.Assert(property.PartOfSpeech != null);

            ErrorSet posErrors = PosItem.Validate(property.PartOfSpeech.Value, ttsPosSet, attributeSchema);
            if (posErrors.Count > 0)
            {
                errors.Add(LexiconError.UnrecognizedPos, Grapheme, pronunciation.Symbolic,
                    property.PartOfSpeech.Value);
                property.Valid = false;
            }

            if (attributeSchema != null)
            {
                ErrorSet attributeErrors = ValidateAttributeSet(property, attributeSchema);
                foreach (Error attributeError in attributeErrors.Errors)
                {
                    errors.Add(LexiconError.AttributeError, attributeError, Grapheme,
                        pronunciation.Symbolic);
                }

                if (attributeErrors.Contains(ErrorSeverity.MustFix))
                {
                    property.Valid = false;
                }
            }

            // An attribute set must not be combined with case/gender/number definitions.
            bool mixedDefinition = property.AttributeSet.Count > 0 &&
                (property.Case != null || property.Gender != null || property.Number != null);
            if (!mixedDefinition)
            {
                ValidateCase(Grapheme, property, errors);
                ValidateGender(Grapheme, property, errors);
                ValidateNumber(Grapheme, property, errors);
            }
            else
            {
                errors.Add(LexiconError.MixedPropertyDefinition, Grapheme, pronunciation.Symbolic);
                property.Valid = false;
            }

            if (!property.Valid)
            {
                invalidPropertyCount++;
            }
        }

        // A pronunciation whose properties are all invalid is invalid itself.
        if (pronunciation.Properties.Count == invalidPropertyCount)
        {
            pronunciation.Valid = false;
        }

        if (!pronunciation.Valid)
        {
            invalidPronunciationCount++;
        }
    }

    // The item is invalid when none of its pronunciations is valid.
    if (Pronunciations.Count == invalidPronunciationCount)
    {
        Valid = false;
    }

    return errors;
}
/// <summary>
/// Validates a POS string against the lexical attribute schema and/or the POS set,
/// whichever of the two is supplied.
/// </summary>
/// <param name="posStr">POS string to be validated.</param>
/// <param name="ttsPosSet">TTS POS set for validation; may be null.</param>
/// <param name="attributeSchema">Lexical attribute schema for validation; may be null.</param>
/// <returns>Error set holding an unrecognized-POS error when validation fails; otherwise empty.</returns>
public static ErrorSet Validate(string posStr, TtsPosSet ttsPosSet,
    LexicalAttributeSchema attributeSchema)
{
    ErrorSet result = new ErrorSet();

    // The schema is consulted first; the POS set is only checked when the schema
    // did not already reject the POS.
    bool unrecognized = false;
    if (attributeSchema != null)
    {
        string generated = attributeSchema.GenerateString(
            LexicalAttributeSchema.PosCategoryName, posStr);
        unrecognized = string.IsNullOrEmpty(generated);
    }

    if (!unrecognized && ttsPosSet != null)
    {
        unrecognized = !ttsPosSet.Items.ContainsKey(posStr);
    }

    if (unrecognized)
    {
        result.Add(PosError.UnrecognizedPos, posStr);
    }

    return result;
}
/// <summary>
/// Extracts a domain lexicon from the script files named in the domain list file,
/// merging the per-domain lexicons into one.
/// </summary>
/// <param name="scriptFolder">Folder the domain script files are resolved against.</param>
/// <param name="domainListFile">File listing one domain script file name per line.</param>
/// <param name="inMainLex">Input main lexicon.</param>
/// <param name="defaultPartOfSpeech">Default part of speech.</param>
/// <param name="mergeMode">Merging mode for the lexicons.</param>
/// <param name="phoneSet">Phone set; domain lexicons are validated only when this and the schema are given.</param>
/// <param name="attribSchema">Lexical attribute schema.</param>
/// <returns>
/// The merged lexicon, or null when the default part of speech is not recognized by
/// the schema or when no domain lexicon was accepted.
/// </returns>
private Lexicon ExtractDomainLexicon(string scriptFolder, string domainListFile,
    Lexicon inMainLex, string defaultPartOfSpeech, MergeMode mergeMode,
    TtsPhoneSet phoneSet, LexicalAttributeSchema attribSchema)
{
    if (attribSchema != null &&
        PosItem.Validate(defaultPartOfSpeech, null, attribSchema).Count > 0)
    {
        Log("Default Part of speech {0} is unrecognized according to attribute schema, extraction breaks",
            defaultPartOfSpeech);
        return null;
    }

    Lexicon outLex = null;
    foreach (string domainName in Helper.FileLines(domainListFile))
    {
        string domainFilePath = Path.Combine(scriptFolder, domainName);
        XmlScriptFile scriptFile = new XmlScriptFile();
        scriptFile.Load(domainFilePath);

        if (outLex != null && outLex.Language != scriptFile.Language)
        {
            throw new InvalidDataException(Helper.NeutralFormat(
                "Found inconsistent language \"{0}\" against previous one \"{1}\" in the file of \"{2}\"",
                scriptFile.Language.ToString(),
                outLex.Language.ToString(), domainFilePath));
        }

        Lexicon lexicon = Lexicon.CreateFromXmlScriptFile(scriptFile, defaultPartOfSpeech, inMainLex);

        if (phoneSet != null && attribSchema != null)
        {
            lexicon.Validate(phoneSet, attribSchema);
            if (lexicon.ErrorSet.Count > 0)
            {
                Console.Error.WriteLine("The script file {0} contains {1} errors, skip!",
                    domainFilePath, lexicon.ErrorSet.Count);
                Log("The script file {0} contains {1} errors:",
                    domainFilePath, lexicon.ErrorSet.Count);
                foreach (Error error in lexicon.ErrorSet.Errors)
                {
                    Log(error.ToString());
                }

                // Skip this domain lexicon
                continue;
            }
        }

        if (outLex == null)
        {
            outLex = lexicon;
        }
        else
        {
            MergeLexicon(outLex, lexicon, mergeMode);
        }
    }

    // outLex stays null when the domain list is empty or every domain lexicon was
    // skipped, so it must be checked before its items are inspected.
    if (outLex == null || outLex.Items.Count == 0)
    {
        Log("The final lexicon is empty.");
    }

    return outLex;
}
/// <summary>
/// Validates the configured language data files: each configured file is loaded and
/// its language is compared with the expected one (the unit table is instead
/// checked for being empty).
/// </summary>
/// <param name="language">Expected language of the data files.</param>
/// <returns>Error set with one error per mismatching or empty data file.</returns>
public ErrorSet ValidateLanguageData(Language language)
{
    ErrorSet errorSet = new ErrorSet();

    if (IsEmpty())
    {
        Trace.WriteLine("Using stocked language data with tools...");
        return errorSet;
    }

    if (!string.IsNullOrEmpty(_phoneSet))
    {
        TtsPhoneSet ttsPhoneSet = new TtsPhoneSet();
        ttsPhoneSet.Load(PhoneSet);
        if (ttsPhoneSet.Language != language)
        {
            errorSet.Add(new Error(VoiceCreationLanguageDataError.MismatchLanguage,
                Localor.LanguageToString(language),
                Localor.LanguageToString(ttsPhoneSet.Language),
                Localor.PhoneSetFileName, PhoneSet));
        }
    }

    if (!string.IsNullOrEmpty(_unitTable))
    {
        SliceData sliceData = new SliceData();
        sliceData.Language = language;
        sliceData.Load(UnitTable);
        if (sliceData.IsEmpty())
        {
            errorSet.Add(new Error(VoiceCreationLanguageDataError.EmptyLanguageDataFile,
                Localor.LanguageToString(language),
                Localor.UnitTableFileName, UnitTable));
        }
    }

    if (!string.IsNullOrEmpty(_lexicalAttributeSchema))
    {
        LexicalAttributeSchema lexicalAttributeSchema = new LexicalAttributeSchema();
        lexicalAttributeSchema.Load(LexicalAttributeSchema);
        if (lexicalAttributeSchema.Language != language)
        {
            // Report the schema's own file name rather than the phone set file name.
            errorSet.Add(new Error(VoiceCreationLanguageDataError.MismatchLanguage,
                Localor.LanguageToString(language),
                Localor.LanguageToString(lexicalAttributeSchema.Language),
                Localor.LexicalAttributeSchemaFileName, LexicalAttributeSchema));
        }
    }

    if (!string.IsNullOrEmpty(_truncateRule))
    {
        TruncateRuleData truncateRuleData = new TruncateRuleData();
        truncateRuleData.Load(TruncateRule);
        if (truncateRuleData.Language != language)
        {
            errorSet.Add(new Error(VoiceCreationLanguageDataError.MismatchLanguage,
                Localor.LanguageToString(language),
                Localor.LanguageToString(truncateRuleData.Language),
                Localor.TruncateRulesFileName, TruncateRule));
        }
    }

    if (!string.IsNullOrEmpty(_ttsToSapiVisemeId))
    {
        PhoneMap phoneMap = PhoneMap.CreatePhoneMap(TtsToSapiVisemeId);
        if (phoneMap.Language != language)
        {
            errorSet.Add(new Error(VoiceCreationLanguageDataError.MismatchLanguage,
                Localor.LanguageToString(language),
                Localor.LanguageToString(phoneMap.Language),
                Localor.TtsToSapiVisemeIdFileName, TtsToSapiVisemeId));
        }
    }

    if (!string.IsNullOrEmpty(_ttsToSrPhone))
    {
        PhoneMap phoneMap = PhoneMap.CreatePhoneMap(TtsToSrPhone);
        if (phoneMap.Language != language)
        {
            errorSet.Add(new Error(VoiceCreationLanguageDataError.MismatchLanguage,
                Localor.LanguageToString(language),
                Localor.LanguageToString(phoneMap.Language),
                Localor.TtsToSrPhoneFileName, TtsToSrPhone));
        }
    }

    if (!string.IsNullOrEmpty(_ttsToIpaPhone))
    {
        PhoneMap phoneMap = PhoneMap.CreatePhoneMap(TtsToIpaPhone);
        if (phoneMap.Language != language)
        {
            errorSet.Add(new Error(VoiceCreationLanguageDataError.MismatchLanguage,
                Localor.LanguageToString(language),
                Localor.LanguageToString(phoneMap.Language),
                Localor.TtsToIpaPhoneFileName, TtsToIpaPhone));
        }
    }

    if (!string.IsNullOrEmpty(_fontMeta))
    {
        // NOTE(review): the font meta file is loaded as a PhoneMap like the phone maps
        // above - confirm this is intended and not a copy/paste leftover.
        PhoneMap phoneMap = PhoneMap.CreatePhoneMap(FontMeta);
        if (phoneMap.Language != language)
        {
            errorSet.Add(new Error(VoiceCreationLanguageDataError.MismatchLanguage,
                Localor.LanguageToString(language),
                Localor.LanguageToString(phoneMap.Language),
                Localor.FontMetaFileName, FontMeta));
        }
    }

    return errorSet;
}
/// <summary>
/// Gets the lexical attribute schema of a language, loading it from the language
/// resource on first use and caching it for later calls.
/// </summary>
/// <param name="language">The language.</param>
/// <returns>The lexical attribute schema, or null when no resource reader is available.</returns>
public static LexicalAttributeSchema GetLexicalAttributeSchema(Language language)
{
    LexicalAttributeSchema cachedSchema;
    if (_ttsLexicalAttributeSchemaMap.TryGetValue(language, out cachedSchema))
    {
        return cachedSchema;
    }

    using (StreamReader resource = Localor.LoadResource(language, Localor.LexicalAttributeSchemaFileName))
    {
        if (resource == null)
        {
            // Nothing is cached when the resource is missing.
            return null;
        }

        LexicalAttributeSchema loadedSchema = new LexicalAttributeSchema(language);
        loadedSchema.Load(resource);
        _ttsLexicalAttributeSchemaMap[language] = loadedSchema;
        return loadedSchema;
    }
}