/// <summary>
/// Loads the lexicon found at this object's path, using a case-sensitive content controller.
/// </summary>
/// <param name="errorSet">ErrorSet; not used by this implementation.</param>
/// <returns>The loaded <see cref="Lexicon"/>, returned as an object.</returns>
internal override object LoadDataObject(ErrorSet errorSet)
{
    Lexicon loaded = new Lexicon(this.Language);

    // Load with case sensitivity switched on.
    Lexicon.ContentControler caseSensitiveControler = new Lexicon.ContentControler
    {
        IsCaseSensitive = true
    };

    loaded.Load(this.Path, caseSensitiveControler);
    return loaded;
}
/// <summary>
/// Gets the lexicon for a language, loading it from the language resource
/// and caching it the first time it is requested.
/// </summary>
/// <param name="language">Language whose lexicon is wanted.</param>
/// <returns>
/// The cached or newly loaded lexicon; null when no lexicon resource
/// could be opened for the language.
/// </returns>
public static Lexicon GetLexicon(Language language)
{
    // Single lookup instead of ContainsKey followed by the indexer.
    Lexicon lexicon;
    if (_ttsLexiconMap.TryGetValue(language, out lexicon))
    {
        return lexicon;
    }

    using (StreamReader reader = Localor.LoadResource(language, Localor.LexiconFileName))
    {
        if (reader != null)
        {
            lexicon = new Lexicon(language);
            lexicon.Load(reader);

            // Only successfully loaded lexicons are cached; a missing resource is retried next time.
            _ttsLexiconMap[language] = lexicon;
        }
    }

    return lexicon;
}
/// <summary>
/// Makes sure the validator has both a lexicon and a phone set, loading
/// each from its file path when it has not been supplied, and records
/// the shared language.
/// </summary>
/// <exception cref="InvalidDataException">
/// The phone set language differs from the lexicon language.
/// </exception>
public void EnsureInitialized()
{
    Debug.Assert(LexiconFilePath != null || Lexicon != null);
    Debug.Assert(PhoneSetFilePath != null || PhoneSet != null);

    // Load lazily: only what has not been provided yet.
    if (_lexicon == null)
    {
        _lexicon = new Lexicon();
        _lexicon.Load(LexiconFilePath);
    }

    if (_phoneset == null)
    {
        _phoneset = new TtsPhoneSet();
        _phoneset.Load(PhoneSetFilePath);
    }

    bool sameLanguage = _phoneset.Language == _lexicon.Language;
    if (!sameLanguage)
    {
        throw new InvalidDataException(
            Utility.Helper.NeutralFormat("phoneset and lexicon language should match"));
    }

    _language = _lexicon.Language;
}
/// <summary>
/// Imports the items of a domain lexicon into this lexicon.
/// Words already present are merged item by item; unknown words are
/// cloned in with their word and pronunciation frequencies reset to zero.
/// </summary>
/// <param name="domainLex">Domain lexicon to import.</param>
/// <param name="trustDomainLexicon">Whether the domain lexicon is trusted; forwarded to the per-item import.</param>
/// <exception cref="InvalidDataException">
/// This lexicon already carries a domain tag, or <paramref name="domainLex"/> is not a valid domain lexicon.
/// </exception>
public void ImportDomainLexicon(Lexicon domainLex, bool trustDomainLexicon)
{
    if (!string.IsNullOrEmpty(_domainTag))
    {
        string notUnified = string.Format(
            "Target lexicon is not a unified lexicon, it is in \"{0}\" domain.", _domainTag);
        throw new InvalidDataException(notUnified);
    }

    if (!ValidateDomainLexicon(domainLex))
    {
        throw new InvalidDataException("The lexicon to import is not a domain lexicon.");
    }

    foreach (LexicalItem incoming in domainLex.Items.Values)
    {
        LexicalItem existing;
        if (_items.TryGetValue(incoming.Grapheme, out existing))
        {
            // Known word: merge the domain data into the existing item and collect its errors.
            ErrorSet mergeErrors = existing.ImportDomainLexicalItem(
                incoming, domainLex.DomainTag, trustDomainLexicon);
            ErrorSet.Merge(mergeErrors);
            continue;
        }

        // Unknown word: take a copy with all frequencies zeroed.
        LexicalItem copy = incoming.Clone();
        copy.Frequency = 0;
        foreach (LexiconPronunciation copiedPron in copy.Pronunciations)
        {
            copiedPron.Frequency = 0;
        }

        _items.Add(incoming.Grapheme, copy);
    }
}
/// <summary>
/// Builds a lexicon out of every pronounced word found in an XML script file.
/// </summary>
/// <param name="scriptFile">Xml script file to read words from.</param>
/// <param name="defaultPos">Part of speech used for words that carry no POS string.</param>
/// <param name="mainLexicon">
/// Optional main lexicon; when it holds a matching pronunciation for a word,
/// that pronunciation's symbolic form is used instead of the script's.
/// </param>
/// <returns>Lexicon in the script file's language.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="scriptFile"/> is null, or <paramref name="defaultPos"/> is null or empty.
/// </exception>
public static Lexicon CreateFromXmlScriptFile(XmlScriptFile scriptFile, string defaultPos, Lexicon mainLexicon)
{
    if (scriptFile == null)
    {
        throw new ArgumentNullException("scriptFile");
    }

    if (string.IsNullOrEmpty(defaultPos))
    {
        throw new ArgumentNullException("defaultPos");
    }

    Lexicon result = new Lexicon(scriptFile.Language);

    foreach (ScriptItem scriptItem in scriptFile.Items)
    {
        foreach (ScriptWord scriptWord in scriptItem.AllPronouncedWords)
        {
            string grapheme = scriptWord.Grapheme;

            // Pronunciation candidate taken from the script.
            LexiconPronunciation candidate = new LexiconPronunciation(result.Language);
            candidate.Symbolic = scriptWord.Pronunciation;

            if (mainLexicon != null)
            {
                // Prefer the main lexicon's spelling of the same pronunciation, if it has one.
                LexicalItem mainItem = mainLexicon.Lookup(grapheme, true);
                LexiconPronunciation mainPron = mainItem == null
                    ? null
                    : mainItem.FindPronunciation(candidate.Symbolic, true);
                if (mainPron != null)
                {
                    candidate.Symbolic = mainPron.Symbolic;
                }
            }

            LexiconItemProperty property = new LexiconItemProperty();
            property.PartOfSpeech = string.IsNullOrEmpty(scriptWord.PosString)
                ? new PosItem(defaultPos)
                : new PosItem(scriptWord.PosString);
            candidate.Properties.Add(property);

            LexicalItem knownItem;
            if (!result.Items.TryGetValue(grapheme, out knownItem))
            {
                // First occurrence of this word.
                LexicalItem created = new LexicalItem(result.Language);
                created.Grapheme = grapheme;
                created.Pronunciations.Add(candidate);
                result.Items.Add(grapheme, created);
                continue;
            }

            // Word already present: attach the property to every matching pronunciation,
            // or append the candidate when none matches.
            bool alreadyListed = false;
            foreach (LexiconPronunciation knownPron in knownItem.Pronunciations)
            {
                if (!knownPron.Symbolic.Equals(candidate.Symbolic, StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                alreadyListed = true;
                if (!knownPron.Properties.Contains(property))
                {
                    knownPron.Properties.Add(property);
                }
            }

            if (!alreadyListed)
            {
                knownItem.Pronunciations.Add(candidate);
            }
        }
    }

    return result;
}
/// <summary>
/// Saves a lexicon to a file with the given encoding.
/// </summary>
/// <param name="lexiconFilePath">Target lexicon file path.</param>
/// <param name="lexicon">Lexicon to save.</param>
/// <param name="encoding">Encoding passed through to the lexicon's Save method.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="lexiconFilePath"/> is null or empty, or <paramref name="lexicon"/> is null.
/// </exception>
public static void WriteAllData(string lexiconFilePath, Lexicon lexicon, Encoding encoding)
{
    // Same argument-validation style as ReadAllData.
    if (string.IsNullOrEmpty(lexiconFilePath))
    {
        throw new ArgumentNullException("lexiconFilePath");
    }

    if (lexicon == null)
    {
        throw new ArgumentNullException("lexicon");
    }

    lexicon.Save(lexiconFilePath, encoding);
}
/// <summary>
/// Loads one word node and stores the resulting item in this lexicon.
/// Items that fail to load or end up with no pronunciation are dropped;
/// a duplicate grapheme is reported and its pronunciations are appended
/// to the item that is already stored.
/// </summary>
/// <param name="parentLexicon">Lexicon the word belongs to.</param>
/// <param name="wordNode">XmlNode of the word.</param>
/// <param name="nsmgr">XmlNamespaceManager.</param>
/// <param name="contentController">Content controller passed to the item loader.</param>
private void LoadLexicalItem(Lexicon parentLexicon, XmlNode wordNode, XmlNamespaceManager nsmgr,
    Lexicon.ContentControler contentController)
{
    LexicalItem loaded = LexicalItem.Load(parentLexicon, wordNode, nsmgr, contentController, ErrorSet);

    // A word without any pronunciation is not kept.
    if (loaded == null || loaded.Pronunciations.Count <= 0)
    {
        return;
    }

    LexicalItem existing;
    if (_items.TryGetValue(loaded.Grapheme, out existing))
    {
        ErrorSet.Add(LexiconError.DuplicateWordEntry, loaded.Grapheme);

        // Re-parent the duplicate's pronunciations onto the stored item.
        foreach (LexiconPronunciation duplicatePron in loaded.Pronunciations)
        {
            duplicatePron.Parent = existing;
            existing.Pronunciations.Add(duplicatePron);
        }

        return;
    }

    loaded.Origin = parentLexicon._isBaseline ? LexiconOrigin.Baseline : LexiconOrigin.Current;
    _items.Add(loaded.Grapheme, loaded);
}
/// <summary>
/// Loads the lexicon from an XML document: language, optional domain tag,
/// the word entries, and finally the words of an optional baseline lexicon
/// referenced by a leading "include" element.
/// </summary>
/// <param name="xmlDoc">Lexicon XML document.</param>
/// <param name="nsmgr">Namespace manager used for the "tts" queries.</param>
/// <param name="contentController">
/// Optional <see cref="ContentControler"/>; a default one is created when null.
/// </param>
protected override void Load(XmlDocument xmlDoc, XmlNamespaceManager nsmgr, object contentController)
{
    ContentControler controler = contentController as ContentControler;
    Debug.Assert(contentController == null || controler != null);
    if (controler == null)
    {
        controler = new ContentControler();
    }

    XmlElement root = xmlDoc.DocumentElement;

    // Report a mismatch with a previously set language, then adopt the file's language.
    Language fileLanguage = Localor.StringToLanguage(root.Attributes["lang"].InnerText);
    if (!Language.Equals(Language.Neutral) && !fileLanguage.Equals(Language))
    {
        ErrorSet.Add(CommonError.NotConsistentLanguage,
            Language.ToString(), "initial one", fileLanguage.ToString(), "lexicon");
    }

    Language = fileLanguage;

    XmlNode domainAttribute = root.Attributes["domain"];
    if (domainAttribute != null)
    {
        string domainText = domainAttribute.InnerText;
        if (!string.IsNullOrEmpty(domainText))
        {
            DomainTag = domainText;
        }
    }

    // Load the words of the current lexicon.
    _items.Clear();
    foreach (XmlNode wordNode in root.SelectNodes("tts:w", nsmgr))
    {
        LoadLexicalItem(this, wordNode, nsmgr, controler);
    }

    // Resolve the baseline lexicon path from a leading <include href="..."/> element.
    string baselinePath = string.Empty;
    XmlNode firstChild = root.FirstChild;
    if (firstChild != null &&
        firstChild.LocalName == "include" &&
        firstChild.Attributes["href"] != null)
    {
        BaseLexiconRelativeFilePath = firstChild.Attributes["href"].InnerText;
        if (!string.IsNullOrEmpty(BaseLexiconRelativeFilePath))
        {
            baselinePath = Helper.GetFullPath(
                Path.GetDirectoryName(this.FilePath), BaseLexiconRelativeFilePath);
        }
    }

    if (string.IsNullOrEmpty(baselinePath) || !File.Exists(baselinePath))
    {
        return;
    }

    Lexicon baseline = new Lexicon();
    baseline._isBaseline = true;
    baseline.Load(baselinePath, controler);

    // Merge: items with "deleted" status were already dropped while loading, so neither
    // lexicon contains deleted words. A baseline word is added only when the current
    // lexicon does not define it; otherwise the current word item is kept.
    foreach (var baselineEntry in baseline.Items)
    {
        if (_items.ContainsKey(baselineEntry.Key))
        {
            continue;
        }

        _items.Add(baselineEntry.Key, baselineEntry.Value);
    }
}
/// <summary>
/// Merges a sub lexicon into the merged lexicon; for a word present in both,
/// the entry of the merged (first) lexicon is kept untouched.
/// </summary>
/// <param name="mergedLexicon">Main lexicon, which also receives the result.</param>
/// <param name="subLexicon">Lexicon whose new words are added.</param>
private void MergeLexiconWithKeepFirstOne(Lexicon mergedLexicon, Lexicon subLexicon)
{
    Collection<string> skippedMessages = new Collection<string>();
    int newWordCount = 0;

    foreach (KeyValuePair<string, LexicalItem> entry in subLexicon.Items)
    {
        LexicalItem keptItem = mergedLexicon.Lookup(entry.Key, true);
        if (keptItem != null)
        {
            // Word exists already: record which of the sub lexicon's pronunciations are left out.
            foreach (LexiconPronunciation candidate in entry.Value.Pronunciations)
            {
                if (keptItem.ContainsPronunciation(candidate.Symbolic))
                {
                    continue;
                }

                skippedMessages.Add(Helper.NeutralFormat(
                    "Pronunciation for word [{0}] has been skipped: [{1}]",
                    entry.Key, candidate.Symbolic));
            }
        }
        else
        {
            // Word is new to the merged lexicon: add it as is.
            mergedLexicon.Items.Add(entry.Key, entry.Value);
            newWordCount++;
        }
    }

    // Summary log.
    Log("---------------------------------");
    Log("Totally:");
    Log("[{0}] pronunciations have been skipped.", skippedMessages.Count);
    Log(Helper.NeutralFormat("[{0}] new words have been added by the latter lexicon", newWordCount));
}
/// <summary>
/// Merges a sub lexicon into the merged lexicon; for a word present in both,
/// the entry of the sub (last) lexicon replaces the one in the merged lexicon.
/// </summary>
/// <param name="mergedLexicon">Main lexicon, which also receives the result.</param>
/// <param name="subLexicon">Lexicon whose entries win on conflict.</param>
private void MergeLexiconWithKeepLastOne(Lexicon mergedLexicon, Lexicon subLexicon)
{
    Collection<string> replacedPronMessage = new Collection<string>();
    Collection<string> replacedPropertyMessage = new Collection<string>();
    Collection<string> existedWords = new Collection<string>();

    // Hash set: membership is tested once per sub-lexicon word further down,
    // which was quadratic with a linear collection.
    HashSet<string> existedWordsInLower = new HashSet<string>();

    // Record the pronunciations and properties of the merged lexicon that will be lost.
    foreach (KeyValuePair<string, LexicalItem> mergedLexiconItem in mergedLexicon.Items)
    {
        string word = mergedLexiconItem.Key;
        LexicalItem newItem = subLexicon.Lookup(word, true);
        if (newItem == null)
        {
            continue;
        }

        // The sub lexicon has the same grapheme, so the original entry will be removed.
        existedWords.Add(word);
        existedWordsInLower.Add(word.ToLowerInvariant());

        foreach (LexiconPronunciation originalPron in mergedLexiconItem.Value.Pronunciations)
        {
            LexiconPronunciation existedPron = newItem.FindPronunciation(originalPron.Symbolic);
            if (existedPron == null)
            {
                replacedPronMessage.Add(Helper.NeutralFormat(
                    "Pronunciation for word [{0}] has been removed: [{1}]",
                    mergedLexiconItem.Key, originalPron.Symbolic));
                continue;
            }

            foreach (LexiconItemProperty originalProperty in originalPron.Properties)
            {
                // Look for the old property in the new (sub) pronunciation.
                bool hasProperty = false;
                foreach (LexiconItemProperty itemProperty in existedPron.Properties)
                {
                    if (itemProperty.Equals(originalProperty))
                    {
                        hasProperty = true;
                        break;
                    }
                }

                // The old property does not survive the replacement: record it.
                if (!hasProperty)
                {
                    replacedPropertyMessage.Add(Helper.NeutralFormat(
                        "Property has been replaced for word [{0}]'s pronunciation [{1}] : [{2}]",
                        word, originalPron.Symbolic, originalProperty.ToString()));
                }
            }
        }
    }

    // Remove the conflicting word entries (done after enumeration to avoid mutating while iterating).
    foreach (string word in existedWords)
    {
        mergedLexicon.Items.Remove(word);
    }

    // Add every sub-lexicon entry; count the ones that did not replace anything.
    int newWord = 0;
    foreach (KeyValuePair<string, LexicalItem> subLexiconItem in subLexicon.Items)
    {
        mergedLexicon.Items.Add(subLexiconItem.Key, subLexiconItem.Value);
        if (!existedWordsInLower.Contains(subLexiconItem.Key.ToLowerInvariant()))
        {
            newWord++;
        }
    }

    // Summary log.
    // NOTE(review): the "words have been replaced" figure is the number of removed
    // pronunciations (replacedPronMessage), not of replaced words - confirm this is intended.
    Log("---------------------------------");
    Log("Totally:");
    Log(Helper.NeutralFormat("[{0}] words have been replaced by the latter lexicon", replacedPronMessage.Count));
    Log("[{0}] properties have been replaced.", replacedPropertyMessage.Count);
    Log(Helper.NeutralFormat("[{0}] new words have been added by the latter lexicon", newWord));
}
/// <summary>
/// Merges a sub lexicon into the merged lexicon and keeps everything:
/// new words, new pronunciations of known words, and new properties of
/// known pronunciations are all added.
/// </summary>
/// <param name="mergedLexicon">Main lexicon, which also receives the result.</param>
/// <param name="subLexicon">Lexicon whose content is added.</param>
private void MergeLexiconWithKeepAll(Lexicon mergedLexicon, Lexicon subLexicon)
{
    Collection<string> addedPronMessage = new Collection<string>();
    Collection<string> addedPropertyMessage = new Collection<string>();
    Collection<string> duplicateWordMessage = new Collection<string>();
    int addedWord = 0;

    foreach (KeyValuePair<string, LexicalItem> entry in subLexicon.Items)
    {
        LexicalItem target;
        if (!mergedLexicon.Items.TryGetValue(entry.Key, out target))
        {
            mergedLexicon.Items.Add(entry.Key, entry.Value);
            addedWord++;
            continue;
        }

        // Stays true only when the sub item contributes nothing new.
        bool nothingNew = true;

        foreach (LexiconPronunciation incomingPron in entry.Value.Pronunciations)
        {
            // Locate the same pronunciation (ignoring case) in the merged item.
            LexiconPronunciation matchedPron = null;
            foreach (LexiconPronunciation targetPron in target.Pronunciations)
            {
                if (targetPron.Symbolic.Equals(incomingPron.Symbolic, StringComparison.OrdinalIgnoreCase))
                {
                    matchedPron = targetPron;
                    break;
                }
            }

            if (matchedPron == null)
            {
                // Unknown pronunciation: add it whole.
                target.Pronunciations.Add(incomingPron);
                nothingNew = false;
                addedPronMessage.Add(Helper.NeutralFormat(
                    "Pronunciation [{0}] has been added to word [{1}]",
                    incomingPron.Symbolic, target.Grapheme));
                continue;
            }

            // Known pronunciation: add only the properties it is missing.
            foreach (LexiconItemProperty incomingProperty in incomingPron.Properties)
            {
                bool known = false;
                foreach (LexiconItemProperty targetProperty in matchedPron.Properties)
                {
                    if (targetProperty.Equals(incomingProperty))
                    {
                        known = true;
                        break;
                    }
                }

                if (known)
                {
                    continue;
                }

                matchedPron.Properties.Add(incomingProperty);
                nothingNew = false;
                addedPropertyMessage.Add(Helper.NeutralFormat(
                    "Property has been added to word [{0}]'s pronunciation [{1}] : [{2}]",
                    target.Grapheme, incomingPron.Symbolic, incomingProperty.ToString()));
            }
        }

        if (nothingNew)
        {
            duplicateWordMessage.Add(Helper.NeutralFormat(
                "Word [{0}] has been dropped because of duplication.", entry.Key));
        }
    }

    // Summary log.
    Log("---------------------------------");
    Log("Totally:");
    Log("[{0}] words have been dropped because of duplication.", duplicateWordMessage.Count);
    Log("[{0}] pronunciations and [{1}] properties have been added.",
        addedPronMessage.Count, addedPropertyMessage.Count);
}
/// <summary>
/// Dispatches to the merge routine matching the requested merge mode.
/// An unrecognized mode leaves both lexicons untouched.
/// </summary>
/// <param name="mergedLexicon">Lexicon to be merged to.</param>
/// <param name="subLexicon">Lexicon to be merged.</param>
/// <param name="mergeMode">Conflict-resolution mode.</param>
private void MergeLexicon(Lexicon mergedLexicon, Lexicon subLexicon, MergeMode mergeMode)
{
    if (mergeMode == MergeMode.KeepAll)
    {
        MergeLexiconWithKeepAll(mergedLexicon, subLexicon);
    }
    else if (mergeMode == MergeMode.KeepLastOne)
    {
        MergeLexiconWithKeepLastOne(mergedLexicon, subLexicon);
    }
    else if (mergeMode == MergeMode.KeepFirstOne)
    {
        MergeLexiconWithKeepFirstOne(mergedLexicon, subLexicon);
    }
}
/// <summary>
/// Extracts a domain lexicon from the script files named in a domain list file,
/// merging the per-file lexicons with the given merge mode.
/// </summary>
/// <param name="scriptFolder">Folder containing the script files.</param>
/// <param name="domainListFile">File listing one script file name per line.</param>
/// <param name="inMainLex">Main lexicon passed to the per-script lexicon creation; may be null.</param>
/// <param name="defaultPartOfSpeech">Part of speech used for words without one.</param>
/// <param name="mergeMode">Merging mode applied between per-script lexicons.</param>
/// <param name="phoneSet">Phone set used for validation; validation is skipped when null.</param>
/// <param name="attribSchema">Lexical attribute schema used for validation; validation is skipped when null.</param>
/// <returns>
/// The merged lexicon; null when the default part of speech is rejected by the schema,
/// or when no script file produced a usable lexicon.
/// </returns>
/// <exception cref="InvalidDataException">
/// A script file's language differs from that of the lexicon built so far.
/// </exception>
private Lexicon ExtractDomainLexicon(string scriptFolder, string domainListFile, Lexicon inMainLex,
    string defaultPartOfSpeech, MergeMode mergeMode, TtsPhoneSet phoneSet, LexicalAttributeSchema attribSchema)
{
    if (attribSchema != null)
    {
        if (PosItem.Validate(defaultPartOfSpeech, null, attribSchema).Count > 0)
        {
            Log("Default Part of speech {0} is unrecognized according to attribute schema, extraction breaks",
                defaultPartOfSpeech);
            return null;
        }
    }

    Lexicon outLex = null;
    foreach (string domainName in Helper.FileLines(domainListFile))
    {
        string domainFilePath = Path.Combine(scriptFolder, domainName);
        XmlScriptFile scriptFile = new XmlScriptFile();
        scriptFile.Load(domainFilePath);

        if (outLex != null && outLex.Language != scriptFile.Language)
        {
            throw new InvalidDataException(Helper.NeutralFormat(
                "Found inconsistent language \"{0}\" against previous one \"{1}\" in the file of \"{2}\"",
                scriptFile.Language.ToString(), outLex.Language.ToString(), domainFilePath));
        }

        Lexicon lexicon = Lexicon.CreateFromXmlScriptFile(scriptFile, defaultPartOfSpeech, inMainLex);

        if (phoneSet != null && attribSchema != null)
        {
            lexicon.Validate(phoneSet, attribSchema);
            if (lexicon.ErrorSet.Count > 0)
            {
                Console.Error.WriteLine("The script file {0} contains {1} errors, skip!",
                    domainFilePath, lexicon.ErrorSet.Count);
                Log("The script file {0} contains {1} errors:", domainFilePath, lexicon.ErrorSet.Count);
                foreach (Error error in lexicon.ErrorSet.Errors)
                {
                    Log(error.ToString());
                }

                // Skip this domain lexicon
                continue;
            }
        }

        if (outLex == null)
        {
            outLex = lexicon;
        }
        else
        {
            MergeLexicon(outLex, lexicon, mergeMode);
        }
    }

    // outLex is still null when the list file is empty or every script was skipped;
    // the unguarded dereference used to throw a NullReferenceException in that case.
    if (outLex == null || outLex.Items.Count == 0)
    {
        Log("The final lexicon is empty.");
    }

    return outLex;
}
/// <summary>
/// Runs the operation selected by <c>InMode</c> (merge, domain-lexicon extraction,
/// sub-lexicon extraction or word-list extraction) and, for the lexicon-producing
/// modes, saves the output lexicon when an output path is set.
/// </summary>
protected override void Execute()
{
    switch (this.InMode)
    {
        case LexiconProcessorMode.Merge:
            Log("Merge Lexicon");

            // Start from a copy of the main lexicon's entries, then merge the additional one in.
            OutLexicon = new Lexicon();
            OutLexicon.Language = InMainLexicon.Language;
            OutLexicon.Encoding = InMainLexicon.Encoding;
            foreach (KeyValuePair<string, LexicalItem> mainEntry in InMainLexicon.Items)
            {
                OutLexicon.Items.Add(mainEntry.Key, mainEntry.Value);
            }

            if (InAdditionalLexicon != null)
            {
                MergeLexicon(OutLexicon, InAdditionalLexicon, InMergeMode);
            }

            break;

        case LexiconProcessorMode.ExtractDomainLexicon:
            Log("Extract Domain Lexicon");
            OutLexicon = ExtractDomainLexicon(InScriptFolder, InDomainListFile, InMainLexicon,
                InPos, InMergeMode, InPhoneSet, InAttribSchema);
            break;

        case LexiconProcessorMode.ExtractSubLexicon:
            Log("Extract Sub Lexicon");
            OutLexicon = ExtractSubLexicon(InCorpusType, InCorpusFile, InMainLexicon);
            break;

        case LexiconProcessorMode.ExtractWordListFromLexicon:
            Log("Extract Word List from Lexicon");
            OutWordList = InMainLexicon.ListWords();
            break;

        default:
            break;
    }

    // Nothing to save without an output path.
    if (string.IsNullOrEmpty(InSetOutLexiconPath))
    {
        return;
    }

    // Word-list extraction produces no lexicon; other modes may also yield none.
    if (this.InMode == LexiconProcessorMode.ExtractWordListFromLexicon || OutLexicon == null)
    {
        return;
    }

    OutLexicon.Save(GetOutPathUnderResultDirectory(InSetOutLexiconPath));
}
/// <summary>
/// Loads a LexicalItem (word entry and its pronunciations) from a word XmlNode.
/// </summary>
/// <param name="parentLexicon">Lexicon the item belongs to.</param>
/// <param name="wordNode">XmlNode of the word; expected to be an XmlElement.</param>
/// <param name="nsmgr">XmlNamespaceManager used for the "tts:p" query.</param>
/// <param name="contentController">Controls case sensitivity and history-checking mode.</param>
/// <param name="errorSet">Receives word-entry errors.</param>
/// <returns>
/// The loaded item; null when the grapheme is empty or has leading/trailing space,
/// or when the item is marked deleted and history-checking mode is off.
/// </returns>
internal static LexicalItem Load(Lexicon parentLexicon, XmlNode wordNode, XmlNamespaceManager nsmgr,
    Lexicon.ContentControler contentController, ErrorSet errorSet)
{
    Debug.Assert(parentLexicon != null && wordNode != null && nsmgr != null &&
        contentController != null && errorSet != null);

    XmlElement wordElement = wordNode as XmlElement;

    // Same guard as the other node loaders: the cast result is dereferenced right below.
    Debug.Assert(wordElement != null);

    LexicalItem lexiconItem = new LexicalItem(parentLexicon.Language);
    lexiconItem.Parent = parentLexicon;

    string grapheme = wordElement.GetAttribute("v");
    if (string.IsNullOrEmpty(grapheme))
    {
        errorSet.Add(LexiconError.InvalidWordEntry, new Error(WordEntryError.EmptyWord), grapheme);
        return null;
    }

    if (!grapheme.Trim().Equals(grapheme, StringComparison.OrdinalIgnoreCase))
    {
        errorSet.Add(LexiconError.InvalidWordEntry,
            new Error(WordEntryError.LeadingOrTrailingSpace), grapheme);
        return null;
    }

    // The static Regex.IsMatch avoids allocating a Regex object on every call.
    // This error is reported but the item is still loaded.
    // NOTE(review): as written the pattern also matches a single space, while the error
    // name refers to multiple spaces - confirm the intended pattern.
    if (Regex.IsMatch(grapheme.Trim(), "( )|\t"))
    {
        errorSet.Add(LexiconError.InvalidWordEntry,
            new Error(WordEntryError.ContainingTabOrMultipleSpaces), grapheme);
    }

    string statusValue = wordElement.GetAttribute("s");
    if (!string.IsNullOrEmpty(statusValue))
    {
        lexiconItem.Status = (Lexicon.LexiconStatus)Enum.Parse(
            typeof(Lexicon.LexiconStatus), statusValue, true);
    }

    // Before the lexicon object is shared with the lexicon reviewer tool,
    // items with "deleted" status are dropped.
    if (!contentController.IsHistoryCheckingMode && lexiconItem.Status == Lexicon.LexiconStatus.Deleted)
    {
        return null;
    }

    lexiconItem.Alias = wordElement.GetAttribute("alias");

    CultureInfo cultureInfo = new CultureInfo(Localor.LanguageToString(parentLexicon.Language), false);
    lexiconItem._text = grapheme;
    lexiconItem.Grapheme = contentController.IsCaseSensitive
        ? grapheme.Trim()
        : grapheme.Trim().ToLower(cultureInfo);
    lexiconItem.OldGrapheme = lexiconItem.Grapheme;

    // Check whether this word is reviewed
    string reviewedValue = wordElement.GetAttribute("r");
    if (!string.IsNullOrEmpty(reviewedValue))
    {
        lexiconItem.Reviewed = bool.Parse(reviewedValue);
    }

    // Get word's frequency. If there's no such information, set frequency to zero
    int frequency = 0;
    int.TryParse(wordElement.GetAttribute("f"), out frequency);
    lexiconItem.Frequency = frequency;

    // Load comment
    lexiconItem.Comment = wordElement.GetAttribute("c");

    if (lexiconItem.Status != Lexicon.LexiconStatus.Original)
    {
        // Get original word text.
        string originalWordText = wordElement.GetAttribute("vo");
        if (!string.IsNullOrEmpty(originalWordText))
        {
            lexiconItem.OldGrapheme = originalWordText;
        }
    }

    foreach (XmlNode pronNode in wordNode.SelectNodes("tts:p", nsmgr))
    {
        LexiconPronunciation lexPron = LexiconPronunciation.Load(
            lexiconItem, pronNode, nsmgr, contentController, errorSet);
        if (lexPron != null)
        {
            lexiconItem.Pronunciations.Add(lexPron);
        }
    }

    return lexiconItem;
}
/// <summary>
/// Extracts from the main lexicon the sub lexicon covering the words of a corpus,
/// logging each corpus word the main lexicon does not contain.
/// </summary>
/// <param name="corpusType">Corpus type; only "WORDLIST" (compared ignoring case) is supported.</param>
/// <param name="corpusFile">Corpus file.</param>
/// <param name="mainLexicon">Main lexicon.</param>
/// <returns>The extracted sub lexicon.</returns>
/// <exception cref="InvalidDataException">The corpus type is not supported.</exception>
private Lexicon ExtractSubLexicon(string corpusType, string corpusFile, Lexicon mainLexicon)
{
    if (string.Compare(corpusType, "WORDLIST", true) != 0)
    {
        throw new InvalidDataException("Unsupported corpus type");
    }

    List<string> corpusWords = ExtractWordsFromWordList(corpusFile);
    List<string> missingWords = new List<string>();
    Lexicon subLexicon = mainLexicon.ExtractSubLexicon(corpusWords, missingWords);

    foreach (string missingWord in missingWords)
    {
        // Entries containing a "[break=" marker are not reported.
        if (missingWord.IndexOf("[break=") != -1)
        {
            continue;
        }

        // Braces are doubled before logging - presumably because Log treats its
        // argument as a format string; confirm against Log's implementation.
        string escapedWord = missingWord.Replace("{", "{{").Replace("}", "}}");
        Log("[" + escapedWord + "] not in main lexicon!");
    }

    return subLexicon;
}
/// <summary>
/// Initializes a new instance of the <see cref="LexiconSearcher"/> class
/// that works on the given lexicon.
/// </summary>
/// <param name="lex">Host lexicon; stored as given, without validation.</param>
public LexiconSearcher(Lexicon lex)
{
    this._lex = lex;
}
/// <summary>
/// Checks that a lexicon is a domain lexicon: it must carry a domain tag,
/// and every item must contain only that one domain.
/// </summary>
/// <param name="domainLex">Domain lexicon to check.</param>
/// <returns>True when the lexicon is a valid domain lexicon; otherwise false.</returns>
private static bool ValidateDomainLexicon(Lexicon domainLex)
{
    Helper.ThrowIfNull(domainLex);

    // No domain tag at all: not a domain lexicon.
    if (string.IsNullOrEmpty(domainLex.DomainTag))
    {
        return false;
    }

    // The first item reaching outside the lexicon's own domain settles it.
    foreach (LexicalItem item in domainLex.Items.Values)
    {
        if (!item.OnlyContainsOneDomain(domainLex.DomainTag))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Loads a DomainItem from a domain XmlNode.
/// </summary>
/// <param name="parentProperty">Property the domain belongs to; used for error reporting.</param>
/// <param name="domainNode">XmlNode of the domain; expected to be an XmlElement.</param>
/// <param name="nsmgr">XmlNamespaceManager.</param>
/// <param name="contentController">Supplies the history-checking mode flag.</param>
/// <param name="errorSet">Receives an error when the domain value is empty.</param>
/// <returns>
/// The loaded item; null when it is marked deleted and history-checking mode is off,
/// or when its value is empty.
/// </returns>
internal static DomainItem Load(LexiconItemProperty parentProperty, XmlNode domainNode,
    XmlNamespaceManager nsmgr, Lexicon.ContentControler contentController, ErrorSet errorSet)
{
    Debug.Assert(parentProperty != null && parentProperty.Parent != null &&
        parentProperty.Parent.Parent != null &&
        domainNode != null && contentController != null && nsmgr != null);

    DomainItem loaded = new DomainItem();
    XmlElement domainElem = domainNode as XmlElement;
    Debug.Assert(domainElem != null);

    string statusText = domainElem.GetAttribute("s");
    if (!string.IsNullOrEmpty(statusText))
    {
        loaded.Status = (Lexicon.LexiconStatus)Enum.Parse(
            typeof(Lexicon.LexiconStatus), statusText, true);

        // The lexicon object is shared with the lexicon reviewer tool; deleted items
        // are dropped unless the lexicon is loaded in history-checking mode.
        if (loaded.Status == Lexicon.LexiconStatus.Deleted && !contentController.IsHistoryCheckingMode)
        {
            return null;
        }
    }

    // Whether the pronunciation is the preferred one in this domain.
    string preferredText = domainElem.GetAttribute("p");
    if (!string.IsNullOrEmpty(preferredText))
    {
        loaded.IsFirstPronunciation = bool.Parse(preferredText);
    }

    string currentValue = domainElem.GetAttribute("v");
    string originalValue = domainElem.GetAttribute("vo");

    if (string.IsNullOrEmpty(currentValue))
    {
        Error emptyDomain = new Error(DomainError.EmptyDomain);
        errorSet.Add(LexiconError.DomainError, emptyDomain,
            parentProperty.Parent.Parent.Text, parentProperty.Parent.Symbolic);
        return null;
    }

    loaded.Value = currentValue.ToLower();

    // The recorded original value only applies to items that are not in Original status.
    bool useOriginal = !string.IsNullOrEmpty(originalValue) &&
        loaded.Status != Lexicon.LexiconStatus.Original;
    loaded.OldValue = useOriginal ? originalValue.ToLower() : currentValue;

    return loaded;
}
/// <summary>
/// Puts a lexical item into the lexicon of the given domain, creating that
/// domain's lexicon (with this lexicon's language and encoding) on first use.
/// </summary>
/// <param name="domainLexicons">Domain lexicons keyed by domain tag.</param>
/// <param name="domainTag">Domain tag.</param>
/// <param name="lexItem">LexicalItem to add.</param>
/// <exception cref="InvalidDataException">
/// The domain lexicon already contains the item's grapheme.
/// </exception>
private void FillDomainLexicalItem(Dictionary<string, Lexicon> domainLexicons,
    string domainTag, LexicalItem lexItem)
{
    Helper.ThrowIfNull(domainLexicons);
    Helper.ThrowIfNull(domainTag);
    Helper.ThrowIfNull(lexItem);

    Lexicon domainLexicon;
    if (!domainLexicons.TryGetValue(domainTag, out domainLexicon))
    {
        // First item of this domain: create its lexicon.
        domainLexicon = new Lexicon(Language);
        domainLexicon.Encoding = Encoding;
        domainLexicon.DomainTag = domainTag;
        domainLexicon.Items.Add(lexItem.Grapheme, lexItem);
        domainLexicons.Add(domainTag, domainLexicon);
        return;
    }

    if (domainLexicon.Items.ContainsKey(lexItem.Grapheme))
    {
        throw new InvalidDataException(
            string.Format("Duplicate lexicon word \"{0}\" in \"{1}\" domain.",
                lexItem.Grapheme, domainTag));
    }

    domainLexicon.Items.Add(lexItem.Grapheme, lexItem);
}
/// <summary>
/// Loads an AttributeItem from an attribute XmlNode.
/// </summary>
/// <param name="parentProperty">Property the attribute belongs to; used for error reporting.</param>
/// <param name="attributeNode">XmlNode of the attribute; expected to be an XmlElement.</param>
/// <param name="nsmgr">XmlNamespaceManager.</param>
/// <param name="contentController">Supplies the history-checking mode flag.</param>
/// <param name="errorSet">Receives an error when the category or the value is empty.</param>
/// <returns>
/// The loaded item; null when it is marked deleted and history-checking mode is off,
/// or when its category or value is empty.
/// </returns>
internal static AttributeItem Load(LexiconItemProperty parentProperty, XmlNode attributeNode,
    XmlNamespaceManager nsmgr, Lexicon.ContentControler contentController, ErrorSet errorSet)
{
    Debug.Assert(parentProperty != null && parentProperty.Parent != null &&
        parentProperty.Parent.Parent != null &&
        attributeNode != null && contentController != null && nsmgr != null);

    AttributeItem loaded = new AttributeItem();
    XmlElement attributeElem = attributeNode as XmlElement;
    Debug.Assert(attributeElem != null);

    string statusText = attributeElem.GetAttribute("s");
    if (!string.IsNullOrEmpty(statusText))
    {
        loaded.Status = (Lexicon.LexiconStatus)Enum.Parse(
            typeof(Lexicon.LexiconStatus), statusText, true);

        // The lexicon object is shared with the lexicon reviewer tool; deleted items
        // are dropped unless the lexicon is loaded in history-checking mode.
        if (loaded.Status == Lexicon.LexiconStatus.Deleted && !contentController.IsHistoryCheckingMode)
        {
            return null;
        }
    }

    string category = attributeElem.GetAttribute("category");
    string value = attributeElem.GetAttribute("value");
    string originalValue = attributeElem.GetAttribute("vo");

    // The category is checked first, so an entry missing both reports only EmptyCategory.
    if (string.IsNullOrEmpty(category))
    {
        Error emptyCategory = new Error(LexicalAttributeError.EmptyCategory);
        errorSet.Add(LexiconError.AttributeError, emptyCategory,
            parentProperty.Parent.Parent.Text, parentProperty.Parent.Symbolic);
        return null;
    }

    if (string.IsNullOrEmpty(value))
    {
        Error emptyValue = new Error(LexicalAttributeError.EmptyValue);
        errorSet.Add(LexiconError.AttributeError, emptyValue,
            parentProperty.Parent.Parent.Text, parentProperty.Parent.Symbolic);
        return null;
    }

    loaded.Value = value;
    loaded.CategoryName = category;

    // The recorded original value only applies to items that are not in Original status.
    bool useOriginal = !string.IsNullOrEmpty(originalValue) &&
        loaded.Status != Lexicon.LexiconStatus.Original;
    loaded.OldValue = useOriginal ? originalValue : value;

    return loaded;
}
/// <summary>
/// Reads a whole XML lexicon file into a new Lexicon.
/// </summary>
/// <param name="lexiconFilePath">Path of the XML lexicon file.</param>
/// <returns>The loaded Lexicon.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="lexiconFilePath"/> is null or empty.
/// </exception>
public static Lexicon ReadAllData(string lexiconFilePath)
{
    bool hasPath = !string.IsNullOrEmpty(lexiconFilePath);
    if (!hasPath)
    {
        throw new ArgumentNullException("lexiconFilePath");
    }

    Lexicon loaded = new Lexicon();
    loaded.Load(lexiconFilePath);
    return loaded;
}
/// <summary>
/// Loads a LexiconItemProperty (part of speech, gender, case, number, domains
/// and attributes) from a property XmlNode.
/// </summary>
/// <param name="parentLexPron">Pronunciation the property belongs to.</param>
/// <param name="propertyNode">XmlNode of the property; expected to be an XmlElement.</param>
/// <param name="nsmgr">XmlNamespaceManager used for the "tts" queries.</param>
/// <param name="contentController">Supplies the history-checking mode flag.</param>
/// <param name="errorSet">Receives domain errors.</param>
/// <returns>
/// The loaded property; null when it is marked deleted and history-checking mode is off.
/// </returns>
internal static LexiconItemProperty Load(LexiconPronunciation parentLexPron, XmlNode propertyNode,
    XmlNamespaceManager nsmgr, Lexicon.ContentControler contentController, ErrorSet errorSet)
{
    Debug.Assert(parentLexPron != null && parentLexPron.Parent != null && propertyNode != null &&
        nsmgr != null && contentController != null && errorSet != null);

    LexiconItemProperty property = new LexiconItemProperty();
    property.Parent = parentLexPron;

    XmlElement propertyElem = propertyNode as XmlElement;

    // Same guard as the domain/attribute loaders: the cast result is dereferenced right below.
    Debug.Assert(propertyElem != null);

    string stateValue = propertyElem.GetAttribute("s");
    if (!string.IsNullOrEmpty(stateValue))
    {
        property.Status = (Lexicon.LexiconStatus)Enum.Parse(
            typeof(Lexicon.LexiconStatus), stateValue, true);
    }

    // Deleted properties are dropped unless loading in history-checking mode.
    if (!contentController.IsHistoryCheckingMode && property.Status == Lexicon.LexiconStatus.Deleted)
    {
        return null;
    }

    PosItem posItem = PosItem.Load(propertyNode, nsmgr);
    if (posItem != null)
    {
        property.PartOfSpeech = posItem;
    }

    GenderItem genderItem = GenderItem.Load(propertyNode, nsmgr);
    if (genderItem != null)
    {
        property.Gender = genderItem;
    }

    CaseItem caseItem = CaseItem.Load(propertyNode, nsmgr);
    if (caseItem != null)
    {
        property.Case = caseItem;
    }

    NumberItem numberItem = NumberItem.Load(propertyNode, nsmgr);
    if (numberItem != null)
    {
        property.Number = numberItem;
    }

    foreach (XmlNode domainNode in propertyNode.SelectNodes("tts:domain", nsmgr))
    {
        DomainItem domainItem = DomainItem.Load(property, domainNode, nsmgr, contentController, errorSet);
        if (domainItem == null)
        {
            continue;
        }

        if (!property.Domains.ContainsKey(domainItem.Value))
        {
            property.Domains.Add(domainItem.Value, domainItem);
        }
        else
        {
            // A repeated domain is reported; the first occurrence is kept.
            Error error = new Error(DomainError.DuplicateDomain, domainItem.Value);
            errorSet.Add(LexiconError.DomainError, error, parentLexPron.Parent.Text, parentLexPron.Symbolic);
        }
    }

    string lexLevelDomain = (parentLexPron.Parent.Parent as Lexicon).DomainTag;
    if (property.Domains.Count == 0)
    {
        // No explicit domain: take the lexicon-level domain tag, or a default DomainItem without one.
        if (string.IsNullOrEmpty(lexLevelDomain))
        {
            property.ChangeDomain(new DomainItem());
        }
        else
        {
            property.ChangeDomain(new DomainItem(lexLevelDomain));
        }
    }
    else if (!string.IsNullOrEmpty(lexLevelDomain))
    {
        // Explicit domain tags inside a lexicon that carries a lexicon-level domain tag are reported.
        Error error = new Error(DomainError.InvalidDomainTags);
        errorSet.Add(LexiconError.DomainError, error, parentLexPron.Parent.Text, parentLexPron.Symbolic);
    }

    foreach (XmlNode attributeNode in propertyNode.SelectNodes("tts:attr", nsmgr))
    {
        AttributeItem attributeItem = AttributeItem.Load(
            property, attributeNode, nsmgr, contentController, errorSet);
        if (attributeItem != null)
        {
            property.AddAttribute(attributeItem);
        }
    }

    return property;
}
/// <summary>
/// Extract a sub lexicon from a word list and report the words which are not in the main lexicon.
/// </summary>
/// <param name="words">Word list to extract; null or empty entries and repeated words are skipped.</param>
/// <param name="missedLexWords">
/// Optional. When not null it is cleared and then filled with the words that were not found in the main lexicon.
/// </param>
/// <returns>New sub lexicon that shares language, encoding, phone set and POS set with this lexicon.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="words"/> is null.</exception>
public Lexicon ExtractSubLexicon(List<string> words, List<string> missedLexWords)
{
    if (words == null)
    {
        throw new ArgumentNullException("words");
    }

    Lexicon newLex = new Lexicon();
    newLex.Language = Language;
    newLex.Encoding = Encoding;
    newLex.PhoneSet = PhoneSet;
    newLex.PosSet = PosSet;

    // Used as a set so that a missed word is only reported once.
    Dictionary<string, object> missedWords = null;
    if (missedLexWords != null)
    {
        missedLexWords.Clear();
        missedWords = new Dictionary<string, object>(StringComparer.InvariantCulture);
    }

    foreach (string word in words)
    {
        if (string.IsNullOrEmpty(word) || newLex.Items.ContainsKey(word))
        {
            continue;
        }

        // First do case sensitive lookup; if not found, do case insensitive lookup.
        // Both lookups must target this (main) lexicon: the sub lexicon was just
        // checked above and is known not to contain the word yet.
        LexicalItem wordItem = Lookup(word);
        if (wordItem == null)
        {
            wordItem = Lookup(word, true);
        }

        if (wordItem != null)
        {
            newLex.Items.Add(word, wordItem);
        }
        else if (missedWords != null && !missedWords.ContainsKey(word))
        {
            missedWords.Add(word, null);
        }
    }

    if (missedLexWords != null)
    {
        missedLexWords.AddRange(missedWords.Keys);
    }

    return newLex;
}
/// <summary>
/// Load LexiconPronunciation from XmlNode.
/// </summary>
/// <param name="parentLexItem">Lexical item that owns the pronunciation.</param>
/// <param name="pronNode">XML element that describes the pronunciation.</param>
/// <param name="nsmgr">Namespace manager used to resolve the "tts" prefix.</param>
/// <param name="contentController">Content controller; its history checking mode decides whether deleted pronunciations are kept.</param>
/// <param name="errorSet">Receives the errors found while loading.</param>
/// <returns>
/// The loaded LexiconPronunciation, or null when the pronunciation is marked as deleted
/// and the controller is not in history checking mode.
/// </returns>
internal static LexiconPronunciation Load(LexicalItem parentLexItem, XmlNode pronNode,
    XmlNamespaceManager nsmgr, Lexicon.ContentControler contentController, ErrorSet errorSet)
{
    Debug.Assert(parentLexItem != null && pronNode != null &&
        nsmgr != null && contentController != null && errorSet != null);

    LexiconPronunciation lexPron = new LexiconPronunciation(parentLexItem.Language);
    lexPron.Parent = parentLexItem;

    XmlElement pronElem = pronNode as XmlElement;
    Debug.Assert(pronElem != null);

    string pronStatusValue = pronElem.GetAttribute("s");
    if (!string.IsNullOrEmpty(pronStatusValue))
    {
        lexPron.Status = (Lexicon.LexiconStatus)Enum.Parse(typeof(Lexicon.LexiconStatus),
            pronStatusValue, true);
    }

    // Lexicon object is shared with lexicon reviewer tool,
    // We drop those items if they have "deleted" status when it is not loaded by lexicon reviewer tool
    if (!contentController.IsHistoryCheckingMode && lexPron.Status == Lexicon.LexiconStatus.Deleted)
    {
        return null;
    }

    // Normalize the pronunciation: trim, collapse runs of whitespace to a single space, lower-case.
    // The static Regex.Replace reuses the cached pattern instead of building a Regex per call.
    lexPron.Symbolic = pronElem.GetAttribute("v").Trim();
    lexPron.Symbolic = Regex.Replace(lexPron.Symbolic, @"\s{2,}", " ").ToLowerInvariant();
    lexPron.OldSymbolic = lexPron.Symbolic;

    // Get pronunciation original position.
    string originalPronPosition = pronElem.GetAttribute("o");
    if (!string.IsNullOrEmpty(originalPronPosition))
    {
        lexPron.OldPosition = int.Parse(originalPronPosition, CultureInfo.InvariantCulture);
    }

    // A non-original pronunciation may carry its previous text in the "vo" attribute.
    if (lexPron.Status != Lexicon.LexiconStatus.Original)
    {
        string originalPronText = pronElem.GetAttribute("vo");
        if (!string.IsNullOrEmpty(originalPronText))
        {
            lexPron.OldSymbolic = originalPronText;
        }
    }

    // Get word's frequency. If there's no such information, set frequency to zero.
    // The file is machine readable, so parse with the invariant culture like the
    // position attribute above rather than with the culture of the current thread.
    int frequency;
    if (!int.TryParse(pronElem.GetAttribute("f"), NumberStyles.Integer,
        CultureInfo.InvariantCulture, out frequency))
    {
        frequency = 0;
    }

    lexPron.Frequency = frequency;

    foreach (XmlNode propertyNode in pronNode.SelectNodes("tts:pr", nsmgr))
    {
        LexiconItemProperty property = LexiconItemProperty.Load(lexPron, propertyNode, nsmgr,
            contentController, errorSet);
        if (property == null)
        {
            continue;
        }

        // Duplicated properties are only tolerated in history checking mode.
        if (contentController.IsHistoryCheckingMode || !lexPron.Properties.Contains(property))
        {
            lexPron.Properties.Add(property);
        }
        else
        {
            errorSet.Add(LexiconError.DuplicateProperty, parentLexItem.Text, lexPron.Symbolic);
        }
    }

    return lexPron;
}
/// <summary>
/// Split this lexicon into one lexicon per domain tag.
/// </summary>
/// <param name="sp">Service provider passed to the expanded word check.</param>
/// <param name="errorSet">Error set passed to the general pronunciation check.</param>
/// <returns>The domain lexicons; a lexicon for the general domain is always created.</returns>
public Lexicon[] SplitIntoDomainLexicons(SP.ServiceProvider sp, ErrorSet errorSet)
{
    // Keyed by domain tag.
    Dictionary<string, Lexicon> lexiconsByDomain = new Dictionary<string, Lexicon>();

    Lexicon general = new Lexicon(this.Language);
    general.Encoding = Encoding;
    general.DomainTag = DomainItem.GeneralDomain;
    lexiconsByDomain.Add(general.DomainTag, general);

    bool isChinese = Language == Language.ZhCN ||
        Language == Language.ZhTW ||
        Language == Language.ZhHK;

    // Chinese/Japanese/Korean have no expanded words, so the expanded word check is skipped for them.
    bool skipExpandedCheck = isChinese ||
        Language == Language.JaJP ||
        Language == Language.KoKR;

    foreach (LexicalItem lexItem in this.Items.Values)
    {
        Dictionary<string, LexicalItem> perDomainItems = lexItem.SplitToDomainLexicalItems();
        bool pronsIdentical = ArePronsSameForAllDomains(perDomainItems);

        bool expanded = false;
        if (!skipExpandedCheck)
        {
            foreach (LexiconPronunciation pron in lexItem.Pronunciations)
            {
                if (LexicalItem.IsExpandedWord(lexItem.Grapheme, lexItem.Language, pron, sp))
                {
                    expanded = true;
                    break;
                }
            }
        }

        // An English word inside a zh-XX lexicon is always handled like an expanded word.
        if (isChinese && Helper.IsEnglishWord(lexItem.Grapheme))
        {
            expanded = true;
        }

        if (!expanded)
        {
            if (pronsIdentical || perDomainItems.Count == 1)
            {
                // One pronunciation set serves every domain: keep a tag-free copy in the general lexicon only.
                LexicalItem generalItem = lexItem.Clone();
                generalItem.CleanAllDomainTags();
                FillDomainLexicalItem(lexiconsByDomain, DomainItem.GeneralDomain, generalItem);
                continue;
            }

            CheckGeneralPronExist(perDomainItems, errorSet, lexItem.Grapheme);
        }

        // Otherwise every domain gets its own item.
        foreach (KeyValuePair<string, LexicalItem> entry in perDomainItems)
        {
            FillDomainLexicalItem(lexiconsByDomain, entry.Key, entry.Value);
        }
    }

    return lexiconsByDomain.Values.ToArray();
}
/// <summary>
/// Apply a status to the domains of every property of every pronunciation.
/// When "forceApply" is true each domain receives the status;
/// when it is false only domains whose status is still Original are changed.
/// </summary>
/// <param name="status">Status to apply.</param>
/// <param name="forceApply">Whether to overwrite the status regardless of its current value.</param>
public void SetStatusOnAllDomains(Lexicon.LexiconStatus status, bool forceApply)
{
    foreach (LexiconPronunciation pronunciation in _pronunciations)
    {
        foreach (LexiconItemProperty itemProperty in pronunciation.Properties)
        {
            foreach (DomainItem domainItem in itemProperty.Domains.Values)
            {
                bool shouldApply = forceApply || domainItem.Status == Lexicon.LexiconStatus.Original;
                if (shouldApply)
                {
                    domainItem.Status = status;
                }
            }
        }
    }
}
/// <summary>
/// Check the contextual expansion of every char table symbol against the lexicon and
/// generate an isolated symbol lexicon for the symbols whose pronunciation is not in the lexicon yet.
/// </summary>
/// <param name="chartable">Char table whose symbols are checked.</param>
/// <param name="posSymbol">Part of speech assigned to every generated symbol entry.</param>
/// <param name="lexiconOutput">Location the generated lexicon is written to.</param>
/// <param name="errors">Receives one message per symbol that was reported.</param>
/// <exception cref="ArgumentNullException">Thrown when an argument is null, or when lexiconOutput is empty.</exception>
public void CheckContextualSymbolInLexicon(CharTable chartable, string posSymbol,
    string lexiconOutput, Collection<string> errors)
{
    if (chartable == null)
    {
        throw new ArgumentNullException("chartable");
    }

    if (errors == null)
    {
        throw new ArgumentNullException("errors");
    }

    if (posSymbol == null)
    {
        throw new ArgumentNullException("posSymbol");
    }

    // Validate the output location up front so a bad argument does not
    // surface only after the whole char table has been processed.
    if (string.IsNullOrEmpty(lexiconOutput))
    {
        throw new ArgumentNullException("lexiconOutput");
    }

    Lexicon lexicon = new Lexicon(chartable.Language);
    Collection<string> polyWord = new Collection<string>();

    foreach (CharElement charElement in chartable.CharList)
    {
        // Symbols without a contextual expansion are not checked.
        string expansion = charElement.ContextualExpansion;
        if (string.IsNullOrEmpty(expansion))
        {
            continue;
        }

        string symbol = charElement.Symbol.ToString();
        LexicalItem symbolItem = _lexicon.Lookup(symbol, true);

        string pron = string.Empty;
        Collection<string> errorStrings = new Collection<string>();
        bool hasError = _lexicon.GetPronunciationForWords(expansion, errorStrings, polyWord, ref pron);
        if (hasError || string.IsNullOrEmpty(pron))
        {
            errors.Add(AttributeError.SymbolPronGenError + symbol);
            continue;
        }

        // The symbol only needs a new entry when none of its existing
        // pronunciations has exactly the generated phone sequence.
        bool addWord = true;
        if (symbolItem != null)
        {
            string[] prons = Pronunciation.SplitIntoPhones(pron);
            foreach (LexiconPronunciation existPron in symbolItem.Pronunciations)
            {
                if (ArePhonesIdentical(Pronunciation.SplitIntoPhones(existPron.Symbolic), prons))
                {
                    addWord = false;
                    break;
                }
            }
        }

        if (!addWord)
        {
            errors.Add(AttributeError.InfoSymbolInLex + symbol);
            continue;
        }

        // add the word if the symbol or pronunciation is not in lexicon
        LexiconPronunciation lexiconPron = new LexiconPronunciation(lexicon.Language);
        lexiconPron.Symbolic = pron;

        LexiconItemProperty lip = new LexiconItemProperty();
        lip.PartOfSpeech = new PosItem(posSymbol);
        lexiconPron.Properties.Add(lip);

        LexicalItem lexiconItem = new LexicalItem(lexicon.Language);
        lexiconItem.Grapheme = symbol;
        lexiconItem.Pronunciations.Add(lexiconPron);
        lexicon.Items.Add(lexiconItem.Grapheme, lexiconItem);

        // The symbol is in the lexicon, but with a different pronunciation.
        if (symbolItem != null)
        {
            errors.Add(AttributeError.SymbolDiffPronFromLex + symbol);
        }
    }

    Lexicon.WriteAllData(lexiconOutput, lexicon, Encoding.Unicode);
}

/// <summary>
/// Tell whether two phone sequences have the same length and the same phone at every position.
/// </summary>
/// <param name="left">First phone sequence.</param>
/// <param name="right">Second phone sequence.</param>
/// <returns>True when both sequences are identical.</returns>
private static bool ArePhonesIdentical(string[] left, string[] right)
{
    if (left.Length != right.Length)
    {
        return false;
    }

    for (int i = 0; i < right.Length; i++)
    {
        if (left[i] != right[i])
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Compile the lexicon raw data into its binary form and write it to the output stream.
/// </summary>
/// <param name="outputStream">Stream that receives the compiled lexicon.</param>
/// <returns>The errors collected from the dependencies, the lexicon and the compilation.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="outputStream"/> is null.</exception>
private ErrorSet CompileLexicon(Stream outputStream)
{
    if (outputStream == null)
    {
        throw new ArgumentNullException("outputStream");
    }

    ErrorSet errorSet = new ErrorSet();

    // Load the dependencies first; their errors are merged as dependency errors.
    ErrorSet subErrorSet = new ErrorSet();
    LexicalAttributeSchema schema = (LexicalAttributeSchema)GetObject(
        RawDataName.LexicalAttributeSchema, subErrorSet);
    MergeDependencyError(errorSet, subErrorSet, _schemaFullName);

    subErrorSet.Clear();
    TtsPhoneSet phoneSet = (TtsPhoneSet)GetObject(RawDataName.PhoneSet, subErrorSet);
    MergeDependencyError(errorSet, subErrorSet, RawDataName.PhoneSet);

    if (errorSet.Contains(ErrorSeverity.MustFix))
    {
        return errorSet;
    }

    Microsoft.Tts.Offline.Core.Lexicon lexicon =
        (Microsoft.Tts.Offline.Core.Lexicon)GetObject(RawDataName.Lexicon, errorSet);

    // Report a missing lexicon before it is dereferenced; checking after the
    // first use would turn the problem into a NullReferenceException.
    if (lexicon == null)
    {
        errorSet.Add(DataCompilerError.RawDataError, "Lexicon");
        return errorSet;
    }

    errorSet.Merge(lexicon.ErrorSet);

    // Change to case insensitive lexicon: save the lexicon to memory and load it again.
    // NOTE(review): the lexicon is saved with IsCaseSensitive = true and reloaded without a
    // content controller; presumably the reload is what makes it case insensitive - confirm.
    MemoryStream lexiconStream = new MemoryStream();
    using (XmlWriter xmlWriter = XmlWriter.Create(lexiconStream))
    {
        Microsoft.Tts.Offline.Core.Lexicon.ContentControler lexiconControler =
            new Microsoft.Tts.Offline.Core.Lexicon.ContentControler();
        lexiconControler.IsCaseSensitive = true;
        lexicon.Save(xmlWriter, lexiconControler);
    }

    lexiconStream.Seek(0, SeekOrigin.Begin);
    Microsoft.Tts.Offline.Core.Lexicon caseInsensitiveLexicon =
        new Microsoft.Tts.Offline.Core.Lexicon();
    using (StreamReader sr = new StreamReader(lexiconStream))
    {
        caseInsensitiveLexicon.Load(sr);
    }

    if (errorSet.Contains(ErrorSeverity.MustFix))
    {
        errorSet.Merge(caseInsensitiveLexicon.ErrorSet);
        return errorSet;
    }

    caseInsensitiveLexicon.LexicalAttributeSchema = schema;
    caseInsensitiveLexicon.PhoneSet = phoneSet;
    caseInsensitiveLexicon.Validate();

    // Set severity of errors only in case-insensitive lexicon to NoError for they're not treated as real error
    caseInsensitiveLexicon.ErrorSet.SetSeverity(ErrorSeverity.NoError);

    string vendorLexiconPath = Helper.GetTempFileName();
    try
    {
        caseInsensitiveLexicon.SaveToVendorLexicon(vendorLexiconPath);

        string toolFileName = ToolName.BldVendor2;
        string binaryLexiconPath = Helper.GetTempFileName();
        string compilingArguments = Helper.NeutralFormat("-v {0} V2 \"{1}\" \"{2}\" \"{3}\" TTS",
            (int)_language, _dataHandlerList.Datas[RawDataName.LexicalAttributeSchema].Path,
            vendorLexiconPath, binaryLexiconPath);
        string toolPath = Path.Combine(ToolDir, toolFileName);

        CheckToolExists(toolPath, errorSet);
        if (!errorSet.Contains(ErrorSeverity.MustFix))
        {
            // NOTE(review): binaryLexiconPath is not deleted here; presumably
            // HandleCommandLine takes care of it - confirm.
            HandleCommandLine(ModuleDataName.Lexicon, toolPath, compilingArguments,
                binaryLexiconPath, outputStream, errorSet);
        }
    }
    finally
    {
        // Remove the intermediate vendor lexicon even when the compilation throws.
        File.Delete(vendorLexiconPath);
    }

    errorSet.Merge(caseInsensitiveLexicon.ErrorSet);
    return errorSet;
}