/// <summary>
/// Extract a sub lexicon for a word list and report the words for which no
/// lexical item could be found.
/// </summary>
/// <param name="words">Words list to extract; null or empty entries are skipped.</param>
/// <param name="missedLexWords">
/// Optional output list. When not null, it is cleared and then filled with the
/// distinct words (compared with <see cref="StringComparer.InvariantCulture"/>)
/// that were not found.
/// </param>
/// <returns>
/// New sub lexicon that copies Language, Encoding, PhoneSet and PosSet from this lexicon.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="words"/> is null.</exception>
public Lexicon ExtractSubLexicon(List<string> words, List<string> missedLexWords)
{
    // Fail fast with a descriptive exception (same convention as CreateFromXmlScriptFile)
    // instead of a NullReferenceException raised after missedLexWords was already cleared.
    if (words == null)
    {
        throw new ArgumentNullException("words");
    }

    Lexicon newLex = new Lexicon();
    newLex.Language = Language;
    newLex.Encoding = Encoding;
    newLex.PhoneSet = PhoneSet;
    newLex.PosSet = PosSet;

    // Dictionary used as a set so that every missed word is reported only once.
    Dictionary<string, object> missedWords = null;
    if (missedLexWords != null)
    {
        missedLexWords.Clear();
        missedWords = new Dictionary<string, object>(StringComparer.InvariantCulture);
    }

    foreach (string word in words)
    {
        // Skip unusable entries and words that were already extracted.
        if (string.IsNullOrEmpty(word) || newLex.Items.ContainsKey(word))
        {
            continue;
        }

        // First do case sensitive lookup; if not found, do case insensitive lookup.
        // NOTE(review): the first lookup targets the new lexicon, whose Items were just
        // checked for this exact key above - confirm whether the main lexicon was intended.
        LexicalItem wordItem = newLex.Lookup(word);
        if (wordItem == null)
        {
            wordItem = Lookup(word, true);
        }

        if (wordItem != null)
        {
            newLex.Items.Add(word, wordItem);
        }
        else if (missedWords != null && !missedWords.ContainsKey(word))
        {
            missedWords.Add(word, null);
        }
    }

    if (missedLexWords != null)
    {
        missedLexWords.AddRange(missedWords.Keys);
    }

    return newLex;
}
/// <summary>
/// Merge the sub lexicon into the merged lexicon. For a word that is found in
/// both, the item already in the merged lexicon is kept untouched.
/// </summary>
/// <param name="mergedLexicon">Main lexicon, which also receives the merge result.</param>
/// <param name="subLexicon">Sub lexicon whose unknown words are added.</param>
private void MergeLexiconWithKeepFirstOne(Lexicon mergedLexicon, Lexicon subLexicon)
{
    int addedCount = 0;
    Collection<string> skippedMessages = new Collection<string>();

    foreach (KeyValuePair<string, LexicalItem> entry in subLexicon.Items)
    {
        LexicalItem existing = mergedLexicon.Lookup(entry.Key, true);
        if (existing != null)
        {
            // The word is already present: keep the existing item and only record
            // the sub lexicon pronunciations that the existing item does not contain.
            foreach (LexiconPronunciation candidate in entry.Value.Pronunciations)
            {
                if (existing.ContainsPronunciation(candidate.Symbolic))
                {
                    continue;
                }

                skippedMessages.Add(Helper.NeutralFormat(
                    "Pronunciation for word [{0}] has been skipped: [{1}]",
                    entry.Key,
                    candidate.Symbolic));
            }
        }
        else
        {
            // The word is not found in the merged lexicon, so take it over as is.
            mergedLexicon.Items.Add(entry.Key, entry.Value);
            addedCount++;
        }
    }

    // Summary: only the totals are logged, not the individual messages.
    Log("---------------------------------");
    Log("Totally:");
    Log("[{0}] pronunciations have been skipped.", skippedMessages.Count);
    Log(Helper.NeutralFormat("[{0}] new words have been added by the latter lexicon", addedCount));
}
/// <summary>
/// Build a lexicon from the pronounced words of an Xml script file.
/// </summary>
/// <param name="scriptFile">Xml script file that supplies the language and the words.</param>
/// <param name="defaultPos">Part of speech string used for words whose own POS string is null or empty.</param>
/// <param name="mainLexicon">
/// Optional main lexicon. When it contains the word and a pronunciation found for the
/// script pronunciation, the Symbolic value of the main lexicon pronunciation is used.
/// </param>
/// <returns>The lexicon created from the script file.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="scriptFile"/> is null or <paramref name="defaultPos"/> is null or empty.
/// </exception>
public static Lexicon CreateFromXmlScriptFile(XmlScriptFile scriptFile, string defaultPos, Lexicon mainLexicon)
{
    if (scriptFile == null)
    {
        throw new ArgumentNullException("scriptFile");
    }

    if (string.IsNullOrEmpty(defaultPos))
    {
        throw new ArgumentNullException("defaultPos");
    }

    Lexicon result = new Lexicon(scriptFile.Language);

    foreach (ScriptItem scriptItem in scriptFile.Items)
    {
        foreach (ScriptWord scriptWord in scriptItem.AllPronouncedWords)
        {
            string grapheme = scriptWord.Grapheme;

            // Pronunciation node for this script word.
            LexiconPronunciation newPron = new LexiconPronunciation(result.Language);
            newPron.Symbolic = scriptWord.Pronunciation;

            // Prefer the Symbolic value stored in the main lexicon when one is found there.
            LexicalItem mainItem = mainLexicon == null ? null : mainLexicon.Lookup(grapheme, true);
            if (mainItem != null)
            {
                LexiconPronunciation mainPron = mainItem.FindPronunciation(newPron.Symbolic, true);
                if (mainPron != null)
                {
                    newPron.Symbolic = mainPron.Symbolic;
                }
            }

            // Property carrying the word's own POS, or the default one when it has none.
            LexiconItemProperty newProperty = new LexiconItemProperty();
            newProperty.PartOfSpeech = new PosItem(
                string.IsNullOrEmpty(scriptWord.PosString) ? defaultPos : scriptWord.PosString);
            newPron.Properties.Add(newProperty);

            if (result.Items.ContainsKey(grapheme))
            {
                // Known word: attach the property to every pronunciation with the same
                // Symbolic value (ignoring case), or append the new pronunciation if none matches.
                bool alreadyPresent = false;
                foreach (LexiconPronunciation knownPron in result.Items[grapheme].Pronunciations)
                {
                    if (!knownPron.Symbolic.Equals(newPron.Symbolic, StringComparison.InvariantCultureIgnoreCase))
                    {
                        continue;
                    }

                    alreadyPresent = true;
                    if (!knownPron.Properties.Contains(newProperty))
                    {
                        knownPron.Properties.Add(newProperty);
                    }
                }

                if (!alreadyPresent)
                {
                    result.Items[grapheme].Pronunciations.Add(newPron);
                }
            }
            else
            {
                // First occurrence of the word: create its lexical item.
                LexicalItem freshItem = new LexicalItem(result.Language);
                freshItem.Grapheme = grapheme;
                freshItem.Pronunciations.Add(newPron);
                result.Items.Add(grapheme, freshItem);
            }
        }
    }

    return result;
}
/// <summary>
/// Merge lexicon. When a word of the merged lexicon is also found in the sub lexicon,
/// the entry of the merged lexicon is removed and the sub lexicon entries are added,
/// so the attributes of the last (sub) lexicon win.
/// </summary>
/// <param name="mergedLexicon">Main lexicon and merged lexicon; modified in place.</param>
/// <param name="subLexicon">Sub lexicon; all of its items end up in the merged lexicon.</param>
private void MergeLexiconWithKeepLastOne(Lexicon mergedLexicon, Lexicon subLexicon)
{
    Collection<string> replacedPronMessage = new Collection<string>();
    Collection<string> replacedPropertyMessage = new Collection<string>();
    Collection<string> existedWords = new Collection<string>();

    // Lower-cased keys of the replaced words, stored as a hashed set (the value is unused)
    // so the membership test in the add loop below is not a linear scan per sub lexicon
    // item. Ordinal comparison matches what Collection<string>.Contains did before.
    Dictionary<string, object> existedWordsInLower =
        new Dictionary<string, object>(StringComparer.Ordinal);

    // Dump the conflict pronunciations or properties from the merged lexicon.
    foreach (KeyValuePair<string, LexicalItem> mergedLexiconItem in mergedLexicon.Items)
    {
        string word = mergedLexiconItem.Key;
        LexicalItem newItem = subLexicon.Lookup(word, true);

        // Only words that are also found in the sub lexicon are affected.
        if (newItem == null)
        {
            continue;
        }

        // Removal is deferred: the merged lexicon is being enumerated right now.
        existedWords.Add(word);
        existedWordsInLower[word.ToLowerInvariant()] = null;

        foreach (LexiconPronunciation originalPron in mergedLexiconItem.Value.Pronunciations)
        {
            LexiconPronunciation existedPron = newItem.FindPronunciation(originalPron.Symbolic);
            if (existedPron == null)
            {
                // The old pronunciation does not exist in the new (sub) item.
                replacedPronMessage.Add(Helper.NeutralFormat(
                    "Pronunciation for word [{0}] has been removed: [{1}]",
                    mergedLexiconItem.Key,
                    originalPron.Symbolic));
                continue;
            }

            foreach (LexiconItemProperty subItemProperty in originalPron.Properties)
            {
                bool hasProperty = false;

                // Find old properties in new (sub) item.
                foreach (LexiconItemProperty itemProperty in existedPron.Properties)
                {
                    if (itemProperty.Equals(subItemProperty))
                    {
                        hasProperty = true;
                        break;
                    }
                }

                // Record the old property when the new pronunciation does not contain it.
                if (!hasProperty)
                {
                    replacedPropertyMessage.Add(Helper.NeutralFormat(
                        "Property has been replaced for word [{0}]'s pronunciation [{1}] : [{2}]",
                        word,
                        originalPron.Symbolic,
                        subItemProperty.ToString()));
                }
            }
        }
    }

    // Remove the duplicate word entries.
    foreach (string word in existedWords)
    {
        mergedLexicon.Items.Remove(word);
    }

    // Add new word entries into merged lexicon; count those that replaced nothing.
    int newWord = 0;
    foreach (KeyValuePair<string, LexicalItem> subLexiconItem in subLexicon.Items)
    {
        mergedLexicon.Items.Add(subLexiconItem.Key, subLexiconItem.Value);
        if (!existedWordsInLower.ContainsKey(subLexiconItem.Key.ToLowerInvariant()))
        {
            newWord++;
        }
    }

    // Log the message: only the totals are logged, not the individual messages.
    // NOTE(review): the "words have been replaced" line prints the number of removed
    // pronunciations (replacedPronMessage.Count), not existedWords.Count - confirm intent.
    Log("---------------------------------");
    Log("Totally:");
    Log(Helper.NeutralFormat("[{0}] words have been replaced by the latter lexicon", replacedPronMessage.Count));
    Log("[{0}] properties have been replaced.", replacedPropertyMessage.Count);
    Log(Helper.NeutralFormat("[{0}] new words have been added by the latter lexicon", newWord));
}