Exemple #1
0
        /// <summary>
        /// Extracts a sub lexicon covering the given words and reports the words
        /// for which no entry could be found in this lexicon.
        /// </summary>
        /// <param name="words">
        /// Words to extract. Null or empty entries, and words already placed in the
        /// sub lexicon, are skipped.
        /// </param>
        /// <param name="missedLexWords">
        /// Optional output list. When not null it is cleared first and then filled with
        /// each distinct word that was not found in this lexicon.
        /// </param>
        /// <returns>
        /// New sub lexicon carrying this lexicon's language, encoding, phone set and POS set.
        /// Each found item is stored under the word exactly as it was passed in.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="words"/> is null.</exception>
        public Lexicon ExtractSubLexicon(List<string> words, List<string> missedLexWords)
        {
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }

            Lexicon newLex = new Lexicon();
            newLex.Language = Language;
            newLex.Encoding = Encoding;
            newLex.PhoneSet = PhoneSet;
            newLex.PosSet = PosSet;

            // Dictionary used as a set so that each missed word is reported only once.
            Dictionary<string, object> missedWords = null;
            if (missedLexWords != null)
            {
                missedLexWords.Clear();
                missedWords = new Dictionary<string, object>(StringComparer.InvariantCulture);
            }

            foreach (string word in words)
            {
                if (string.IsNullOrEmpty(word) || newLex.Items.ContainsKey(word))
                {
                    continue;
                }

                // First do case sensitive lookup in this lexicon; if not found, do case
                // insensitive lookup. The lookup has to target this lexicon: the word is
                // known not to be a key of the sub lexicon at this point.
                LexicalItem wordItem = Lookup(word);
                if (wordItem == null)
                {
                    // NOTE(review): the second argument is presumably an ignore-case flag - confirm.
                    wordItem = Lookup(word, true);
                }

                if (wordItem != null)
                {
                    newLex.Items.Add(word, wordItem);
                }
                else if (missedWords != null && !missedWords.ContainsKey(word))
                {
                    missedWords.Add(word, null);
                }
            }

            if (missedLexWords != null)
            {
                missedLexWords.AddRange(missedWords.Keys);
            }

            return newLex;
        }
        /// <summary>
        /// Merges the sub lexicon into the merged lexicon; on a word conflict the entry
        /// already present in the merged lexicon is kept unchanged.
        /// </summary>
        /// <remarks>
        /// Words for which the merged lexicon's lookup returns nothing are added as they are.
        /// For conflicting words, every sub lexicon pronunciation that the existing entry
        /// does not contain is counted as skipped. A summary is written through Log.
        /// </remarks>
        /// <param name="mergedLexicon">Main lexicon, which also receives the merge result.</param>
        /// <param name="subLexicon">Sub lexicon whose entries are merged in.</param>
        private void MergeLexiconWithKeepFirstOne(Lexicon mergedLexicon, Lexicon subLexicon)
        {
            int addedWordCount = 0;
            Collection<string> skippedMessages = new Collection<string>();

            foreach (KeyValuePair<string, LexicalItem> entry in subLexicon.Items)
            {
                LexicalItem existingItem = mergedLexicon.Lookup(entry.Key, true);
                if (existingItem != null)
                {
                    // Conflict: the existing entry wins, so only record what gets dropped.
                    foreach (LexiconPronunciation candidate in entry.Value.Pronunciations)
                    {
                        if (existingItem.ContainsPronunciation(candidate.Symbolic))
                        {
                            continue;
                        }

                        string message = Helper.NeutralFormat(
                            "Pronunciation for word [{0}] has been skipped: [{1}]",
                            entry.Key,
                            candidate.Symbolic);
                        skippedMessages.Add(message);
                    }
                }
                else
                {
                    // Unknown word: take the whole sub lexicon entry.
                    mergedLexicon.Items.Add(entry.Key, entry.Value);
                    addedWordCount++;
                }
            }

            // Write the summary.
            Log("---------------------------------");
            Log("Totally:");
            Log("[{0}] pronunciations have been skipped.", skippedMessages.Count);

            string addedSummary = Helper.NeutralFormat(
                "[{0}] new words have been added by the latter lexicon",
                addedWordCount);
            Log(addedSummary);
        }
Exemple #3
0
        /// <summary>
        /// Creates a lexicon from the pronounced words of an Xml script file.
        /// </summary>
        /// <remarks>
        /// Each pronounced word contributes one pronunciation with one part-of-speech
        /// property. Words seen more than once are merged into a single lexical item:
        /// a pronunciation whose symbolic form already exists (compared ignoring case)
        /// only contributes its property, otherwise it is added as a new pronunciation.
        /// </remarks>
        /// <param name="scriptFile">Xml script file.</param>
        /// <param name="defaultPos">Part of speech string used for words that carry none.</param>
        /// <param name="mainLexicon">
        /// Optional main lexicon. When it has an entry for the word with a pronunciation
        /// found by FindPronunciation, that pronunciation's symbolic form is used instead
        /// of the one from the script.
        /// </param>
        /// <returns>Lexicon built from the script file, in the script file's language.</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="scriptFile"/> is null or <paramref name="defaultPos"/> is null or empty.
        /// </exception>
        public static Lexicon CreateFromXmlScriptFile(XmlScriptFile scriptFile, string defaultPos, Lexicon mainLexicon)
        {
            if (scriptFile == null)
            {
                throw new ArgumentNullException("scriptFile");
            }

            if (string.IsNullOrEmpty(defaultPos))
            {
                throw new ArgumentNullException("defaultPos");
            }

            Lexicon lexicon = new Lexicon(scriptFile.Language);
            foreach (ScriptItem item in scriptFile.Items)
            {
                foreach (ScriptWord scriptWord in item.AllPronouncedWords)
                {
                    string word = scriptWord.Grapheme;

                    // Create LexiconPronunciation node.
                    LexiconPronunciation pron = new LexiconPronunciation(lexicon.Language);
                    pron.Symbolic = scriptWord.Pronunciation;

                    if (mainLexicon != null)
                    {
                        // Prefer the main lexicon's form of the pronunciation when it has one.
                        // NOTE(review): the "true" arguments are presumably ignore-case flags - confirm.
                        LexicalItem mainLexiconItem = mainLexicon.Lookup(word, true);
                        if (mainLexiconItem != null)
                        {
                            LexiconPronunciation lexPron = mainLexiconItem.FindPronunciation(pron.Symbolic, true);
                            if (lexPron != null)
                            {
                                pron.Symbolic = lexPron.Symbolic;
                            }
                        }
                    }

                    LexiconItemProperty property = new LexiconItemProperty();
                    if (string.IsNullOrEmpty(scriptWord.PosString))
                    {
                        property.PartOfSpeech = new PosItem(defaultPos);
                    }
                    else
                    {
                        property.PartOfSpeech = new PosItem(scriptWord.PosString);
                    }

                    pron.Properties.Add(property);

                    // Single lookup instead of ContainsKey followed by repeated indexer access.
                    LexicalItem lexicalItem;
                    if (!lexicon.Items.TryGetValue(word, out lexicalItem))
                    {
                        lexicalItem = new LexicalItem(lexicon.Language);
                        lexicalItem.Grapheme = word;
                        lexicalItem.Pronunciations.Add(pron);
                        lexicon.Items.Add(word, lexicalItem);
                        continue;
                    }

                    bool needAdd = true;
                    foreach (LexiconPronunciation pronunciation in lexicalItem.Pronunciations)
                    {
                        // Static Equals tolerates a null symbolic form, which occurs when
                        // the script word has no pronunciation.
                        if (string.Equals(pronunciation.Symbolic, pron.Symbolic, StringComparison.InvariantCultureIgnoreCase))
                        {
                            needAdd = false;
                            if (!pronunciation.Properties.Contains(property))
                            {
                                pronunciation.Properties.Add(property);
                            }
                        }
                    }

                    if (needAdd)
                    {
                        lexicalItem.Pronunciations.Add(pron);
                    }
                }
            }

            return lexicon;
        }
        /// <summary>
        /// Merges the sub lexicon into the merged lexicon; on a word conflict the entry
        /// of the sub lexicon replaces the one in the merged lexicon.
        /// </summary>
        /// <remarks>
        /// Every merged lexicon word for which the sub lexicon's lookup returns an entry is
        /// removed, then all sub lexicon entries are added. Pronunciations and properties of
        /// the removed entries that the replacing entry lacks are counted, and a summary is
        /// written through Log.
        /// </remarks>
        /// <param name="mergedLexicon">Main lexicon, which also receives the merge result.</param>
        /// <param name="subLexicon">Sub lexicon whose entries take precedence.</param>
        private void MergeLexiconWithKeepLastOne(Lexicon mergedLexicon, Lexicon subLexicon)
        {
            Collection<string> replacedPronMessage = new Collection<string>();
            Collection<string> replacedPropertyMessage = new Collection<string>();
            Collection<string> existedWords = new Collection<string>();

            // Lower-cased forms of the replaced words. Kept as dictionary keys (values unused)
            // so the membership test in the add loop below does not scan a list per entry.
            Dictionary<string, object> existedWordsInLower = new Dictionary<string, object>();

            // Dump the conflict pronunciations or properties from the merged lexicon.
            foreach (KeyValuePair<string, LexicalItem> mergedLexiconItem in mergedLexicon.Items)
            {
                string word = mergedLexiconItem.Key;

                // NOTE(review): the second argument is presumably an ignore-case flag - confirm.
                LexicalItem newItem = subLexicon.Lookup(word, true);
                if (newItem == null)
                {
                    continue;
                }

                // The sub lexicon has an entry for this word, so the original one is removed
                // later; removal is deferred because Items is being enumerated here.
                existedWords.Add(word);
                existedWordsInLower[word.ToLowerInvariant()] = null;

                foreach (LexiconPronunciation originalPron in mergedLexiconItem.Value.Pronunciations)
                {
                    LexiconPronunciation existedPron = newItem.FindPronunciation(originalPron.Symbolic);
                    if (existedPron == null)
                    {
                        replacedPronMessage.Add(Helper.NeutralFormat(
                            "Pronunciation for word [{0}] has been removed: [{1}]",
                            mergedLexiconItem.Key, originalPron.Symbolic));
                        continue;
                    }

                    foreach (LexiconItemProperty originalProperty in originalPron.Properties)
                    {
                        bool hasProperty = false;

                        // Find the old property in the new (sub) item's pronunciation.
                        foreach (LexiconItemProperty itemProperty in existedPron.Properties)
                        {
                            if (itemProperty.Equals(originalProperty))
                            {
                                hasProperty = true;
                                break;
                            }
                        }

                        // Record the property when the new pronunciation does not carry it.
                        if (!hasProperty)
                        {
                            replacedPropertyMessage.Add(Helper.NeutralFormat(
                                "Property has been replaced for word [{0}]'s pronunciation [{1}] : [{2}]",
                                word, originalPron.Symbolic, originalProperty.ToString()));
                        }
                    }
                }
            }

            // Remove the duplicate word entries.
            foreach (string word in existedWords)
            {
                mergedLexicon.Items.Remove(word);
            }

            // Add the sub lexicon entries into the merged lexicon; only those that did not
            // replace an existing word count as new.
            int newWord = 0;
            foreach (KeyValuePair<string, LexicalItem> subLexiconItem in subLexicon.Items)
            {
                mergedLexicon.Items.Add(subLexiconItem.Key, subLexiconItem.Value);
                if (!existedWordsInLower.ContainsKey(subLexiconItem.Key.ToLowerInvariant()))
                {
                    newWord++;
                }
            }

            // Log the summary. The first figure counts removed pronunciations, not words,
            // so the message names pronunciations.
            Log("---------------------------------");
            Log("Totally:");
            Log(Helper.NeutralFormat("[{0}] pronunciations have been replaced by the latter lexicon", replacedPronMessage.Count));
            Log("[{0}] properties have been replaced.", replacedPropertyMessage.Count);
            Log(Helper.NeutralFormat("[{0}] new words have been added by the latter lexicon", newWord));
        }