/// <summary>
/// Checks every char table element that has a contextual expansion against the
/// member lexicon, collects the symbols whose generated pronunciation is missing
/// from that lexicon into a new lexicon, and writes the new lexicon out.
/// </summary>
/// <param name="chartable">Char table whose elements are checked.</param>
/// <param name="posSymbol">Part of speech assigned to every generated entry.</param>
/// <param name="lexiconOutput">Output target handed to Lexicon.WriteAllData.</param>
/// <param name="errors">Receives one message per processed symbol that is reported.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when chartable, errors or posSymbol is null, or when lexiconOutput is null or empty.
/// </exception>
public void CheckContextualSymbolInLexicon(CharTable chartable, string posSymbol,
    string lexiconOutput, Collection<string> errors)
{
    if (chartable == null)
    {
        throw new ArgumentNullException("chartable");
    }

    if (errors == null)
    {
        throw new ArgumentNullException("errors");
    }

    if (posSymbol == null)
    {
        throw new ArgumentNullException("posSymbol");
    }

    // Fail fast: without this check a missing output target is only detected
    // after the whole char table has been processed.
    if (string.IsNullOrEmpty(lexiconOutput))
    {
        throw new ArgumentNullException("lexiconOutput");
    }

    Lexicon lexicon = new Lexicon(chartable.Language);

    // Shared across all elements, exactly as passed to GetPronunciationForWords.
    Collection<string> polyWord = new Collection<string>();

    foreach (CharElement charElement in chartable.CharList)
    {
        string expansion = charElement.ContextualExpansion;
        if (string.IsNullOrEmpty(expansion))
        {
            // Nothing to check for this element, so no lookup is needed either.
            continue;
        }

        string symbol = charElement.Symbol.ToString();
        LexicalItem symbolItem = _lexicon.Lookup(symbol, true);

        string pron = string.Empty;
        Collection<string> errorStrings = new Collection<string>();
        bool hasError = _lexicon.GetPronunciationForWords(expansion, errorStrings, polyWord, ref pron);
        if (hasError || string.IsNullOrEmpty(pron))
        {
            errors.Add(AttributeError.SymbolPronGenError + symbol);
            continue;
        }

        if (symbolItem != null && ContainsPhoneSequence(symbolItem, pron))
        {
            // The symbol is already in the lexicon with this pronunciation.
            errors.Add(AttributeError.InfoSymbolInLex + symbol);
            continue;
        }

        // Add the word: either the symbol or this pronunciation is not in the lexicon.
        LexiconPronunciation lexiconPron = new LexiconPronunciation(lexicon.Language);
        lexiconPron.Symbolic = pron;

        LexiconItemProperty lip = new LexiconItemProperty();
        lip.PartOfSpeech = new PosItem(posSymbol);
        lexiconPron.Properties.Add(lip);

        LexicalItem lexiconItem = new LexicalItem(lexicon.Language);
        lexiconItem.Grapheme = symbol;
        lexiconItem.Pronunciations.Add(lexiconPron);
        lexicon.Items.Add(lexiconItem.Grapheme, lexiconItem);

        if (symbolItem != null)
        {
            // The symbol exists in the lexicon, but with different pronunciations.
            errors.Add(AttributeError.SymbolDiffPronFromLex + symbol);
        }
    }

    Lexicon.WriteAllData(lexiconOutput, lexicon, Encoding.Unicode);
}

/// <summary>
/// Tells whether any pronunciation of the item splits into exactly the same
/// phone sequence as the given pronunciation.
/// </summary>
/// <param name="item">Lexical item whose pronunciations are scanned.</param>
/// <param name="pron">Pronunciation string to search for.</param>
/// <returns>True when a pronunciation with an identical phone sequence exists.</returns>
private static bool ContainsPhoneSequence(LexicalItem item, string pron)
{
    string[] phones = Pronunciation.SplitIntoPhones(pron);
    foreach (LexiconPronunciation existing in item.Pronunciations)
    {
        string[] existingPhones = Pronunciation.SplitIntoPhones(existing.Symbolic);
        if (existingPhones.Length != phones.Length)
        {
            continue;
        }

        bool same = true;
        for (int i = 0; i < phones.Length; i++)
        {
            if (existingPhones[i] != phones[i])
            {
                same = false;
                break;
            }
        }

        if (same)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Looks up a word, optionally gathering the pronunciations of its case variants.
/// </summary>
/// <param name="grapheme">Word to look up; must not be null or empty.</param>
/// <param name="ignoreCase">
/// False to return the result of the single-argument lookup unchanged; true to
/// combine the entries found for the word as given, its lower-case form, its
/// upper-case form and, for words longer than one character, its form with only
/// the first character in upper case.
/// </param>
/// <returns>
/// The found item. In ignore-case mode a newly built item, or null when no
/// pronunciation was collected.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when grapheme is null or empty.</exception>
public LexicalItem Lookup(string grapheme, bool ignoreCase)
{
    if (string.IsNullOrEmpty(grapheme))
    {
        throw new ArgumentNullException("grapheme");
    }

    if (!ignoreCase)
    {
        return Lookup(grapheme);
    }

    LexicalItem merged = new LexicalItem(this.Language);
    merged.Grapheme = grapheme;
    merged.AddRange(Lookup(grapheme));

    CultureInfo culture = CultureInfo.GetCultureInfo(Localor.LanguageToString(this.Language));

    // The lower-case form is only looked up when it differs from the input.
    string lowered = grapheme.ToLower(culture);
    if (!string.Equals(lowered, grapheme, StringComparison.Ordinal))
    {
        merged.AddRange(Lookup(lowered));
    }

    // The upper-case form is always looked up.
    string uppered = grapheme.ToUpper(culture);
    merged.AddRange(Lookup(uppered));

    if (grapheme.Length > 1)
    {
        string head = grapheme.Substring(0, 1).ToUpper(culture);
        string tail = grapheme.Substring(1).ToLower(culture);
        merged.AddRange(Lookup(head + tail));
    }

    merged.MergeDuplicatePronunciation();

    return merged.Pronunciations.Count == 0 ? null : merged;
}
/// <summary>
/// Looks up a word in every configured online dictionary and builds a lexical
/// item from the pronunciations found.
/// </summary>
/// <param name="word">Word to look up; must not be null.</param>
/// <param name="isOOV">
/// Set to false up front and then overwritten with the IsOOV value of each
/// dictionary in turn, so on return it holds the value reported by the last
/// dictionary queried.
/// NOTE(review): confirm that "last dictionary wins" is the intended meaning.
/// </param>
/// <returns>The lexical item exposed by the Item property after the lookup.</returns>
/// <exception cref="ArgumentNullException">Thrown when word is null.</exception>
/// <exception cref="InvalidDataException">
/// Thrown when a dictionary reports the word as found but supplies an empty word
/// or no pronunciation.
/// </exception>
public LexicalItem Lookup(string word, ref bool isOOV)
{
    // Validate before touching any member state; previously a null word raised
    // a NullReferenceException after _item had already been replaced.
    if (word == null)
    {
        throw new ArgumentNullException("word");
    }

    _item = new LexicalItem(_language);
    _word = word.ToLower();
    _item.Grapheme = _word;

    // Becomes true when a dictionary answers with a word other than the one queried.
    bool hasDifferentWord = false;

    // Maps each pronunciation to the parts of speech reported for it.
    Dictionary<string, Collection<string>> prons = new Dictionary<string, Collection<string>>();
    isOOV = false;

    foreach (DictionaryModel dic in _dics)
    {
        try
        {
            dic.Lookup(word);
            isOOV = dic.IsOOV;
            if (isOOV)
            {
                continue;
            }

            if (string.IsNullOrEmpty(dic.Word))
            {
                throw new InvalidDataException(Helper.NeutralFormat(
                    "Error happened when processing word \"{0}\": The word parsed from web is empty", word));
            }

            if (dic.Pronunciations.Count == 0)
            {
                throw new InvalidDataException(Helper.NeutralFormat(
                    "Error happened when processing word \"{0}\": The pronunciation parsed from web is empty", word));
            }

            DicItem item = new DicItem();
            item.Word = dic.Word;
            item.POS = dic.POS;
            item.Pronunciations = dic.Pronunciations;
            _dicItems.Add(item);

            if (!word.Equals(dic.Word, StringComparison.InvariantCultureIgnoreCase))
            {
                hasDifferentWord = true;
                continue;
            }

            foreach (string pron in dic.Pronunciations)
            {
                Collection<string> posList;
                if (!prons.TryGetValue(pron, out posList))
                {
                    posList = new Collection<string>();
                    prons.Add(pron, posList);
                }

                if (!posList.Contains(dic.POS))
                {
                    posList.Add(dic.POS);
                }
            }
        }
        finally
        {
            // Runs on every path, including when this iteration throws, so the
            // dictionary model is always reset after it has been queried.
            dic.Reset();
        }
    }

    GenerateLexicalItem(prons);
    _isMorphology = hasDifferentWord;

    return Item;
}
/// <summary>
/// Files a lexical item under its domain, creating the domain lexicon on first use.
/// </summary>
/// <param name="domainLexicons">Domain lexicons keyed by domain tag; updated in place.</param>
/// <param name="domainTag">Tag of the domain the item belongs to.</param>
/// <param name="lexItem">Item to add, keyed by its grapheme.</param>
/// <exception cref="InvalidDataException">
/// Thrown when the domain lexicon already holds an item with the same grapheme.
/// </exception>
private void FillDomainLexicalItem(Dictionary<string, Lexicon> domainLexicons,
    string domainTag, LexicalItem lexItem)
{
    Helper.ThrowIfNull(domainLexicons);
    Helper.ThrowIfNull(domainTag);
    Helper.ThrowIfNull(lexItem);

    Lexicon target;
    if (!domainLexicons.TryGetValue(domainTag, out target))
    {
        // First item of this domain: build a lexicon that copies this
        // instance's Language and Encoding.
        target = new Lexicon(Language);
        target.Encoding = Encoding;
        target.DomainTag = domainTag;
        target.Items.Add(lexItem.Grapheme, lexItem);
        domainLexicons.Add(domainTag, target);
        return;
    }

    if (target.Items.ContainsKey(lexItem.Grapheme))
    {
        throw new InvalidDataException(
            string.Format("Duplicate lexicon word \"{0}\" in \"{1}\" domain.", lexItem.Grapheme, domainTag));
    }

    target.Items.Add(lexItem.Grapheme, lexItem);
}
/// <summary>
/// Builds a lexicon from all pronounced words of an XML script file.
/// </summary>
/// <param name="scriptFile">Script file supplying the words; must not be null.</param>
/// <param name="defaultPos">Part of speech used for words without their own POS string.</param>
/// <param name="mainLexicon">
/// Optional lexicon. When a word has a matching pronunciation there, the main
/// lexicon's spelling of that pronunciation is used.
/// </param>
/// <returns>The newly created lexicon.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when scriptFile is null or defaultPos is null or empty.
/// </exception>
public static Lexicon CreateFromXmlScriptFile(XmlScriptFile scriptFile,
    string defaultPos, Lexicon mainLexicon)
{
    if (scriptFile == null)
    {
        throw new ArgumentNullException("scriptFile");
    }

    if (string.IsNullOrEmpty(defaultPos))
    {
        throw new ArgumentNullException("defaultPos");
    }

    Lexicon result = new Lexicon(scriptFile.Language);

    foreach (ScriptItem scriptItem in scriptFile.Items)
    {
        foreach (ScriptWord scriptWord in scriptItem.AllPronouncedWords)
        {
            string grapheme = scriptWord.Grapheme;

            // Pronunciation candidate for this occurrence of the word.
            LexiconPronunciation candidate = new LexiconPronunciation(result.Language);
            candidate.Symbolic = scriptWord.Pronunciation;

            LexicalItem mainEntry = null;
            if (mainLexicon != null)
            {
                mainEntry = mainLexicon.Lookup(grapheme, true);
            }

            if (mainEntry != null)
            {
                LexiconPronunciation mainPron = mainEntry.FindPronunciation(candidate.Symbolic, true);
                if (mainPron != null)
                {
                    candidate.Symbolic = mainPron.Symbolic;
                }
            }

            LexiconItemProperty property = new LexiconItemProperty();
            property.PartOfSpeech = string.IsNullOrEmpty(scriptWord.PosString)
                ? new PosItem(defaultPos)
                : new PosItem(scriptWord.PosString);
            candidate.Properties.Add(property);

            if (!result.Items.ContainsKey(grapheme))
            {
                // First occurrence of the word: create its entry.
                LexicalItem created = new LexicalItem(result.Language);
                created.Grapheme = grapheme;
                created.Pronunciations.Add(candidate);
                result.Items.Add(grapheme, created);
                continue;
            }

            // Known word: attach the property to every existing pronunciation that
            // matches (case-insensitively), or append the candidate if none matches.
            LexicalItem existingEntry = result.Items[grapheme];
            bool matched = false;
            foreach (LexiconPronunciation existing in existingEntry.Pronunciations)
            {
                if (!existing.Symbolic.Equals(candidate.Symbolic, StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                matched = true;
                if (!existing.Properties.Contains(property))
                {
                    existing.Properties.Add(property);
                }
            }

            if (!matched)
            {
                existingEntry.Pronunciations.Add(candidate);
            }
        }
    }

    return result;
}
/// <summary>
/// Loads a lexical item, including its pronunciations, from a word XML node.
/// </summary>
/// <param name="parentLexicon">Lexicon the item belongs to; supplies the language.</param>
/// <param name="wordNode">XML node of the word; expected to be an XmlElement.</param>
/// <param name="nsmgr">Namespace manager used to select the "tts:p" child nodes.</param>
/// <param name="contentController">Controls case sensitivity and history checking mode.</param>
/// <param name="errorSet">Receives the errors found while loading.</param>
/// <returns>
/// The loaded item, or null when the word text is empty, has leading or trailing
/// white space, or the item is marked deleted outside history checking mode.
/// </returns>
internal static LexicalItem Load(Lexicon parentLexicon, XmlNode wordNode,
    XmlNamespaceManager nsmgr, Lexicon.ContentControler contentController, ErrorSet errorSet)
{
    Debug.Assert(parentLexicon != null && wordNode != null && nsmgr != null &&
        contentController != null && errorSet != null);

    XmlElement wordElement = wordNode as XmlElement;

    // The cast result is dereferenced right below, so make a wrong node type
    // visible in debug builds.
    Debug.Assert(wordElement != null);

    LexicalItem lexiconItem = new LexicalItem(parentLexicon.Language);
    lexiconItem.Parent = parentLexicon;

    string grapheme = wordElement.GetAttribute("v");
    if (string.IsNullOrEmpty(grapheme))
    {
        errorSet.Add(LexiconError.InvalidWordEntry,
            new Error(WordEntryError.EmptyWord), grapheme);
        lexiconItem = null;
    }
    else if (!grapheme.Trim().Equals(grapheme, StringComparison.OrdinalIgnoreCase))
    {
        errorSet.Add(LexiconError.InvalidWordEntry,
            new Error(WordEntryError.LeadingOrTrailingSpace), grapheme);
        lexiconItem = null;
    }
    else
    {
        // Matches a tab or a run of at least two spaces. A single inner space is
        // valid and must not be reported as ContainingTabOrMultipleSpaces.
        Regex regex = new Regex(" {2,}|\t");
        if (regex.IsMatch(grapheme.Trim()))
        {
            // Reported only; the item is still loaded.
            errorSet.Add(LexiconError.InvalidWordEntry,
                new Error(WordEntryError.ContainingTabOrMultipleSpaces), grapheme);
        }
    }

    if (lexiconItem != null)
    {
        string statusValue = wordElement.GetAttribute("s");
        if (!string.IsNullOrEmpty(statusValue))
        {
            lexiconItem.Status = (Lexicon.LexiconStatus)Enum.Parse(
                typeof(Lexicon.LexiconStatus), statusValue, true);
        }

        // Items with "deleted" status are dropped unless history checking mode is on.
        if (!contentController.IsHistoryCheckingMode &&
            lexiconItem.Status == Lexicon.LexiconStatus.Deleted)
        {
            lexiconItem = null;
        }
        else
        {
            lexiconItem.Alias = wordElement.GetAttribute("alias");

            CultureInfo cultureInfo = new CultureInfo(
                Localor.LanguageToString(parentLexicon.Language), false);

            // _text keeps the word exactly as written; Grapheme is trimmed and,
            // unless the content is case sensitive, lower-cased.
            lexiconItem._text = grapheme;
            lexiconItem.Grapheme = contentController.IsCaseSensitive
                ? grapheme.Trim()
                : grapheme.Trim().ToLower(cultureInfo);
            lexiconItem.OldGrapheme = lexiconItem.Grapheme;

            // Check whether this word is reviewed.
            string reviewedValue = wordElement.GetAttribute("r");
            if (!string.IsNullOrEmpty(reviewedValue))
            {
                lexiconItem.Reviewed = bool.Parse(reviewedValue);
            }

            // Get the word's frequency; it stays zero when the attribute is
            // missing or not a number.
            int frequency = 0;
            int.TryParse(wordElement.GetAttribute("f"), out frequency);
            lexiconItem.Frequency = frequency;

            // Load comment.
            lexiconItem.Comment = wordElement.GetAttribute("c");

            if (lexiconItem.Status != Lexicon.LexiconStatus.Original)
            {
                // Get original word text.
                string originalWordText = wordElement.GetAttribute("vo");
                if (!string.IsNullOrEmpty(originalWordText))
                {
                    lexiconItem.OldGrapheme = originalWordText;
                }
            }

            foreach (XmlNode pronNode in wordNode.SelectNodes("tts:p", nsmgr))
            {
                LexiconPronunciation lexPron = LexiconPronunciation.Load(
                    lexiconItem, pronNode, nsmgr, contentController, errorSet);
                if (lexPron != null)
                {
                    lexiconItem.Pronunciations.Add(lexPron);
                }
            }
        }
    }

    return lexiconItem;
}
/// <summary>
/// Writes one text line per saved property of every valid pronunciation of a word.
/// </summary>
/// <param name="tw">Writer receiving the lines.</param>
/// <param name="graphme">Word text written as the first field of each line.</param>
/// <param name="lexItem">Lexical item whose pronunciations are written.</param>
private void SavePronunciation(TextWriter tw, string graphme, LexicalItem lexItem)
{
    foreach (LexiconPronunciation pronunciation in lexItem.Pronunciations)
    {
        if (pronunciation.Valid)
        {
            ErrorSet conversionErrors = new ErrorSet();
            string hexIds = Pronunciation.ConvertIntoHexIds(
                pronunciation.Symbolic, _ttsPhoneSet, conversionErrors);

            // Pronunciations that yield no hex ids are skipped without a line.
            if (!string.IsNullOrEmpty(hexIds))
            {
                Collection<string> attributes = new Collection<string>();
                string linePrefix = Helper.NeutralFormat(
                    "{0}\t{1}\t{2}", graphme, hexIds, "sppos=noncontent");

                Debug.Assert(pronunciation.Symbolic != null);
                Debug.Assert(pronunciation.Properties != null && pronunciation.Properties.Count > 0);

                SaveProperty(graphme, pronunciation, attributes);

                foreach (string attribute in attributes)
                {
                    tw.WriteLine(linePrefix + attribute);
                }
            }
        }
        else
        {
            // Invalid pronunciations are reported and left out of the output.
            this.ErrorSet.Add(LexiconCompilerError.RemoveInvalidPronunciation,
                graphme, pronunciation.Symbolic);
        }
    }
}
/// <summary>
/// Appends all pronunciations of another item to this item's pronunciations.
/// </summary>
/// <param name="newItem">Item supplying the pronunciations; null is ignored.</param>
public void AddRange(LexicalItem newItem)
{
    if (newItem == null)
    {
        return;
    }

    // The pronunciation objects themselves are added; no copies are made.
    foreach (LexiconPronunciation source in newItem.Pronunciations)
    {
        this.Pronunciations.Add(source);
    }
}
/// <summary>
/// Creates a copy of this word together with clones of its pronunciations.
/// </summary>
/// <returns>The cloned word; each cloned pronunciation has the clone as its parent.</returns>
public LexicalItem Clone()
{
    // Members are assigned in the order listed.
    LexicalItem copy = new LexicalItem
    {
        Alias = _alias,
        Comment = _comment,
        Frequency = _frequency,
        Language = _language,
        LexiconType = _lexiconType,
        Grapheme = _grapheme,
        OldGrapheme = _oldGrapheme,
        Reviewed = Reviewed,
        Status = Status,
        _text = _text,
        Valid = _valid,
        Parent = Parent
    };

    foreach (LexiconPronunciation original in _pronunciations)
    {
        LexiconPronunciation duplicate = original.Clone();
        copy.Pronunciations.Add(duplicate);
        duplicate.Parent = copy;
    }

    return copy;
}
/// <summary>
/// Imports the pronunciations of a domain lexical item into this item.
/// </summary>
/// <param name="domainLexItem">Domain item to import; may only contain the given domain.</param>
/// <param name="domainTag">Domain tag; lower-cased (invariant culture) before use.</param>
/// <param name="trustDomainLexicon">
/// True to add domain pronunciations that this item does not have yet; false to
/// report them as errors instead.
/// </param>
/// <returns>Errors collected during the import.</returns>
/// <exception cref="InvalidDataException">
/// Thrown when the domain item contains a domain other than domainTag.
/// </exception>
public ErrorSet ImportDomainLexicalItem(LexicalItem domainLexItem, string domainTag,
    bool trustDomainLexicon)
{
    Helper.ThrowIfNull(domainLexItem);
    Helper.ThrowIfNull(domainTag);

    if (!domainLexItem.OnlyContainsOneDomain(domainTag))
    {
        throw new InvalidDataException("It is invalid to include any other domain in property level.");
    }

    ErrorSet importError = new ErrorSet();
    domainTag = domainTag.ToLower(CultureInfo.InvariantCulture);

    // The general domain needs no p="true" tag because the order of the
    // pronunciations already expresses its priority.
    bool isFirstPron = !domainTag.Equals(DomainItem.GeneralDomain);

    // A domain that is already reviewed is left untouched.
    if (IsReviewedDomain(domainTag))
    {
        return importError;
    }

    bool imported = false;
    foreach (LexiconPronunciation domainPron in domainLexItem.Pronunciations)
    {
        // Try the search with the flag off first, then with it on.
        LexiconPronunciation existing = FindLexiconPronunciation(domainPron.Symbolic, false);
        if (existing == null)
        {
            existing = FindLexiconPronunciation(domainPron.Symbolic, true);
        }

        if (existing == null)
        {
            if (trustDomainLexicon)
            {
                // A trusted domain lexicon may contribute new pronunciations.
                _pronunciations.Add(domainPron);
            }
            else
            {
                importError.Add(LexiconError.NewDomainPronunciation,
                    domainTag, domainLexItem.Grapheme, domainPron.Symbolic);
            }

            continue;
        }

        imported |= existing.ImportDomainPronunciation(domainPron, domainTag, isFirstPron);

        // Only the first pronunciation merged into an existing one is passed
        // isFirstPron as true.
        isFirstPron = false;
    }

    if (imported)
    {
        Status = Lexicon.LexiconStatus.Original;
        Reviewed = false;
    }

    return importError;
}
/// <summary>
/// Compares two lexical items, treating a null item and a deleted item as the same.
/// </summary>
/// <param name="obj1">Object 1.</param>
/// <param name="obj2">Object 2.</param>
/// <returns>
/// True when both arguments are the same instance, when each of them is either
/// null or deleted, or when alias, grapheme, language, lexicon type, polyphonic
/// flag, status, text and all pronunciations (compared in order) are equal.
/// False otherwise, including when exactly one argument is null and the other
/// is not deleted.
/// </returns>
public static bool Equals(LexicalItem obj1, LexicalItem obj2)
{
    if (obj1 == obj2)
    {
        return true;
    }

    // (null, null), (null, deleted) and (deleted, deleted) all count as equal.
    if ((obj1 == null || obj1.Status == Lexicon.LexiconStatus.Deleted) &&
        (obj2 == null || obj2.Status == Lexicon.LexiconStatus.Deleted))
    {
        return true;
    }

    // A missing item cannot equal an existing, non-deleted one. Without this
    // guard the member comparison below dereferences null.
    if (obj1 == null || obj2 == null)
    {
        return false;
    }

    if (obj1.Alias != obj2.Alias ||
        obj1.Grapheme != obj2.Grapheme ||
        obj1.Language != obj2.Language ||
        obj1.LexiconType != obj2.LexiconType ||
        obj1.Polyphonic != obj2.Polyphonic ||
        obj1.Status != obj2.Status ||
        obj1.Text != obj2.Text)
    {
        return false;
    }

    Collection<LexiconPronunciation> prons1 = obj1.Pronunciations;
    Collection<LexiconPronunciation> prons2 = obj2.Pronunciations;

    int pronsLength = prons1.Count;
    if (pronsLength != prons2.Count)
    {
        return false;
    }

    // Pronunciations are compared position by position.
    for (int i = 0; i < pronsLength; i++)
    {
        if (!LexiconPronunciation.Equals(prons1[i], prons2[i]))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Loads a pronunciation, including its properties, from a pronunciation XML node.
/// </summary>
/// <param name="parentLexItem">Lexical item owning the pronunciation.</param>
/// <param name="pronNode">XML node of the pronunciation; expected to be an XmlElement.</param>
/// <param name="nsmgr">Namespace manager used to select the "tts:pr" child nodes.</param>
/// <param name="contentController">Controls history checking mode.</param>
/// <param name="errorSet">Receives the errors found while loading.</param>
/// <returns>
/// The loaded pronunciation, or null when it is marked deleted and history
/// checking mode is off.
/// </returns>
internal static LexiconPronunciation Load(LexicalItem parentLexItem, XmlNode pronNode,
    XmlNamespaceManager nsmgr, Lexicon.ContentControler contentController, ErrorSet errorSet)
{
    Debug.Assert(parentLexItem != null && pronNode != null && nsmgr != null &&
        contentController != null && errorSet != null);

    LexiconPronunciation lexPron = new LexiconPronunciation(parentLexItem.Language);
    lexPron.Parent = parentLexItem;

    XmlElement pronElem = pronNode as XmlElement;
    Debug.Assert(pronElem != null);

    string statusText = pronElem.GetAttribute("s");
    if (!string.IsNullOrEmpty(statusText))
    {
        lexPron.Status = (Lexicon.LexiconStatus)Enum.Parse(
            typeof(Lexicon.LexiconStatus), statusText, true);
    }

    // Pronunciations with "deleted" status are dropped unless history checking
    // mode is on.
    bool keepDeleted = contentController.IsHistoryCheckingMode;
    if (!keepDeleted && lexPron.Status == Lexicon.LexiconStatus.Deleted)
    {
        return null;
    }

    // Normalize the symbolic text: trim, collapse white-space runs to a single
    // space, then lower-case.
    lexPron.Symbolic = pronElem.GetAttribute("v").Trim();
    lexPron.Symbolic = Regex.Replace(lexPron.Symbolic, @"\s{2,}", " ").ToLowerInvariant();
    lexPron.OldSymbolic = lexPron.Symbolic;

    // Original position of the pronunciation, when recorded.
    string positionText = pronElem.GetAttribute("o");
    if (!string.IsNullOrEmpty(positionText))
    {
        lexPron.OldPosition = int.Parse(positionText, CultureInfo.InvariantCulture);
    }

    if (lexPron.Status != Lexicon.LexiconStatus.Original)
    {
        string originalText = pronElem.GetAttribute("vo");
        if (!string.IsNullOrEmpty(originalText))
        {
            lexPron.OldSymbolic = originalText;
        }
    }

    // The frequency stays zero when the attribute is missing or not a number.
    int frequency = 0;
    int.TryParse(pronElem.GetAttribute("f"), out frequency);
    lexPron.Frequency = frequency;

    foreach (XmlNode propertyNode in pronNode.SelectNodes("tts:pr", nsmgr))
    {
        LexiconItemProperty property = LexiconItemProperty.Load(
            lexPron, propertyNode, nsmgr, contentController, errorSet);
        if (property == null)
        {
            continue;
        }

        // Outside history checking mode a repeated property is reported, not added.
        bool isDuplicate = !contentController.IsHistoryCheckingMode &&
            lexPron.Properties.Contains(property);
        if (isDuplicate)
        {
            errorSet.Add(LexiconError.DuplicateProperty, parentLexItem.Text, lexPron.Symbolic);
        }
        else
        {
            lexPron.Properties.Add(property);
        }
    }

    return lexPron;
}