/// <summary>
/// Adds a word to the in-memory tree and writes it to the given file.
/// </summary>
/// <param name="word">Word</param>
/// <param name="freq">Word's usage frequency</param>
/// <param name="posTag">Word's POS tag</param>
/// <param name="fileName">File name handed to <c>AddWordToFile</c></param>
/// <returns>
/// The result of <c>AddWordToFile</c>. Returns false when the word is already present
/// with the same frequency and a POS that has <paramref name="posTag"/>, and also
/// when any exception is thrown along the way (exceptions are swallowed here).
/// </returns>
public bool AddWord(string word, int freq, PersianPOSTag posTag, string fileName)
{
    try
    {
        int knownFreq;
        PersianPOSTag knownPos;
        bool alreadyKnown = this.Contain(word, out knownFreq, out knownPos);

        if (alreadyKnown)
        {
            bool sameEntry = knownPos.Has(posTag) && knownFreq == freq;
            if (sameEntry)
            {
                // Nothing would change, so report that no word was added.
                return false;
            }

            // The stored entry differs: drop it from the file before re-adding.
            RemoveFromFile(word);
        }

        string posText = posTag.ToString();
        AddWordToMemory(word, freq, posText);
        return AddWordToFile(word, freq, posText, fileName);
    }
    catch (Exception)
    {
        // Any failure is reported to the caller as "not added".
        return false;
    }
}
/// <summary>
/// Add a word to tree without checking whether it already exists.
/// Only <c>AddWordToMemory</c> is called; nothing is written to a file here.
/// </summary>
/// <param name="word">Word</param>
/// <param name="freq">Word's usage frequency</param>
/// <param name="posTag">Word's pos</param>
/// <returns>Always true; any exception from <c>AddWordToMemory</c> propagates to the caller.</returns>
public bool AddWordBlind(string word, int freq, PersianPOSTag posTag)
{
    // Deliberately no try/catch: the previous "catch (Exception ex) { throw ex; }"
    // only reset the stack trace of the propagated exception and handled nothing.
    AddWordToMemory(word, freq, posTag.ToString());
    return true;
}
/// <summary>
/// Creates a token with all of its attributes set.
/// </summary>
/// <param name="lexeme">Value stored in <c>Lexeme</c></param>
/// <param name="persianPOSTag">Value stored in <c>POSTag</c></param>
/// <param name="lemma">Value stored in <c>Lemma</c></param>
/// <param name="numberType">Value stored in <c>Number</c></param>
/// <param name="length">Value stored in <c>Length</c></param>
/// <param name="startPos">Value stored in <c>StartPos</c></param>
/// <param name="person">Value stored in <c>Person</c></param>
public Token(string lexeme, PersianPOSTag persianPOSTag, string lemma, NumberType numberType, int length, int startPos, ENUM_TENSE_PERSON person)
{
    // Each member is assigned exactly once; the earlier version assigned
    // Lexeme, POSTag and Lemma twice with the same values.
    Lexeme = lexeme;
    POSTag = persianPOSTag;
    Lemma = lemma;
    Number = numberType;
    Length = length;
    StartPos = startPos;
    Person = person;
}
/// <summary>
/// Looks a word up and reports its POS tag.
/// </summary>
/// <param name="word">Word</param>
/// <param name="posTag">
/// Receives the POS tag of the matching node, or <c>PersianPOSTag.UserPOS</c>
/// when the word is not found.
/// </param>
/// <returns>
/// True when <c>IndexOf</c> yields a node that is marked as end of a word; otherwise false.
/// </returns>
public bool Contain(string word, out PersianPOSTag posTag)
{
    NodeWithFreqandPOS node = IndexOf(word);

    // A node only counts as a hit when it terminates a word.
    bool found = node != null && node.IsEndOfWord;

    posTag = found
        ? node.POSTag.ToEnum<PersianPOSTag>()
        : PersianPOSTag.UserPOS;

    return found;
}
/// <summary>
/// Adds a user-selected word to the final list, deriving its POS tag from the
/// suffix that remains after removing the selected word's length from the front
/// of the original word.
/// </summary>
/// <param name="userSelectedWord">Form of the word which the user selected to add to the dictionary</param>
/// <param name="originalWord">Original word without lemmatization</param>
/// <remarks>
/// The frequency passed on is read from <c>m_wordList[userSelectedWord]</c>.
/// With an empty suffix the tag is <c>PersianPOSTag.UserPOS</c>; otherwise it is the
/// POS accepted by the suffix category, with <c>UserPOS</c> set on it.
/// </remarks>
private void AddToDictionary(string userSelectedWord, string originalWord)
{
    PersianPOSTag tagToStore = PersianPOSTag.UserPOS;

    // Whatever follows the first userSelectedWord.Length characters is treated as the suffix.
    string trailing = originalWord.Remove(0, userSelectedWord.Length);

    if (trailing.Length > 0)
    {
        PersianSuffixesCategory category = InflectionAnalyser.SuffixCategory(trailing);
        PersianPOSTag accepted = InflectionAnalyser.AcceptingPOS(category);
        tagToStore = accepted.Set(PersianPOSTag.UserPOS);
    }

    AddWordToFinalList(userSelectedWord, m_wordList[userSelectedWord], tagToStore);
}
/// <summary>
/// Inserts a word into <c>m_finalList</c>, or merges it into the existing entry.
/// </summary>
/// <param name="word">Word used as the key</param>
/// <param name="freq">Frequency stored for a new entry (not used when the word already exists)</param>
/// <param name="pos">POS tag stored for a new entry, or set on the existing entry's tag</param>
private void AddWordToFinalList(string word, int freq, PersianPOSTag pos)
{
    if (!m_finalList.ContainsKey(word))
    {
        // First sighting: store the values as given.
        FreqPOSPair created;
        created.freq = freq;
        created.pos = pos;
        m_finalList.Add(word, created);
        return;
    }

    // Known word: bump its stored frequency by one and combine the POS tags.
    FreqPOSPair current = m_finalList[word];
    FreqPOSPair merged;
    merged.freq = current.freq + 1;
    merged.pos = current.pos.Set(pos);
    m_finalList[word] = merged;
}
/// <summary>
/// Picks a single POS tag out of a combination of possible tags.
/// </summary>
/// <param name="possibletags">Combination of candidate tags, tested bitwise</param>
/// <returns>
/// The first tag, in the order N, P, PUNC, V, AJ, CONJ, NUM, whose bits are all
/// present in <paramref name="possibletags"/>; <c>PersianPOSTag.UserPOS</c> when none match.
/// </returns>
private static PersianPOSTag GetMostFrequent(PersianPOSTag possibletags)
{
    // In order of frequency
    if ((possibletags & PersianPOSTag.N) == PersianPOSTag.N)
    {
        return PersianPOSTag.N;
    }
    if ((possibletags & PersianPOSTag.P) == PersianPOSTag.P)
    {
        return PersianPOSTag.P;
    }
    if ((possibletags & PersianPOSTag.PUNC) == PersianPOSTag.PUNC)
    {
        return PersianPOSTag.PUNC;
    }
    if ((possibletags & PersianPOSTag.V) == PersianPOSTag.V)
    {
        return PersianPOSTag.V;
    }
    if ((possibletags & PersianPOSTag.AJ) == PersianPOSTag.AJ)
    {
        return PersianPOSTag.AJ;
    }
    if ((possibletags & PersianPOSTag.CONJ) == PersianPOSTag.CONJ)
    {
        return PersianPOSTag.CONJ;
    }
    // NUM is checked once; a second, identical NUM check used to follow and was unreachable.
    if ((possibletags & PersianPOSTag.NUM) == PersianPOSTag.NUM)
    {
        return PersianPOSTag.NUM;
    }

    return PersianPOSTag.UserPOS;
}
/// <summary>
/// Returns the lemma of a token together with a POS tag for it.
/// In fact, it's just a simple stemmer, not a lemmatizer.
/// </summary>
/// <param name="token">Token to analyse; also the key under which the result is cached</param>
/// <param name="posTag">Receives the POS tag chosen for the token</param>
/// <returns>
/// The base word of the first suffix match (scanning from the last match to the first)
/// that is found in <c>Mapper</c>; the token itself when no base word is found there.
/// </returns>
/// <remarks>
/// Results are cached in <c>m_LemmaDic</c>. When no base word is in <c>Mapper</c>, the
/// tag is the guess made from the suffix of the last candidate visited, or
/// <c>PersianPOSTag.UserPOS</c> when there are no candidates at all.
/// </remarks>
private string GetLemma(string token, out PersianPOSTag posTag)
{
    // Single lookup instead of ContainsKey followed by the indexer.
    KeyValuePair<string, PersianPOSTag> cached;
    if (m_LemmaDic.TryGetValue(token, out cached))
    {
        posTag = cached.Value;
        return cached.Key;
    }

    string lemma = token;
    posTag = PersianPOSTag.UserPOS;

    var rpmpis = _lemmatizer.MatchForSuffix(token);
    for (int index = rpmpis.Length - 1; index >= 0; index--)
    {
        var rpmpi = rpmpis[index];

        PersianPOSTag mappedTag;
        if (Mapper.TryGetValue(rpmpi.BaseWord, out mappedTag))
        {
            lemma = rpmpi.BaseWord;

            // Manual rules go here, probably from a separate formatted file
            if (mappedTag == PersianPOSTag.N && rpmpi.Suffix == "ی")
            {
                posTag = PersianPOSTag.AJ;
            }
            else
            {
                posTag = mappedTag;
            }
            break;
        }

        // Unknown base word: guess from the suffix and keep scanning.
        PersianPOSTag possibletags = _lemmatizer.AcceptingPOS(_lemmatizer.SuffixCategory(rpmpi.Suffix));
        posTag = GetMostFrequent(possibletags);
    }

    m_LemmaDic.Add(token, new KeyValuePair<string, PersianPOSTag>(lemma, posTag));
    return lemma;
}
/// <summary>
/// Creates a token from its lexeme, POS tag and lemma.
/// </summary>
/// <param name="lexeme">Value stored in <c>Lexeme</c></param>
/// <param name="persianPOSTag">Value stored in <c>POSTag</c></param>
/// <param name="lemma">Value stored in <c>Lemma</c></param>
public Token(string lexeme, PersianPOSTag persianPOSTag, string lemma)
{
    this.Lexeme = lexeme;
    this.POSTag = persianPOSTag;
    this.Lemma = lemma;
}
/// <summary>
/// Returns the lemma of a token together with a POS tag for it.
/// In fact, it's just a simple stemmer, not a lemmatizer.
/// </summary>
/// <param name="token">Token to analyse; also the key under which the result is cached</param>
/// <param name="posTag">Receives the POS tag chosen for the token</param>
/// <returns>
/// The base word of the first suffix match (scanning from the last match to the first)
/// that is found in <c>Mapper</c>; the token itself when no base word is found there.
/// </returns>
/// <remarks>
/// Results are cached in <c>m_LemmaDic</c>. When no base word is in <c>Mapper</c>, the
/// tag is the guess made from the suffix of the last candidate visited, or
/// <c>PersianPOSTag.UserPOS</c> when there are no candidates at all.
/// </remarks>
private string GetLemma(string token, out PersianPOSTag posTag)
{
    // One cache probe via TryGetValue rather than ContainsKey plus indexer.
    KeyValuePair<string, PersianPOSTag> cachedEntry;
    if (m_LemmaDic.TryGetValue(token, out cachedEntry))
    {
        posTag = cachedEntry.Value;
        return cachedEntry.Key;
    }

    string lemma = token;
    posTag = PersianPOSTag.UserPOS;

    var rpmpis = _lemmatizer.MatchForSuffix(token);
    for (int index = rpmpis.Length - 1; index >= 0; index--)
    {
        var rpmpi = rpmpis[index];

        PersianPOSTag baseWordTag;
        if (Mapper.TryGetValue(rpmpi.BaseWord, out baseWordTag))
        {
            lemma = rpmpi.BaseWord;

            // Manual rules go here, probably from a separate formatted file
            if (baseWordTag == PersianPOSTag.N && rpmpi.Suffix == "ی")
            {
                posTag = PersianPOSTag.AJ;
            }
            else
            {
                posTag = baseWordTag;
            }
            break;
        }

        // Base word not in Mapper: fall back to a suffix-based guess and continue.
        PersianPOSTag possibletags = _lemmatizer.AcceptingPOS(_lemmatizer.SuffixCategory(rpmpi.Suffix));
        posTag = GetMostFrequent(possibletags);
    }

    m_LemmaDic.Add(token, new KeyValuePair<string, PersianPOSTag>(lemma, posTag));
    return lemma;
}
/// <summary>
/// Picks a single POS tag out of a combination of possible tags.
/// </summary>
/// <param name="possibletags">Combination of candidate tags, tested bitwise</param>
/// <returns>
/// The first tag, in the order N, P, PUNC, V, AJ, CONJ, NUM, whose bits are all
/// present in <paramref name="possibletags"/>; <c>PersianPOSTag.UserPOS</c> when none match.
/// </returns>
private static PersianPOSTag GetMostFrequent(PersianPOSTag possibletags)
{
    // In order of frequency
    if ((possibletags & PersianPOSTag.N) == PersianPOSTag.N)
    {
        return PersianPOSTag.N;
    }
    if ((possibletags & PersianPOSTag.P) == PersianPOSTag.P)
    {
        return PersianPOSTag.P;
    }
    if ((possibletags & PersianPOSTag.PUNC) == PersianPOSTag.PUNC)
    {
        return PersianPOSTag.PUNC;
    }
    if ((possibletags & PersianPOSTag.V) == PersianPOSTag.V)
    {
        return PersianPOSTag.V;
    }
    if ((possibletags & PersianPOSTag.AJ) == PersianPOSTag.AJ)
    {
        return PersianPOSTag.AJ;
    }
    if ((possibletags & PersianPOSTag.CONJ) == PersianPOSTag.CONJ)
    {
        return PersianPOSTag.CONJ;
    }
    // The NUM test appeared twice in a row; the second copy could never be reached.
    if ((possibletags & PersianPOSTag.NUM) == PersianPOSTag.NUM)
    {
        return PersianPOSTag.NUM;
    }

    return PersianPOSTag.UserPOS;
}