/// <summary>
/// Builds the lookup tables used by <c>Lemmatize</c>: a set of known words and a
/// table mapping verb forms to the verb entry they were derived from.
/// </summary>
/// <param name="wordsFile">Path of a text file read line by line; every trimmed line is stored as a known word.</param>
/// <param name="verbsFile">Path of a text file with one verb entry per line. The same path is handed to <c>WordTokenizer</c>.</param>
/// <param name="joinedVerbParts">
/// When true, multi-part forms combining each verb's first '#'-separated part with the
/// tokenizer's <c>AfterVerbs</c> and <c>BeforeVerbs</c> are registered as well.
/// </param>
public Lemmatizer(string wordsFile, string verbsFile, bool joinedVerbParts)
{
    this._words = new HashSet<string>();
    foreach (var line in File.ReadLines(wordsFile))
    {
        this._words.Add(line.Trim());
    }

    WordTokenizer tokenizer = new WordTokenizer(verbsFile);

    // The verb entries are processed in reverse file order.
    var pureVerbs = new List<string>(File.ReadAllLines(verbsFile).Reverse());

    this._verbs = new Hashtable();
    this._verbs.Add("است", "#است");

    // The first entry registered for a form is kept; later ones are ignored.
    foreach (var verb in pureVerbs)
    {
        foreach (var tense in Conjugations(verb))
        {
            if (!this._verbs.ContainsKey(tense))
            {
                this._verbs.Add(tense, verb);
            }
        }
    }

    if (joinedVerbParts)
    {
        // Materialize once instead of once per verb.
        var afterVerbs = tokenizer.AfterVerbs.ToList();
        var beforeVerbs = tokenizer.BeforeVerbs.ToList();

        foreach (var verb in pureVerbs)
        {
            var bon = verb.Split('#')[0];

            foreach (var afterVerb in afterVerbs)
            {
                // Hashtable.Add throws on an existing key, so guard every insert the same
                // way the conjugation loop does; entries that share the same first part
                // would otherwise abort construction.
                var joined = bon + "ه " + afterVerb;
                if (!this._verbs.ContainsKey(joined))
                {
                    this._verbs.Add(joined, verb);
                }

                var negated = "ن" + bon + "ه " + afterVerb;
                if (!this._verbs.ContainsKey(negated))
                {
                    this._verbs.Add(negated, verb);
                }
            }

            foreach (var beforeVerb in beforeVerbs)
            {
                var joined = beforeVerb + " " + bon;
                if (!this._verbs.ContainsKey(joined))
                {
                    this._verbs.Add(joined, verb);
                }
            }
        }
    }
}
/// <summary>
/// Returns the lemma of <paramref name="word"/>, optionally guided by a part-of-speech tag.
/// </summary>
/// <param name="word">The surface form to look up.</param>
/// <param name="pos">
/// Part-of-speech tag. The verb table is consulted only when this is null, empty, or "V".
/// </param>
/// <returns>
/// The verb-table entry for the word when the tag allows it and one exists; otherwise the word
/// itself if it is a known word; otherwise its stem if the stem is a known word; otherwise the
/// word unchanged.
/// </returns>
public string Lemmatize(string word, string pos)
{
    // A null tag is treated like an empty one instead of dereferencing it.
    if (string.IsNullOrEmpty(pos) || pos.Equals("V"))
    {
        // Single lookup: the Hashtable indexer yields null when the key is absent.
        object verbLemma = this._verbs[word];
        if (verbLemma != null)
        {
            return verbLemma.ToString();
        }
    }

    if (this._words.Contains(word))
    {
        return word;
    }

    var stem = new Stemmer().Stem(word);
    if (this._words.Contains(stem))
    {
        return stem;
    }

    return word;
}
/// <summary>
/// Loads the known-word set and the verb-form table that <c>Lemmatize</c> consults.
/// </summary>
/// <param name="wordsFile">Path of a text file; each line is trimmed and stored as a known word.</param>
/// <param name="verbsFile">Path of a text file with one verb entry per line. Also passed to <c>WordTokenizer</c>.</param>
/// <param name="joinedVerbParts">
/// When true, forms joining each verb's first '#'-separated part with the tokenizer's
/// <c>AfterVerbs</c> and <c>BeforeVerbs</c> are added to the verb table.
/// </param>
public Lemmatizer(string wordsFile, string verbsFile, bool joinedVerbParts)
{
    this._words = new HashSet<string>();
    foreach (var line in File.ReadLines(wordsFile))
    {
        this._words.Add(line.Trim());
    }

    WordTokenizer tokenizer = new WordTokenizer(verbsFile);

    // Verb entries are handled in reverse file order.
    var pureVerbs = new List<string>(File.ReadAllLines(verbsFile).Reverse());

    this._verbs = new Hashtable();
    this._verbs.Add("است", "#است");

    // An already-registered form is never replaced.
    foreach (var verb in pureVerbs)
    {
        foreach (var tense in Conjugations(verb))
        {
            if (!this._verbs.ContainsKey(tense))
            {
                this._verbs.Add(tense, verb);
            }
        }
    }

    if (joinedVerbParts)
    {
        // Take the lists once; they do not depend on the verb being processed.
        var afterVerbs = tokenizer.AfterVerbs.ToList();
        var beforeVerbs = tokenizer.BeforeVerbs.ToList();

        foreach (var verb in pureVerbs)
        {
            string[] parts = verb.Split('#');

            foreach (var afterVerb in afterVerbs)
            {
                // Guarded like the conjugation loop: an unguarded Hashtable.Add throws
                // when two entries produce the same joined form.
                var joined = parts[0] + "ه " + afterVerb;
                if (!this._verbs.ContainsKey(joined))
                {
                    this._verbs.Add(joined, verb);
                }
            }

            foreach (var beforeVerb in beforeVerbs)
            {
                var joined = beforeVerb + " " + parts[0];
                if (!this._verbs.ContainsKey(joined))
                {
                    this._verbs.Add(joined, verb);
                }
            }
        }
    }
}
/// <summary>
/// Looks up the lemma for <paramref name="word"/>.
/// </summary>
/// <param name="word">The form to lemmatize.</param>
/// <param name="pos">
/// Part-of-speech tag; verb forms are only considered when it is null, empty, or "V".
/// </param>
/// <returns>
/// In order of preference: the matching verb-table entry (if the tag permits), the word itself
/// when it is in the word set, the stem when the stem is in the word set, or the word as given.
/// </returns>
public string Lemmatize(string word, string pos)
{
    // Null is handled like "no tag" rather than causing a NullReferenceException.
    bool verbAllowed = string.IsNullOrEmpty(pos) || pos.Equals("V");
    if (verbAllowed)
    {
        // One table access instead of ContainsKey followed by the indexer;
        // a missing key reads back as null.
        object entry = this._verbs[word];
        if (entry != null)
        {
            return entry.ToString();
        }
    }

    if (this._words.Contains(word))
    {
        return word;
    }

    var stem = new Stemmer().Stem(word);
    if (this._words.Contains(stem))
    {
        return stem;
    }

    return word;
}