/// <summary>
/// Lazily parses raw sentence strings into dependency graphs.
/// </summary>
/// <param name="sentences">Raw sentence strings to tokenize, POS-tag, and parse.</param>
/// <returns>A lazily evaluated sequence with one dependency graph per input sentence.</returns>
public IEnumerable<ConcurrentDependencyGraph> RawParse(List<string> sentences)
{
    foreach (var sentence in sentences)
    {
        // Tokenize and tag first, then delegate to the tagged-token overload.
        var tokens = WordTokenizer.Tokenize(sentence);
        var taggedTokens = Tagger.BatchTag(tokens);
        yield return RawParse(taggedTokens);
    }
}
/// <summary>
/// Initializes a reader over a Bijankhan corpus file.
/// </summary>
/// <param name="bijankhanFile">Path to the Bijankhan corpus file.</param>
/// <param name="joinedVerbParts">Whether multi-part verb forms should be joined.</param>
/// <param name="posMap">POS-tag mapping used when reading the corpus.</param>
public BijankhanReader(string bijankhanFile, bool joinedVerbParts, string posMap)
{
    _bijankhanFile = bijankhanFile;
    _joinedVerbParts = joinedVerbParts;
    _posMap = posMap;

    // Normalizer flags match the original construction; their individual
    // meanings are defined by the Normalizer class — confirm there.
    normalizer = new Normalizer(true, false, true);
    tokenizer = new WordTokenizer();
}
/// <summary>
/// Builds the lemmatization lookup tables from a word list and a verb list.
/// </summary>
/// <param name="wordsFile">Path to a file containing one known word per line.</param>
/// <param name="verbsFile">Path to a file containing one verb entry ("past#present" stem pair) per line.</param>
/// <param name="joinedVerbParts">When true, also indexes joined multi-word verb forms
/// (stem + after-verb, negated stem + after-verb, and before-verb + stem combinations).</param>
public Lemmatizer(string wordsFile, string verbsFile, bool joinedVerbParts)
{
    // NOTE: removed an unused local `Stemmer stemmer` present in the original.
    this._words = new HashSet<string>();
    foreach (var line in File.ReadLines(wordsFile))
    {
        this._words.Add(line.Trim());
    }

    WordTokenizer tokenizer = new WordTokenizer(verbsFile);
    // Reversed so that earlier lines in the file win the first-insertion race below.
    var pureVerbs = new List<string>(File.ReadAllLines(verbsFile).Reverse());

    this._verbs = new Hashtable();
    this._verbs.Add("است", "#است");
    pureVerbs.ForEach(verb =>
    {
        Conjugations(verb).ForEach(tense =>
        {
            // First mapping wins: keep the earliest verb that produces this tense.
            if (!this._verbs.ContainsKey(tense))
            {
                this._verbs.Add(tense, verb);
            }
        });
    });

    if (joinedVerbParts)
    {
        pureVerbs.ForEach(verb =>
        {
            var bon = verb.Split('#')[0];
            tokenizer.AfterVerbs.ToList().ForEach(afterVerb =>
            {
                // BUG FIX: Hashtable.Add throws ArgumentException on a duplicate key,
                // so guard every compound-form insertion the same way the
                // Conjugations loop above does (first mapping wins).
                var joined = bon + "ه " + afterVerb;
                if (!this._verbs.ContainsKey(joined))
                {
                    this._verbs.Add(joined, verb);
                }
                var negated = "ن" + bon + "ه " + afterVerb;
                if (!this._verbs.ContainsKey(negated))
                {
                    this._verbs.Add(negated, verb);
                }
            });
            tokenizer.BeforeVerbs.ToList().ForEach(beforeVerb =>
            {
                var prefixed = beforeVerb + " " + bon;
                if (!this._verbs.ContainsKey(prefixed))
                {
                    this._verbs.Add(prefixed, verb);
                }
            });
        });
    }
}
/// <summary>
/// Builds the lemmatization lookup tables from a word list and a verb list.
/// </summary>
/// <param name="wordsFile">Path to a file containing one known word per line.</param>
/// <param name="verbsFile">Path to a file containing one verb entry ("past#present" stem pair) per line.</param>
/// <param name="joinedVerbParts">When true, also indexes joined multi-word verb forms
/// (stem + after-verb and before-verb + stem combinations).</param>
public Lemmatizer(string wordsFile, string verbsFile, bool joinedVerbParts)
{
    // NOTE: removed an unused local `Stemmer stemmer` present in the original.
    this._words = new HashSet<string>();
    foreach (var line in File.ReadLines(wordsFile))
        this._words.Add(line.Trim());

    WordTokenizer tokenizer = new WordTokenizer(verbsFile);
    // Reversed so that earlier lines in the file win the first-insertion race below.
    var pureVerbs = new List<string>(File.ReadAllLines(verbsFile).Reverse());

    this._verbs = new Hashtable();
    this._verbs.Add("است", "#است");
    pureVerbs.ForEach(verb =>
    {
        Conjugations(verb).ForEach(tense =>
        {
            // First mapping wins: keep the earliest verb that produces this tense.
            if (!this._verbs.ContainsKey(tense))
                this._verbs.Add(tense, verb);
        });
    });

    if (joinedVerbParts)
    {
        pureVerbs.ForEach(verb =>
        {
            string[] parts = verb.Split('#');
            tokenizer.AfterVerbs.ToList().ForEach(afterVerb =>
            {
                // BUG FIX: Hashtable.Add throws ArgumentException on a duplicate key,
                // so guard every compound-form insertion the same way the
                // Conjugations loop above does (first mapping wins).
                var joined = parts[0] + "ه " + afterVerb;
                if (!this._verbs.ContainsKey(joined))
                    this._verbs.Add(joined, verb);
            });
            tokenizer.BeforeVerbs.ToList().ForEach(beforeVerb =>
            {
                var prefixed = beforeVerb + " " + parts[0];
                if (!this._verbs.ContainsKey(prefixed))
                    this._verbs.Add(prefixed, verb);
            });
        });
    }
}