Exemplo n.º 1
0
        /// <summary>
        /// Builds the word set and the verb lookup table used by this lemmatizer.
        /// </summary>
        /// <param name="wordsFile">
        /// Path to a text file read line by line; every line is trimmed and stored as a known word.
        /// </param>
        /// <param name="verbsFile">
        /// Path to a text file with one verb entry per line. The file is also handed to
        /// <c>WordTokenizer</c>. The text before the first '#' of an entry is used as the
        /// stem when building joined forms.
        /// </param>
        /// <param name="joinedVerbParts">
        /// When <c>true</c>, multi-token forms combining each stem with the tokenizer's
        /// <c>AfterVerbs</c> and <c>BeforeVerbs</c> entries are registered as well.
        /// </param>
        public Lemmatizer(string wordsFile, string verbsFile, bool joinedVerbParts)
        {
            this._words = new HashSet<string>();
            foreach (var line in File.ReadLines(wordsFile))
            {
                this._words.Add(line.Trim());
            }

            WordTokenizer tokenizer = new WordTokenizer(verbsFile);

            // Entries are visited in reverse file order. Because an existing key is never
            // overwritten below, the entry visited first (the one later in the file) wins.
            var pureVerbs = new List<string>(File.ReadAllLines(verbsFile).Reverse());

            this._verbs = new Hashtable();
            this._verbs.Add("است", "#است");
            foreach (var verb in pureVerbs)
            {
                foreach (var tense in Conjugations(verb))
                {
                    if (!this._verbs.ContainsKey(tense))
                    {
                        this._verbs.Add(tense, verb);
                    }
                }
            }

            if (joinedVerbParts)
            {
                foreach (var verb in pureVerbs)
                {
                    var bon = verb.Split('#')[0];

                    // Hashtable.Add throws ArgumentException on a duplicate key, so every
                    // joined form gets the same "keep the first mapping" guard that the
                    // conjugation loop uses. Without it, two entries sharing the same stem
                    // would abort construction.
                    foreach (var afterVerb in tokenizer.AfterVerbs)
                    {
                        var joined = bon + "ه " + afterVerb;
                        if (!this._verbs.ContainsKey(joined))
                        {
                            this._verbs.Add(joined, verb);
                        }

                        var negatedJoined = "ن" + bon + "ه " + afterVerb;
                        if (!this._verbs.ContainsKey(negatedJoined))
                        {
                            this._verbs.Add(negatedJoined, verb);
                        }
                    }

                    foreach (var beforeVerb in tokenizer.BeforeVerbs)
                    {
                        var joined = beforeVerb + " " + bon;
                        if (!this._verbs.ContainsKey(joined))
                        {
                            this._verbs.Add(joined, verb);
                        }
                    }
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Returns the lemma of <paramref name="word"/>, falling back to the word itself.
        /// </summary>
        /// <param name="word">The token to look up.</param>
        /// <param name="pos">
        /// Part-of-speech tag. The verb table is consulted only when this is null, empty
        /// or exactly "V".
        /// </param>
        /// <returns>
        /// The verb-table entry for the word when the tag allows a verb lookup and the word
        /// is a key there; otherwise the word itself if it is in the word set; otherwise its
        /// stem if the stem is in the word set; otherwise the word unchanged.
        /// </returns>
        public string Lemmatize(string word, string pos)
        {
            // A null tag is treated like an empty one instead of failing with a
            // NullReferenceException on pos.Length.
            bool verbLookupAllowed = string.IsNullOrEmpty(pos) || pos.Equals("V");
            if (verbLookupAllowed && this._verbs.ContainsKey(word))
            {
                return this._verbs[word].ToString();
            }

            if (this._words.Contains(word))
            {
                return word;
            }

            var stem = new Stemmer().Stem(word);

            if (this._words.Contains(stem))
            {
                return stem;
            }

            return word;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Loads the known-word set and fills the verb lookup table.
        /// </summary>
        /// <param name="wordsFile">
        /// Path to a text file; each line is trimmed and added to the known-word set.
        /// </param>
        /// <param name="verbsFile">
        /// Path to a text file holding one verb entry per line; it is also passed to
        /// <c>WordTokenizer</c>. The part of an entry before the first '#' serves as the
        /// stem for joined forms.
        /// </param>
        /// <param name="joinedVerbParts">
        /// When <c>true</c>, forms that join a stem with the tokenizer's <c>AfterVerbs</c>
        /// and <c>BeforeVerbs</c> entries are added to the verb table too.
        /// </param>
        public Lemmatizer(string wordsFile, string verbsFile, bool joinedVerbParts)
        {
            this._words = new HashSet<string>();
            foreach (var line in File.ReadLines(wordsFile))
                this._words.Add(line.Trim());

            WordTokenizer tokenizer = new WordTokenizer(verbsFile);

            // Reverse file order combined with the "skip existing key" guard below means
            // that, for a form produced by several entries, the entry later in the file wins.
            var pureVerbs = new List<string>(File.ReadAllLines(verbsFile).Reverse());

            this._verbs = new Hashtable();
            this._verbs.Add("است", "#است");
            foreach (var verb in pureVerbs)
            {
                foreach (var tense in Conjugations(verb))
                {
                    if (!this._verbs.ContainsKey(tense))
                        this._verbs.Add(tense, verb);
                }
            }

            if (joinedVerbParts)
            {
                foreach (var verb in pureVerbs)
                {
                    string stemPart = verb.Split('#')[0];

                    // Hashtable.Add throws ArgumentException when the key is already
                    // present, so joined forms are guarded the same way as conjugations.
                    // Otherwise two entries with an identical stem would make the
                    // constructor fail.
                    foreach (var afterVerb in tokenizer.AfterVerbs)
                    {
                        var joined = stemPart + "ه " + afterVerb;
                        if (!this._verbs.ContainsKey(joined))
                            this._verbs.Add(joined, verb);
                    }

                    foreach (var beforeVerb in tokenizer.BeforeVerbs)
                    {
                        var joined = beforeVerb + " " + stemPart;
                        if (!this._verbs.ContainsKey(joined))
                            this._verbs.Add(joined, verb);
                    }
                }
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Looks up the lemma for <paramref name="word"/>; returns the word unchanged when
        /// nothing matches.
        /// </summary>
        /// <param name="word">The token to lemmatize.</param>
        /// <param name="pos">
        /// Part-of-speech tag. Only a null, empty or "V" tag allows the verb table to be used.
        /// </param>
        /// <returns>
        /// In order of precedence: the verb-table entry (when the tag permits and the word is
        /// a key), the word itself if it is a known word, the word's stem if that is a known
        /// word, and finally the word as given.
        /// </returns>
        public string Lemmatize(string word, string pos)
        {
            // string.IsNullOrEmpty keeps a null tag from throwing NullReferenceException;
            // null is handled exactly like the empty tag.
            if ((string.IsNullOrEmpty(pos) || pos.Equals("V")) && this._verbs.ContainsKey(word))
                return this._verbs[word].ToString();

            if (this._words.Contains(word))
                return word;

            var stem = new Stemmer().Stem(word);
            if (this._words.Contains(stem))
                return stem;

            return word;
        }