Example #1
 /// <summary>
 /// Parses raw sentences, yielding one dependency graph per sentence.
 /// </summary>
 public IEnumerable<ConcurrentDependencyGraph> RawParse(List<string> sentences)
 {
     foreach (var sentence in sentences)
     {
         // tokenize and POS-tag the sentence, then parse the tagged words
         var words = WordTokenizer.Tokenize(sentence);
         yield return RawParse(Tagger.BatchTag(words));
     }
     }
 }
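The iterator above tokenizes each raw sentence, POS-tags the tokens, and hands them to an overload of RawParse that builds the dependency graph, so graphs are produced lazily, one per sentence. A minimal caller could look like the sketch below; the enclosing DependencyParser type and its constructor arguments are assumptions, since the example only shows the method itself.
 // Hypothetical usage; "DependencyParser" and its constructor arguments are assumed, not shown above.
 var parser = new DependencyParser(/* tagger / parser resources as required */);
 var sentences = new List<string> { "او به خانه رفت", "هوا سرد است" };
 foreach (ConcurrentDependencyGraph graph in parser.RawParse(sentences))
 {
     // one dependency graph per input sentence, produced lazily by the iterator
     Console.WriteLine(graph);
 }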
Example #2
 public BijankhanReader(string bijankhanFile, bool joinedVerbParts, string posMap)
 {
     this._bijankhanFile   = bijankhanFile;
     this._joinedVerbParts = joinedVerbParts;
     this._posMap          = posMap;
     this.normalizer       = new Normalizer(true, false, true);
     this.tokenizer        = new WordTokenizer();
 }
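This constructor only stores the corpus path, the verb-joining flag, and the POS map, and sets up the Normalizer and WordTokenizer used while reading the corpus, so creating a reader is a single call, as in the sketch below. The file name and posMap value are placeholders, and the methods that actually iterate the corpus are not part of this example.
 // Placeholder arguments; the corpus-iteration methods are not shown in the example above.
 var reader = new BijankhanReader("bijankhan.txt", joinedVerbParts: true, posMap: "default");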
Example #3
        public Lemmatizer(string wordsFile, string verbsFile, bool joinedVerbParts)
        {
            Stemmer stemmer = new Stemmer();

            this._words = new HashSet<string>();
            foreach (var line in File.ReadLines(wordsFile))
            {
                this._words.Add(line.Trim());
            }

            WordTokenizer tokenizer = new WordTokenizer(verbsFile);

            var pureVerbs = new List<string>(File.ReadAllLines(verbsFile).Reverse());

            // Map every conjugated form of every verb back to its base entry from verbsFile.
            this._verbs = new Hashtable();
            this._verbs.Add("است", "#است");
            pureVerbs.ForEach(verb =>
            {
                Conjugations(verb).ForEach(tense =>
                {
                    if (!this._verbs.ContainsKey(tense))
                    {
                        this._verbs.Add(tense, verb);
                    }
                });
            });

            // Optionally register joined/compound verb forms built with the tokenizer's before/after-verb lists.
            if (joinedVerbParts)
            {
                pureVerbs.ForEach(verb =>
                {
                    var bon = verb.Split('#')[0];
                    tokenizer.AfterVerbs.ToList().ForEach(afterVerb =>
                    {
                        this._verbs.Add(bon + "ه " + afterVerb, verb);
                        this._verbs.Add("ن" + bon + "ه " + afterVerb, verb);
                    });
                    tokenizer.BeforeVerbs.ToList().ForEach(beforeVerb =>
                    {
                        this._verbs.Add(beforeVerb + " " + bon, verb);
                    });
                });
            }
        }
Example #4
        public Lemmatizer(string wordsFile, string verbsFile, bool joinedVerbParts)
        {
            Stemmer stemmer = new Stemmer();

            this._words = new HashSet<string>();
            foreach (var line in File.ReadLines(wordsFile))
                this._words.Add(line.Trim());

            WordTokenizer tokenizer = new WordTokenizer(verbsFile);

            var pureVerbs = new List<string>(File.ReadAllLines(verbsFile).Reverse());

            this._verbs = new Hashtable();
            this._verbs.Add("است", "#است");
            pureVerbs.ForEach(verb =>
            {
                Conjugations(verb).ForEach(tense =>
                {
                    if (!this._verbs.ContainsKey(tense))
                        this._verbs.Add(tense, verb);
                });
            });

            if (joinedVerbParts)
            {
                pureVerbs.ForEach(verb =>
                {
                    string[] parts = verb.Split('#');
                    tokenizer.AfterVerbs.ToList().ForEach(afterVerb =>
                    {
                        this._verbs.Add(parts[0] + "ه " + afterVerb, verb);
                    });
                    tokenizer.BeforeVerbs.ToList().ForEach(beforeVerb =>
                    {
                        this._verbs.Add(beforeVerb + " " + parts[0], verb);
                    });
                });
            }
        }
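Both Lemmatizer constructors shown here build the same lookup tables: _words from the word list, and _verbs mapping every conjugated form (plus, when joinedVerbParts is set, compound forms assembled from the tokenizer's BeforeVerbs and AfterVerbs lists) back to its base verb. The sketch below shows how such a lemmatizer might be used; the file names are placeholders and the Lemmatize call is an assumption, since the examples end at the constructor.
        // Placeholder resource files; Lemmatize itself is assumed and not part of the examples above.
        var lemmatizer = new Lemmatizer("words.dat", "verbs.dat", joinedVerbParts: true);
        string lemma = lemmatizer.Lemmatize("می‌روم"); // expected to map a conjugated form back to its base-verb entry
        Console.WriteLine(lemma);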