예제 #1
0
        /// <summary>
        /// Checks that <c>Stemmer.Stem</c> maps each sample word to its expected stem.
        /// </summary>
        public void StemTest()
        {
            Stemmer stemmer = new Stemmer();

            // Each row holds { word to stem, expected stem }; rows are verified in order
            // with a single shared stemmer instance.
            string[][] cases =
            {
                new[]
                {
                    "کتابی",
                    "کتاب"
                },
                new[]
                {
                    "کتاب‌ها",
                    "کتاب"
                },
                new[]
                {
                    "کتاب‌هایی",
                    "کتاب"
                },
                new[]
                {
                    "کتابهایشان",
                    "کتاب"
                },
                new[]
                {
                    "اندیشه‌اش",
                    "اندیشه"
                }
            };

            foreach (string[] row in cases)
            {
                string input = row[0];
                string expected = row[1];
                string actual = stemmer.Stem(input);

                Assert.AreEqual(expected, actual, "Failed to stem of '" + input + "'");
            }
        }
예제 #2
0
        /// <summary>
        /// Stems a fixed list of sample words and asserts that each result equals the expected stem.
        /// </summary>
        public void StemTest()
        {
            Stemmer stemmer = new Stemmer();

            // Parallel arrays: expectedStems[i] is the stem expected for inputs[i].
            string[] inputs =
            {
                "کتابی",
                "کتاب‌ها",
                "کتاب‌هایی",
                "کتابهایشان",
                "اندیشه‌اش"
            };
            string[] expectedStems =
            {
                "کتاب",
                "کتاب",
                "کتاب",
                "کتاب",
                "اندیشه"
            };

            for (int i = 0; i < inputs.Length; i++)
            {
                string input = inputs[i];
                string expected = expectedStems[i];
                string actual = stemmer.Stem(input);

                Assert.AreEqual(expected, actual, "Failed to stem of '" + input + "'");
            }
        }
        /// <summary>
        /// Advances the wrapped input by one token and, unless the term is listed in
        /// <c>Exclusions</c>, replaces the term with its stem when the stem differs.
        /// </summary>
        /// <returns><c>false</c> when the input has no further token; otherwise <c>true</c>.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            var original = TermAttribute.Term;

            // A missing exclusion set means nothing is excluded.
            bool isExcluded = Exclusions?.Contains(original) ?? false;
            if (isExcluded)
            {
                return true;
            }

            Stemmer.SetCurrent(original);
            Stemmer.Stem();
            var stemmed = Stemmer.GetCurrent();

            // Only rewrite the term buffer when stemming actually produced something different.
            if (stemmed == null || stemmed.Equals(original))
            {
                return true;
            }

            TermAttribute.SetTermBuffer(stemmed);
            return true;
        }
        /// <summary>
        /// Lower-cases <paramref name="target"/>, feeds it character by character into a new
        /// <c>Stemmer</c> and returns the stemmed text.
        /// </summary>
        /// <param name="target">The word to stem; must not be null or empty.</param>
        /// <returns>
        /// The lower-cased input when the stemmer's result has the same length as the input,
        /// otherwise the stemmer's string result.
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="target"/> is null or empty.</exception>
        public string StemString(string target)
        {
            if (string.IsNullOrEmpty(target))
            {
                // Two-argument form so that ParamName is populated; the one-argument form
                // would treat the parameter name as the exception message.
                throw new ArgumentException("Value cannot be null or empty.", nameof(target));
            }

            // NOTE(review): ToLower() is culture-sensitive; confirm whether ToLowerInvariant() is intended.
            target = target.ToLower();

            var stemmer = new Stemmer();

            foreach (var character in target)
            {
                stemmer.Add(character);
            }

            stemmer.Stem();

            // Fixed: the original compared with '!=', which returned the unstemmed input
            // exactly when stemming had shortened the word.
            if (stemmer.GetResultLength() == target.Length)
            {
                // The stemming process removed nothing
                return target;
            }

            return stemmer.ToString();
        }
예제 #5
0
파일: Tests.cs 프로젝트: mkubasz/other
 /// <summary>
 /// Checks the words and the scores that <c>Stemmer.Stem</c> yields for the input "mat".
 /// </summary>
 public void Stemmer_ok()
 {
     // Words, in the order they are expected to be produced.
     var expectedWords = new List <string> {
         "m", "ma", "mat", "at", "t"
     };
     var actualWords = Stemmer.Stem("mat").Select(s => s.Word).ToList();
     CollectionAssert.AreEqual(expectedWords, actualWords);

     // Scores, position by position matching the words above.
     var expectedScores = new List <double> {
         1 / 3.0, 2 / 3.0, 3 / 3.0, 2 / 3.0, 1 / 3.0
     };
     var actualScores = Stemmer.Stem("mat").Select(s => s.Score).ToList();
     CollectionAssert.AreEqual(expectedScores, actualScores);
 }
예제 #6
0
        /// <summary>
        /// Rebuilds <c>Inlines</c> from <c>InputText</c>, rendering in bold every input word whose
        /// stem appears among the stemmed words of <c>SearchText</c>.
        /// Does nothing when either text yields no words.
        /// </summary>
        private void GenerateInlines()
        {
            string[] stemmedSearchTerms = SearchIndex.StemWords(SearchIndex.GetWords(this.SearchText));
            if (stemmedSearchTerms.Length == 0)
            {
                return;
            }

            string[] candidates = SearchIndex.GetWords(this.InputText);
            if (candidates.Length == 0)
            {
                return;
            }

            // Collect the input words (unstemmed) whose stem matches one of the search stems.
            Stemmer       stemmer = new Stemmer();
            List <string> toBold  = new List <string>();

            foreach (string candidate in candidates)
            {
                if (!Enumerable.Contains <string>(stemmedSearchTerms, stemmer.Stem(candidate)))
                {
                    continue;
                }

                toBold.Add(candidate);
            }

            string source  = this.InputText;
            Regex  pattern = GetRegexFromWordList(toBold.ToArray());
            int    cursor  = 0;

            if (pattern != null)
            {
                foreach (Match hit in pattern.Matches(source))
                {
                    // Plain text between the previous match (or the start) and this match.
                    int gap = hit.Index - cursor;
                    if (gap > 0)
                    {
                        this.Inlines.Add(new Run(source.Substring(cursor, gap)));
                    }

                    // The matched text itself, emphasised.
                    this.Inlines.Add(new Bold(new Run(hit.Value)));

                    cursor = hit.Index + hit.Length;
                }
            }

            // Whatever follows the last match (or the whole text when nothing matched).
            if (cursor < source.Length)
            {
                this.Inlines.Add(new Run(source.Substring(cursor)));
            }

            Assert.IsTrue(this.Inlines.Count != 0);
        }
예제 #7
0
        /// <summary>
        /// Classifies <paramref name="text"/> as 1 or 0 by summing, over its stemmed words, the
        /// per-word values read from "correlations.txt" and passing the sum through a logistic function.
        /// </summary>
        /// <param name="text">Text to classify; it is split on spaces and commas.</param>
        /// <returns>1 when the logistic of the summed values exceeds 0.5; otherwise 0.</returns>
        public int GetTextForMark(string text)
        {
            Stemmer stemmer = new Stemmer();
            string[] words  = text.Split(new[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);

            for (int i = 0; i < words.Length; i++)
            {
                words[i] = stemmer.Stem(words[i]);
            }

            Dictionary <string, double> correlationTable = new Dictionary <string, double>();

            // Dispose the reader deterministically; the original left the file handle open.
            using (StreamReader reader = new StreamReader("correlations.txt"))
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    // Only lines of the exact form "<word> <number>" are accepted.
                    string[] parts = line.Split(' ');
                    double   correlation;

                    // NOTE(review): Double.TryParse uses the current culture here — confirm the file's number format.
                    if (parts.Length == 2 && parts[0] != "" && Double.TryParse(parts[1], out correlation))
                    {
                        correlationTable.Add(parts[0], correlation);
                    }
                }
            }

            double logOfP = 0;

            foreach (string word in words)
            {
                double weight;
                if (correlationTable.TryGetValue(word, out weight))
                {
                    logOfP += weight;
                }
            }

            // Same quantity as exp(x) / (exp(x) + 1), written so that a large positive sum
            // cannot turn into Infinity / Infinity = NaN (which previously yielded 0).
            double result = 1.0 / (1.0 + Math.Exp(-logOfP));

            return result > 0.5 ? 1 : 0;
        }
        /// <summary>
        /// Stems the words of <paramref name="text"/>, sums the values that "correlations.txt"
        /// associates with them, and maps the sum to a binary mark through a logistic function.
        /// </summary>
        /// <param name="text">Text to evaluate; words are separated by spaces or commas.</param>
        /// <returns>1 if the logistic value is greater than 0.5, else 0.</returns>
        public int GetTextForMark(string text)
        {
            Stemmer stemmer = new Stemmer();
            string[] words = text.Split(new[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 0; i < words.Length; i++)
            {
                words[i] = stemmer.Stem(words[i]);
            }

            Dictionary<string, double> correlationTable = new Dictionary<string, double>();

            // The using block closes the file even if parsing throws; the original never closed it.
            using (StreamReader reader = new StreamReader("correlations.txt"))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] splittedLine = line.Split(' ');
                    if (splittedLine.Length != 2 || splittedLine[0] == "")
                    {
                        // Skip lines that are not exactly "<word> <number>".
                        continue;
                    }

                    // NOTE(review): parsing is culture-sensitive — verify against how the file is written.
                    double correlation;
                    if (Double.TryParse(splittedLine[1], out correlation))
                    {
                        correlationTable.Add(splittedLine[0], correlation);
                    }
                }
            }

            double logOfP = 0;
            foreach (string word in words)
            {
                double correlation;
                if (correlationTable.TryGetValue(word, out correlation))
                {
                    logOfP += correlation;
                }
            }

            // Equivalent to exp(x) / (exp(x) + 1) but safe for large x: the original form
            // evaluated to NaN once Math.Exp overflowed, so strongly positive text returned 0.
            double result = 1.0 / (1.0 + Math.Exp(-logOfP));
            if (result > 0.5)
            {
                return 1;
            }
            return 0;
        }
예제 #9
0
        /// <summary>
        /// Runs the text through the pipeline: normalization, tokenization, token normalization,
        /// stemming, stop-word marking, sentence detection, part-of-speech tagging and entity finding.
        /// </summary>
        /// <param name="text">The raw text to process.</param>
        /// <returns>A document built from the detected sentences, the tokens and the original text.</returns>
        public Document Process(string text)
        {
            var normalizedText = NormalizerManager.Normalize(text);

            // Token-level stages, applied in sequence to the same token set.
            var tokens = Tokenizer.Tokenize(normalizedText, TokenizerLanguage);
            tokens = NormalizerManager.Normalize(tokens);
            tokens = Stemmer.Stem(tokens, StemmerLanguage);
            tokens = StopWordsManager.MarkStopWords(tokens, StopWordsLanguage);

            // Tag each detected sentence's tokens with parts of speech.
            var sentences = SentenceDetector.Detect(tokens, SentenceDetectorLanguage);
            for (int i = 0; i < sentences.Length; i++)
            {
                var current = sentences[i];
                current.Tokens = POSTagger.Tag(current.Tokens, POSTaggerLanguage);
            }

            // Entity finding runs on the full token set; sentences are then detected again from its result.
            tokens    = EntityFinder.Find(tokens, EntityFinderType);
            sentences = SentenceDetector.Detect(tokens, SentenceDetectorLanguage);

            return new Document(sentences, tokens, text, FeatureExtractor, TextSummarizer, Tokenizer, TokenizerLanguage);
        }
        /// <summary>
        /// Loads an HTML file of product reviews, extracts grades and the advantages /
        /// disadvantages / comments texts, stems every word and writes one XML file per
        /// review to <c>c:\xml\</c>.
        /// </summary>
        /// <param name="fileName">Path of the HTML file to read (loaded as UTF-8).</param>
        private static void GenerateXmlFromFile(string fileName)
        {
            Stemmer stemmer = new Stemmer();
            var htmlDoc = new HtmlDocument();
            htmlDoc.Load(fileName, Encoding.UTF8);
            var rootNode = htmlDoc.DocumentNode;

            // Without any grade labels nothing is generated.
            var marksText = rootNode.SelectNodes("//span[@class='grade-label']");
            if (marksText == null)
            {
                return;
            }

            // Map the textual grade labels to numeric marks; unknown labels are skipped.
            List<int> grades = new List<int>();
            foreach (var mark in marksText)
            {
                switch (mark.InnerText)
                {
                    case "отличная модель":
                        grades.Add(5); break;
                    case "хорошая модель":
                        grades.Add(4); break;
                    case "обычная модель":
                        grades.Add(3); break;
                    case "плохая модель":
                        grades.Add(2); break;
                    case "ужасная модель":
                        grades.Add(1); break;
                    default: break;
                }
            }

            // Collect the review texts.
            List<string> advantages = new List<string>();
            List<string> disadvantages = new List<string>();
            List<string> comments = new List<string>();
            var texts = rootNode.SelectNodes("//div[@class='data']");

            // The node lookup is null-checked here, as the grades lookup already was; the original
            // dereferenced it unconditionally.
            if (texts != null)
            {
                foreach (var text in texts)
                {
                    // When the third child is a <div>, the text children start one position later.
                    // NOTE(review): assumes every data block has enough children for these indexes — confirm.
                    int first = text.ChildNodes[2].Name == "div" ? 3 : 2;

                    // Advantages
                    advantages.Add(text.ChildNodes[first].InnerText);

                    // Disadvantages
                    // NOTE(review): the exact-count checks mean a block that has a comment never
                    // contributes a disadvantage, so the three lists can drift out of alignment — confirm intent.
                    if (text.ChildNodes.Count == first + 2)
                    {
                        disadvantages.Add(text.ChildNodes[first + 1].InnerText);
                    }

                    // Comment
                    if (text.ChildNodes.Count == first + 3)
                    {
                        comments.Add(text.ChildNodes[first + 2].InnerText);
                    }
                }
            }

            char[] delimiterChars = { ' ', ',', '.', ':', '\t' };

            // Generate one XML document per collected review.
            for (int i = 0; i < advantages.Count; i++)
            {
                var xml = new XmlDocument();
                var xmlNode = xml.CreateNode(XmlNodeType.XmlDeclaration, "", "");
                xml.AppendChild(xmlNode);
                var xmlElem = xml.CreateElement("", "review", "");
                xml.AppendChild(xmlElem);

                AppendStemmedElement(xml, stemmer, "advantages", advantages[i], delimiterChars);

                if (disadvantages.Count > i)
                {
                    // Fixed: the original stemmed the already-stemmed, space-joined string a second
                    // time for this element only.
                    AppendStemmedElement(xml, stemmer, "disadvantages", disadvantages[i], delimiterChars);
                }

                if (comments.Count > i)
                {
                    AppendStemmedElement(xml, stemmer, "comments", comments[i], delimiterChars);
                }

                if (grades.Count > i)
                {
                    var xmlGrade = xml.CreateElement("", "grade", "");
                    var xmlGradeText = xml.CreateTextNode(grades[i].ToString());
                    xmlGrade.AppendChild(xmlGradeText);
                    xml.LastChild.AppendChild(xmlGrade);
                }

                // Output file name: xml_<source file name><index>.xml
                string path = String.Concat("c:\\xml\\", "xml_", Path.GetFileName(fileName), i.ToString(), ".xml");
                xml.Save(path);
            }
        }

        /// <summary>
        /// Splits <paramref name="rawText"/> on the delimiters, stems each piece, joins the stems with a
        /// leading space each, and appends the result as a text element under the document's last child.
        /// </summary>
        private static void AppendStemmedElement(XmlDocument xml, Stemmer stemmer, string elementName, string rawText, char[] delimiters)
        {
            string stemmedText = String.Empty;
            foreach (var word in rawText.Split(delimiters))
            {
                stemmedText = String.Concat(stemmedText, " ", stemmer.Stem(word));
            }

            var element = xml.CreateElement("", elementName, "");
            element.AppendChild(xml.CreateTextNode(stemmedText));
            xml.LastChild.AppendChild(element);
        }
 /// <summary>
 /// Stems <paramref name="tokenKey"/>, increases the frequency of the feature named
 /// "stem_" followed by the stem in <paramref name="item"/> by 1, and returns the stem.
 /// </summary>
 /// <param name="stemmer">Stemmer used to reduce the token.</param>
 /// <param name="item">Feature table that receives the frequency update.</param>
 /// <param name="tokenKey">The token to stem.</param>
 /// <returns>The stemmed token.</returns>
 public static string ExtractStemFeatureFromSingleTokenAndUpdateItemFeatures(Stemmer stemmer, Dictionary<string, double> item, string tokenKey)
 {
     string stem = stemmer.Stem(tokenKey);
     string featureName = "stem_" + stem;

     item.IncreaseFeatureFrequency(featureName, 1);

     return stem;
 }