// Checks that Stemmer.Stem reduces each sample word to its expected stem.
public void StemTest()
{
    Stemmer stemmer = new Stemmer();

    // Each entry is { word to stem, expected stem }; entries are checked in order.
    string[][] cases =
    {
        new[] { "کتابی", "کتاب" },
        new[] { "کتابها", "کتاب" },
        new[] { "کتابهایی", "کتاب" },
        new[] { "کتابهایشان", "کتاب" },
        new[] { "اندیشهاش", "اندیشه" },
    };

    foreach (string[] pair in cases)
    {
        string input = pair[0];
        string expected = pair[1];
        string actual = stemmer.Stem(input);

        Assert.AreEqual(expected, actual, "Failed to stem of '" + input + "'");
    }
}
/// <summary>
/// Advances the wrapped input by one token and replaces the term with its stem
/// unless the term is listed in <c>Exclusions</c>.
/// </summary>
/// <returns><c>false</c> when the wrapped input has no more tokens; otherwise <c>true</c>.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    var term = TermAttribute.Term;

    // A missing exclusion set is treated the same as "not excluded".
    var isExcluded = Exclusions?.Contains(term) ?? false;
    if (isExcluded)
    {
        return true;
    }

    Stemmer.SetCurrent(term);
    Stemmer.Stem();
    var stemmed = Stemmer.GetCurrent();

    // Leave the term buffer untouched when there is no result or nothing changed.
    if (stemmed == null || stemmed.Equals(term))
    {
        return true;
    }

    TermAttribute.SetTermBuffer(stemmed);
    return true;
}
/// <summary>
/// Lower-cases <paramref name="target"/>, feeds it character by character to a
/// <c>Stemmer</c> and returns the stemmed form.
/// </summary>
/// <param name="target">The word to stem; must not be null or empty.</param>
/// <returns>
/// The lower-cased input when the stemmer's result has the same length as the input,
/// otherwise the stemmer's string result.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="target"/> is null or empty.</exception>
public string StemString(string target)
{
    if (string.IsNullOrEmpty(target))
    {
        // ArgumentException's single-string constructor takes a message, not a parameter name,
        // so pass both to get a meaningful message and a populated ParamName.
        throw new ArgumentException("Value cannot be null or empty.", nameof(target));
    }

    // Invariant casing keeps the result independent of the current thread culture.
    target = target.ToLowerInvariant();

    var stemmer = new Stemmer();
    foreach (var character in target)
    {
        stemmer.Add(character);
    }

    stemmer.Stem();

    // The previous check used "!=", which returned the unstemmed input exactly when the
    // stemmer HAD shortened the word, contradicting the intent stated in the comment below.
    if (stemmer.GetResultLength() == target.Length)
    {
        // The stemming process removed nothing
        return target;
    }

    return stemmer.ToString();
}
// Checks the words and the scores that Stemmer.Stem produces for the input "mat".
public void Stemmer_ok()
{
    var expectedWords = new List<string> { "m", "ma", "mat", "at", "t" };
    var actualWords = Stemmer.Stem("mat").Select(stem => stem.Word).ToList();
    CollectionAssert.AreEqual(expectedWords, actualWords);

    // Scores are compared exactly, so the expected values are built from the same divisions.
    var expectedScores = new List<double> { 1.0 / 3, 2.0 / 3, 3.0 / 3, 2.0 / 3, 1.0 / 3 };
    var actualScores = Stemmer.Stem("mat").Select(stem => stem.Score).ToList();
    CollectionAssert.AreEqual(expectedScores, actualScores);
}
/// <summary>
/// Appends runs to <c>Inlines</c> for <c>InputText</c>, wrapping in bold every match of the
/// input words whose stem appears among the stemmed words of <c>SearchText</c>.
/// Does nothing when either text yields no words.
/// </summary>
private void GenerateInlines()
{
    string[] stemmedQuery = SearchIndex.StemWords(SearchIndex.GetWords(this.SearchText));
    if (stemmedQuery.Length == 0)
    {
        return;
    }

    string[] candidates = SearchIndex.GetWords(this.InputText);
    if (candidates.Length == 0)
    {
        return;
    }

    // Keep the input words (in their original form) whose stem is one of the search stems.
    Stemmer stemmer = new Stemmer();
    string[] wordsToHighlight = candidates
        .Where(candidate => stemmedQuery.Contains(stemmer.Stem(candidate)))
        .ToArray();

    string content = this.InputText;
    Regex pattern = GetRegexFromWordList(wordsToHighlight);
    int cursor = 0;

    if (pattern != null)
    {
        foreach (Match hit in pattern.Matches(content))
        {
            // Plain text between the previous match and this one.
            if (hit.Index > cursor)
            {
                this.Inlines.Add(new Run(content.Substring(cursor, hit.Index - cursor)));
            }

            this.Inlines.Add(new Bold(new Run(hit.Value)));
            cursor = hit.Index + hit.Length;
        }
    }

    // Trailing plain text after the last match (or the whole text when nothing matched).
    if (cursor < content.Length)
    {
        this.Inlines.Add(new Run(content.Substring(cursor)));
    }

    Assert.IsTrue(this.Inlines.Count != 0);
}
/// <summary>
/// Classifies <paramref name="text"/> as 1 or 0 by stemming its words, summing the values
/// listed for those stems in "correlations.txt", and testing whether
/// exp(sum) / (exp(sum) + 1) exceeds 0.5.
/// </summary>
/// <param name="text">Text whose words are separated by spaces and/or commas.</param>
/// <returns>1 when the computed fraction is greater than 0.5; otherwise 0.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public int GetTextForMark(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    Stemmer stemmer = new Stemmer();
    string[] words = text.Split(new[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < words.Length; i++)
    {
        words[i] = stemmer.Stem(words[i]);
    }

    // Each usable line of the file is "<stem> <number>"; anything else is ignored.
    Dictionary<string, double> correlationTable = new Dictionary<string, double>();
    using (StreamReader reader = new StreamReader("correlations.txt"))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string[] parts = line.Split(' ');
            double correlation;
            if (parts.Length == 2 && parts[0] != "" && Double.TryParse(parts[1], out correlation))
            {
                correlationTable.Add(parts[0], correlation);
            }
        }
    }

    double logOfP = 0;
    foreach (string word in words)
    {
        double correlation;
        if (correlationTable.TryGetValue(word, out correlation))
        {
            logOfP += correlation;
        }
    }

    double pFraction = Math.Exp(logOfP);

    // Math.Exp overflows to +Infinity for large sums; Infinity / (Infinity + 1) is NaN,
    // which would compare as "not greater than 0.5" and wrongly yield 0.
    if (double.IsPositiveInfinity(pFraction))
    {
        return 1;
    }

    double result = pFraction / (pFraction + 1);
    return result > 0.5 ? 1 : 0;
}
/// <summary>
/// Classifies <paramref name="text"/> as 1 or 0 by stemming its words, summing the values
/// listed for those stems in "correlations.txt", and testing whether
/// exp(sum) / (exp(sum) + 1) exceeds 0.5.
/// </summary>
/// <param name="text">Text whose words are separated by spaces and/or commas.</param>
/// <returns>1 when the computed fraction is greater than 0.5; otherwise 0.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public int GetTextForMark(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    Stemmer stemmer = new Stemmer();
    string[] words = text.Split(new[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < words.Length; i++)
    {
        words[i] = stemmer.Stem(words[i]);
    }

    // Each usable line of the file is "<stem> <number>"; anything else is ignored.
    Dictionary<string, double> correlationTable = new Dictionary<string, double>();
    using (StreamReader reader = new StreamReader("correlations.txt"))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string[] parts = line.Split(' ');
            double correlation;
            if (parts.Length == 2 && parts[0] != "" && Double.TryParse(parts[1], out correlation))
            {
                correlationTable.Add(parts[0], correlation);
            }
        }
    }

    double logOfP = 0;
    foreach (string word in words)
    {
        double correlation;
        if (correlationTable.TryGetValue(word, out correlation))
        {
            logOfP += correlation;
        }
    }

    double pFraction = Math.Exp(logOfP);

    // Math.Exp overflows to +Infinity for large sums; Infinity / (Infinity + 1) is NaN,
    // which would compare as "not greater than 0.5" and wrongly yield 0.
    if (double.IsPositiveInfinity(pFraction))
    {
        return 1;
    }

    double result = pFraction / (pFraction + 1);
    return result > 0.5 ? 1 : 0;
}
/// <summary>
/// Runs the text through normalization, tokenization, stemming, stop-word marking,
/// sentence detection, part-of-speech tagging and entity finding, then wraps the
/// outcome in a document.
/// </summary>
/// <param name="text">The raw text to process.</param>
/// <returns>The document built from the detected sentences and the processed tokens.</returns>
public Document Process(string text)
{
    var normalizedText = NormalizerManager.Normalize(text);

    var tokens = Tokenizer.Tokenize(normalizedText, TokenizerLanguage);
    tokens = NormalizerManager.Normalize(tokens);
    tokens = Stemmer.Stem(tokens, StemmerLanguage);
    tokens = StopWordsManager.MarkStopWords(tokens, StopWordsLanguage);

    // First sentence pass: tag parts of speech sentence by sentence.
    var sentences = SentenceDetector.Detect(tokens, SentenceDetectorLanguage);
    foreach (var sentence in sentences)
    {
        sentence.Tokens = POSTagger.Tag(sentence.Tokens, POSTaggerLanguage);
    }

    // Sentences are detected again after the entity finder has processed the tokens.
    tokens = EntityFinder.Find(tokens, EntityFinderType);
    sentences = SentenceDetector.Detect(tokens, SentenceDetectorLanguage);

    return new Document(sentences, tokens, text, FeatureExtractor, TextSummarizer, Tokenizer, TokenizerLanguage);
}
/// <summary>
/// Loads an HTML review page, extracts grades and the advantages / disadvantages / comments
/// texts, stems every word of those texts and saves one XML file per review under "c:\xml\".
/// Nothing is written when the page contains no grade labels.
/// </summary>
/// <param name="fileName">Path of the HTML file to load (read as UTF-8).</param>
private static void GenerateXmlFromFile(string fileName)
{
    Stemmer stemmer = new Stemmer();
    var htmlDoc = new HtmlDocument();
    htmlDoc.Load(fileName, Encoding.UTF8);
    var rootNode = htmlDoc.DocumentNode;

    var marksText = rootNode.SelectNodes("//span[@class='grade-label']");
    if (marksText == null)
    {
        return;
    }

    // Get marks list: map each grade label to its numeric grade; unknown labels are skipped.
    List<int> grades = new List<int>();
    foreach (var mark in marksText)
    {
        switch (mark.InnerText)
        {
            case "отличная модель":
                grades.Add(5);
                break;
            case "хорошая модель":
                grades.Add(4);
                break;
            case "обычная модель":
                grades.Add(3);
                break;
            case "плохая модель":
                grades.Add(2);
                break;
            case "ужасная модель":
                grades.Add(1);
                break;
            default:
                break;
        }
    }

    // Get texts for marks
    List<string> advantages = new List<string>();
    List<string> disadvantages = new List<string>();
    List<string> comments = new List<string>();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var texts = rootNode.SelectNodes("//div[@class='data']");
    if (texts != null)
    {
        foreach (var text in texts)
        {
            // When the third child is a <div>, the review sections start one position later.
            int first = text.ChildNodes[2].Name == "div" ? 3 : 2;

            // Advantages
            advantages.Add(text.ChildNodes[first].InnerText);

            // NOTE(review): the exact-count checks mean a block that has a comment never
            // contributes a disadvantage, so the three lists can drift out of step with
            // each other. Kept as in the original; confirm against the page layout.
            // Disadvantages
            if (text.ChildNodes.Count == first + 2)
            {
                disadvantages.Add(text.ChildNodes[first + 1].InnerText);
            }

            // Comment
            if (text.ChildNodes.Count == first + 3)
            {
                comments.Add(text.ChildNodes[first + 2].InnerText);
            }
        }
    }

    // Generating XML: one document per collected advantages entry.
    for (int i = 0; i < advantages.Count; i++)
    {
        var xml = new XmlDocument();
        xml.AppendChild(xml.CreateNode(XmlNodeType.XmlDeclaration, "", ""));
        var review = xml.CreateElement("", "review", "");
        xml.AppendChild(review);

        AppendTextElement(xml, review, "advantages", StemDelimitedWords(stemmer, advantages[i]));

        if (disadvantages.Count > i)
        {
            // The joined text is written as-is; it used to be passed through the stemmer a
            // second time as one string, unlike the advantages and comments sections.
            AppendTextElement(xml, review, "disadvantages", StemDelimitedWords(stemmer, disadvantages[i]));
        }

        if (comments.Count > i)
        {
            AppendTextElement(xml, review, "comments", StemDelimitedWords(stemmer, comments[i]));
        }

        if (grades.Count > i)
        {
            AppendTextElement(xml, review, "grade", grades[i].ToString());
        }

        // generate path!
        string path = String.Concat("c:\\xml\\", "xml_", Path.GetFileName(fileName), i.ToString(), ".xml");
        xml.Save(path);
    }
}

// Splits text on the review delimiters, stems each piece and joins them, each preceded by one space.
private static string StemDelimitedWords(Stemmer stemmer, string text)
{
    char[] delimiterChars = { ' ', ',', '.', ':', '\t' };
    var builder = new StringBuilder();
    foreach (var word in text.Split(delimiterChars))
    {
        builder.Append(' ').Append(stemmer.Stem(word));
    }
    return builder.ToString();
}

// Appends <elementName>text</elementName> as the last child of parent.
private static void AppendTextElement(XmlDocument xml, XmlNode parent, string elementName, string text)
{
    var element = xml.CreateElement("", elementName, "");
    element.AppendChild(xml.CreateTextNode(text));
    parent.AppendChild(element);
}
/// <summary>
/// Stems <paramref name="tokenKey"/> and increases by 1 the frequency of the
/// "stem_"-prefixed feature for that stem in <paramref name="item"/>.
/// </summary>
/// <param name="stemmer">Stemmer used to reduce the token.</param>
/// <param name="item">Feature map that receives the frequency update.</param>
/// <param name="tokenKey">Token to stem.</param>
/// <returns>The stemmed token.</returns>
public static string ExtractStemFeatureFromSingleTokenAndUpdateItemFeatures(Stemmer stemmer, Dictionary<string, double> item, string tokenKey)
{
    string stem = stemmer.Stem(tokenKey);
    string featureName = "stem_" + stem;

    item.IncreaseFeatureFrequency(featureName, 1);

    return stem;
}