/// <summary>
/// Chooses one representative ("parent") word for every stem found in <paramref name="allWords"/>.
/// </summary>
/// <remarks>
/// Words are processed in order while a running occurrence count is kept per exact word.
/// The first word seen for a stem becomes its parent; a later word replaces it only when
/// that word's running count is strictly greater than the current parent's running count.
/// </remarks>
/// <param name="allWords">Words to stem, in document order.</param>
/// <returns>Dictionary mapping each stem to its chosen parent word.</returns>
private Dictionary<string, string> GetStemParentDictionary(string[] allWords)
{
    var occurrences = new Dictionary<string, int>();
    var parentByStem = new Dictionary<string, string>();

    foreach (string word in allWords)
    {
        string stem = Stemmer.GetStem(word);

        // Bump the running count for this exact word (a missing entry counts as zero).
        int seenSoFar;
        occurrences.TryGetValue(word, out seenSoFar);
        int currentCount = seenSoFar + 1;
        occurrences[word] = currentCount;

        // Claim the stem when it has no parent yet, or when this word has now been
        // seen more often than the word currently representing the stem.
        string incumbent;
        bool hasParent = parentByStem.TryGetValue(stem, out incumbent);
        if (!hasParent || currentCount > occurrences[incumbent])
        {
            parentByStem[stem] = word;
        }
    }

    return parentByStem;
}
/// <summary>
/// Checks that stemming "caresses" yields "caress".
/// </summary>
public void Verify_Basic()
{
    var input = "caresses";

    var stemmed = Stemmer.GetStem(input);

    // Expected value goes first, actual second, so a failing run reports them the right way round.
    Assert.AreEqual("caress", stemmed);
}
/// <summary>
/// Checks that <c>Stemmer.GetStem</c> reduces the word "caresses" to "caress".
/// </summary>
public void TestStemming()
{
    string word = "caresses";

    string stemmed = Stemmer.GetStem(word);

    // Assert.AreEqual takes (expected, actual); the arguments were previously reversed,
    // which would have produced a misleading failure message.
    Assert.AreEqual("caress", stemmed);
}
/// <summary>
/// Counts stemmed word occurrences across the subtree text of every top-level child and
/// returns the <paramref name="N"/> most frequent entries.
/// </summary>
/// <remarks>
/// Words whose lower-cased stem is in <c>stopWords</c>, and stems listed in the local
/// ignore list, are not counted. A word that fails to process is logged to the console
/// and skipped so that one bad token does not abort the whole count.
/// </remarks>
/// <param name="N">Maximum number of entries to return.</param>
/// <returns>
/// Dictionary keyed by the longest surface word seen for each stem (first one wins on a
/// length tie), with that stem's occurrence count as the value. Entries are added in
/// descending count order.
/// </returns>
public Dictionary<string, int> GetTopNWordsDictionary(int N)
{
    string[] ignoreWords = { "*" };

    StringBuilder sbFullText = new StringBuilder();
    foreach (children child in this.children)
    {
        sbFullText.Append(child.SubtreeText);
        sbFullText.Append(" ");
    }

    string[] allWords = GetAllWords(sbFullText.ToString());
    Dictionary<string, int> wordCount = new Dictionary<string, int>();
    Dictionary<string, string> stemParent = new Dictionary<string, string>();

    foreach (string word in allWords)
    {
        try
        {
            string stemmed = Stemmer.GetStem(word);

            // Remember the longest surface form of each stem; it becomes the display key.
            string currentParent;
            if (!stemParent.TryGetValue(stemmed, out currentParent) || currentParent.Length < word.Length)
            {
                stemParent[stemmed] = word;
            }

            if (stopWords.Contains(stemmed.ToLower()))
            {
                continue;
            }

            // Skip ignored tokens explicitly. Previously they fell through to
            // "wordCount[stemmed] += 1" on a missing key, so every ignored token raised
            // a KeyNotFoundException that was caught and logged.
            if (ignoreWords.Contains(stemmed))
            {
                continue;
            }

            int count;
            wordCount.TryGetValue(stemmed, out count);
            wordCount[stemmed] = count + 1;
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.ToString());
        }
    }

    return wordCount
        .OrderByDescending(x => x.Value)
        .Take(N)
        .ToDictionary(kvp => stemParent[kvp.Key], kvp => kvp.Value);
}
/// <summary>
/// Builds a map from word stem to the ids of the nodes, within the subtree rooted at
/// <paramref name="child"/>, whose text contains a word with that stem.
/// </summary>
/// <remarks>
/// Stop words are skipped, as are words shorter than three characters that contain at
/// least one lower-case letter. Descendants are processed recursively and their id sets
/// are merged into the result.
/// </remarks>
/// <param name="child">Root of the subtree to index.</param>
/// <returns>Dictionary from stem to the set of node ids containing it.</returns>
Dictionary<string, HashSet<int>> GetWordIDMapping(children child)
{
    var idsByStem = new Dictionary<string, HashSet<int>>();

    // Index the words of this node's own text.
    foreach (string token in GetAllWords(child.text))
    {
        bool isStopWord = stopWords.Contains(token.ToLower());
        if (isStopWord)
        {
            continue;
        }

        bool isShortLowercase = token.Length < 3 && token.Any(ch => char.IsLower(ch));
        if (isShortLowercase)
        {
            continue;
        }

        string stem = Stemmer.GetStem(token);
        HashSet<int> ownIds;
        if (!idsByStem.TryGetValue(stem, out ownIds))
        {
            ownIds = new HashSet<int>();
            idsByStem[stem] = ownIds;
        }
        ownIds.Add(child.id);
    }

    // Fold in the mapping of every descendant subtree.
    foreach (children descendant in child.Children)
    {
        foreach (var entry in GetWordIDMapping(descendant))
        {
            HashSet<int> knownIds;
            if (idsByStem.TryGetValue(entry.Key, out knownIds))
            {
                knownIds.UnionWith(entry.Value);
            }
            else
            {
                idsByStem[entry.Key] = entry.Value;
            }
        }
    }

    return idsByStem;
}
/// <summary>
/// Sentence-tokenizes every top-level comment, picks the best sentence of each one and
/// returns up to <paramref name="N"/> of those sentences.
/// </summary>
/// <remarks>
/// A sentence scores higher when its non-stop words occur in many nodes of the comment's
/// own subtree; each word's contribution is divided by its <c>CommonWords</c> frequency
/// plus one. The summed score is scaled by a length factor and then averaged over the
/// sentence's word count. Results are keyed by sentence text, so if two comments yield the
/// same best sentence the later comment's data replaces the earlier one's. A comment that
/// fails to process is logged to the console and skipped.
/// </remarks>
/// <param name="N">Maximum number of sentences to return.</param>
/// <returns>
/// The selected sentences, ordered by the child count of the comment each came from
/// (largest first).
/// </returns>
public List<SentenceObj> GetTopSentences(int N)
{
    Dictionary<string, double> sentenceScores = new Dictionary<string, double>();
    Dictionary<string, string> sentenceAuthors = new Dictionary<string, string>();
    Dictionary<string, string> sentenceCommentTrees = new Dictionary<string, string>();
    Dictionary<string, int> sentenceIds = new Dictionary<string, int>();

    foreach (children child in children)
    {
        try
        {
            Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping(child);
            List<string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(child.text));

            // A comment without sentences has nothing to contribute. This used to be
            // handled implicitly by an index exception that the catch block swallowed.
            if (currSentences == null || currSentences.Count == 0)
            {
                continue;
            }

            string bestSentence = currSentences[0];
            double currMax = double.MinValue;

            foreach (string sentence in currSentences)
            {
                string[] allWords = GetAllWords(sentence);

                // Require more than two words overall and more than two non-stop words.
                // The earlier check counted stop words that were absent from the sentence,
                // which is true for virtually every sentence and so filtered nothing.
                bool goodSentence = allWords.Length > 2
                    && allWords.Count(w => !stopWords.Contains(w.ToLower())) > 2;
                if (!goodSentence)
                {
                    continue;
                }

                double weightedScore = 0;
                foreach (string word in allWords)
                {
                    if (stopWords.Contains(word.ToLower()))
                    {
                        continue;
                    }

                    HashSet<int> idsContainingWord;
                    if (wordIDMapping.TryGetValue(Stemmer.GetStem(word), out idsContainingWord))
                    {
                        weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                    }
                }

                // Add some weighting so that longer sentences have more weight.
                weightedScore = weightedScore * (1 - (1 / Math.Pow(1.25, allWords.Length)));
                double avgScore = weightedScore / allWords.Length;
                if (avgScore > currMax)
                {
                    currMax = avgScore;
                    bestSentence = sentence;
                }
            }

            // Do the fallible work before touching any dictionary so the four maps always
            // hold the same keys; a partial write would make the lookups below throw.
            string commentTree = JsonConvert.SerializeObject(GetCommentTreeString(child));

            sentenceScores[bestSentence] = currMax;
            sentenceAuthors[bestSentence] = child.author;
            sentenceCommentTrees[bestSentence] = commentTree;
            sentenceIds[bestSentence] = child.id;
        }
        catch (Exception ex)
        {
            // Best effort: skip the failing comment, but leave a trace instead of hiding it.
            Console.WriteLine(ex.ToString());
        }
    }

    // Drop blank sentences before taking N so they cannot use up result slots.
    List<string> topSentences = sentenceScores
        .Where(x => !string.IsNullOrWhiteSpace(x.Key))
        .OrderByDescending(x => x.Value)
        .Take(N)
        .Select(x => x.Key)
        .ToList();

    List<SentenceObj> topSentenceObjs = new List<SentenceObj>();
    foreach (var sent in topSentences)
    {
        SentenceObj sentenceObj = new SentenceObj()
        {
            Author = sentenceAuthors[sent],
            Sentence = sent,
            SentenceCommentTree = sentenceCommentTrees[sent],
            Id = sentenceIds[sent],
            StoryId = this.id
        };
        topSentenceObjs.Add(sentenceObj);
    }

    return topSentenceObjs
        .OrderByDescending(x => GetChildCount(GetNodeById(x.Id)))
        .ToList();
}
/// <summary>
/// Finds up to <paramref name="N"/> capitalized, uncommon words in the comment tree and
/// returns, for each one, the comments that contain it.
/// </summary>
/// <remarks>
/// Side effects: calls <c>GetChildIDHashSetList</c> on every top-level child and rebuilds
/// <c>NodeList</c> with a synthetic root node followed by the nodes added by
/// <c>PopulateNodeList</c>.
/// Candidates are ranked by the number of nodes that either have the word in more than
/// one child branch, or have exactly one branch and that branch has the word. A candidate
/// must be longer than one character, start with an upper-case letter, have a
/// <c>CommonWords</c> frequency below 1, and not end in a contraction suffix.
/// </remarks>
/// <param name="N">Maximum number of words to return.</param>
/// <returns>
/// Dictionary from display word to the comments containing it, with entries added in
/// descending order of comment count.
/// </returns>
public Dictionary<string, List<CommentObj>> GetNamedObjects(int N)
{
    StringBuilder sbAllWords = new StringBuilder();
    foreach (children child in children)
    {
        sbAllWords.Append(child.SubtreeText);
        sbAllWords.Append(" ");
    }

    string[] allWords = GetAllWords(sbAllWords.ToString());
    Dictionary<string, string> stemParentDictionary = GetStemParentDictionary(allWords);

    // Synthetic root whose branches are the id sets of each top-level subtree.
    children rootNode = new children();
    List<HashSet<int>> rootChildIDs = new List<HashSet<int>>();
    foreach (children child in children)
    {
        GetChildIDHashSetList(child);
        HashSet<int> currChildIDs = new HashSet<int>();
        currChildIDs.Add(child.id);
        foreach (var item in child.ChildIDList)
        {
            currChildIDs.UnionWith(item);
        }
        rootChildIDs.Add(currChildIDs);
    }
    rootNode.ChildIDList = rootChildIDs;

    NodeList = new List<children>();
    NodeList.Add(rootNode);
    foreach (children child in children)
    {
        PopulateNodeList(child);
    }

    Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping();

    // For every stem, collect the nodes that qualify under the branch rule above.
    Dictionary<string, List<children>> wordLcaLists = new Dictionary<string, List<children>>();
    foreach (var kvp in wordIDMapping)
    {
        List<children> currLCAList = new List<children>();
        foreach (children node in NodeList)
        {
            int numBranchesWithWord = 0;
            foreach (var childIDBranch in node.ChildIDList)
            {
                // Overlaps stops at the first shared id instead of building the whole intersection.
                if (kvp.Value.Overlaps(childIDBranch))
                {
                    numBranchesWithWord += 1;
                }
            }

            if ((numBranchesWithWord == 1 && node.ChildIDList.Count == 1) || numBranchesWithWord > 1)
            {
                currLCAList.Add(node);
            }
        }

        // Prefer the stem's parent word as the display key; fall back to the stem itself.
        string parentWord;
        string displayWord = stemParentDictionary.TryGetValue(kvp.Key, out parentWord) ? parentWord : kvp.Key;
        wordLcaLists[displayWord] = currLCAList;
    }

    string[] contractionSuffixes = { "n't", "'m", "'ll", "'d", "'ve", "'re", "'s" };
    List<string> namedObjects = wordLcaLists
        .OrderByDescending(x => x.Value.Count)
        .Select(x => x.Key)
        // The length check must run before indexing [0], otherwise an empty key throws.
        .Where(w => w.Length > 1)
        .Where(w => char.IsUpper(w[0]))
        .Where(w => CommonWords.GetFrequency(w) < 1)
        .Where(w => !contractionSuffixes.Any(suffix => w.EndsWith(suffix, StringComparison.Ordinal)))
        .Take(N)
        .ToList();

    Dictionary<string, List<CommentObj>> namedObjectDictionary = new Dictionary<string, List<CommentObj>>();
    foreach (string namedObject in namedObjects)
    {
        // The display word is normally a surface form whose stem is the mapping key.
        // When it is the stem itself, re-stemming may produce a different string, so
        // fall back to the word as-is rather than throwing on a missing key.
        HashSet<int> idsWithWord;
        if (!wordIDMapping.TryGetValue(Stemmer.GetStem(namedObject), out idsWithWord)
            && !wordIDMapping.TryGetValue(namedObject, out idsWithWord))
        {
            continue;
        }

        List<CommentObj> commentObjsForWord = new List<CommentObj>();
        foreach (int id in idsWithWord)
        {
            children child = GetNodeById(id);
            CommentObj commentObj = new CommentObj() { Id = id, Text = child.text };
            commentObjsForWord.Add(commentObj);
        }
        namedObjectDictionary[namedObject] = commentObjsForWord;
    }

    return namedObjectDictionary
        .OrderByDescending(x => x.Value.Count)
        .ToDictionary(x => x.Key, x => x.Value);
}