/*
 * This method sentence-tokenizes all top-level comments.
 * The best sentences are those whose words occur in the largest
 * number of subtree items within the current top-level comment.
 */
public List<SentenceObj> GetTopSentences(int N)
{
    List<SentenceObj> topSentenceObjs = new List<SentenceObj>();
    Dictionary<string, double> sentenceScores = new Dictionary<string, double>();
    Dictionary<string, string> sentenceAuthors = new Dictionary<string, string>();
    Dictionary<string, string> sentenceCommentTrees = new Dictionary<string, string>();
    Dictionary<string, int> sentenceIds = new Dictionary<string, int>();

    foreach (children child in children)
    {
        try
        {
            // Maps each stemmed word to the set of subtree comment ids containing it.
            Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping(child);
            List<string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(child.text));
            if (currSentences.Count == 0)
            {
                continue;
            }

            string bestSentence = currSentences[0];
            double currMax = double.MinValue;

            foreach (string sentence in currSentences)
            {
                string[] allWords = GetAllWords(sentence);

                // Only consider sentences with more than two words overall
                // and more than two non-stopword (content) words.
                bool goodSentence = allWords.Length > 2
                    && allWords.Count(w => !stopWords.Contains(w.ToLower())) > 2;
                if (!goodSentence)
                {
                    continue;
                }

                double weightedScore = 0;
                foreach (string word in allWords)
                {
                    if (stopWords.Contains(word.ToLower()))
                    {
                        continue;
                    }

                    string stemmedWord = Stemmer.GetStem(word);
                    if (wordIDMapping.ContainsKey(stemmedWord))
                    {
                        // A word counts once per subtree comment containing it,
                        // discounted by how common the word is in general usage.
                        HashSet<int> idsContainingWord = wordIDMapping[stemmedWord];
                        weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                    }
                }

                // Add some length weighting so that longer sentences carry more
                // weight: the factor approaches 1 as sentence length grows, so
                // the per-word average below penalizes short sentences more.
                weightedScore *= 1 - 1 / Math.Pow(1.25, allWords.Length);
                double avgScore = weightedScore / allWords.Length;
                if (avgScore > currMax)
                {
                    currMax = avgScore;
                    bestSentence = sentence;
                }
            }

            sentenceScores[bestSentence] = currMax;
            sentenceAuthors[bestSentence] = child.author;
            sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
            sentenceIds[bestSentence] = child.id;
        }
        catch (Exception)
        {
            // Skip top-level comments that fail to tokenize or score.
        }
    }

    // Filter out blank sentences before taking the top N, so we do not
    // return fewer results than requested.
    List<string> topSentences = sentenceScores
        .Where(kv => !string.IsNullOrWhiteSpace(kv.Key))
        .OrderByDescending(kv => kv.Value)
        .Take(N)
        .Select(kv => kv.Key)
        .ToList();

    foreach (var sent in topSentences)
    {
        topSentenceObjs.Add(new SentenceObj
        {
            Author = sentenceAuthors[sent],
            Sentence = sent,
            SentenceCommentTree = sentenceCommentTrees[sent],
            Id = sentenceIds[sent],
            StoryId = this.id
        });
    }

    // Present the sentences whose comment subtrees are largest first.
    topSentenceObjs = topSentenceObjs.OrderByDescending(x => GetChildCount(GetNodeById(x.Id))).ToList();
    return topSentenceObjs;
}
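For context, here is a minimal sketch of how the method might be called. The Story host type, its Load helper, and storyId are illustrative assumptions; only GetTopSentences and SentenceObj come from the code above.

// Hypothetical usage sketch: Story, Story.Load, and storyId are
// illustrative names, not part of the code above.
Story story = Story.Load(storyId);
List<SentenceObj> top = story.GetTopSentences(5);
foreach (SentenceObj s in top)
{
    // Each result carries the best sentence of one top-level comment,
    // its author, and the serialized comment tree it came from.
    Console.WriteLine(s.Author + ": " + s.Sentence);
}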