Пример #1
0
        /*
         * This method sentence-tokenizes every top-level comment.
         * The best sentences are those whose words occur in the
         * largest number of subtree items within the current
         * top-level comment.
         */
        /// <summary>
        /// Picks one representative sentence from each item in <c>children</c>
        /// and returns up to <paramref name="N"/> of the highest-scoring ones.
        /// </summary>
        /// <remarks>
        /// For every child, the stripped text is split into sentences and each
        /// eligible sentence is scored: every non-stop word whose stem appears in
        /// the child's word-to-id mapping contributes
        /// <c>ids.Count / (CommonWords.GetFrequency(word) + 1)</c>. The sum is
        /// damped for short sentences and divided by the sentence's word count.
        /// If no sentence of a child is eligible, its first sentence is kept with
        /// a score of <see cref="double.MinValue"/>, so it sorts last.
        /// Results are keyed by sentence text, so when two children produce the
        /// same best sentence the later child replaces the earlier one.
        /// </remarks>
        /// <param name="N">Maximum number of sentences to return.</param>
        /// <returns>
        /// The selected sentences, ordered by descending
        /// <c>GetChildCount(GetNodeById(Id))</c>.
        /// </returns>
        public List<SentenceObj> GetTopSentences(int N)
        {
            // Score and payload are stored together, only after the payload has been
            // built completely, so a failure while processing a child can never leave
            // a score behind without its matching SentenceObj.
            Dictionary<string, double> sentenceScores = new Dictionary<string, double>();
            Dictionary<string, SentenceObj> sentenceObjs = new Dictionary<string, SentenceObj>();

            foreach (children child in children)
            {
                try
                {
                    Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping(child);
                    List<string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(child.text));
                    if (currSentences == null || currSentences.Count == 0)
                    {
                        // Nothing to rank for this child; skip it explicitly instead of
                        // relying on an index-out-of-range exception.
                        continue;
                    }

                    string bestSentence = currSentences[0];
                    double currMax = double.MinValue;
                    foreach (string sentence in currSentences)
                    {
                        string[] allWords = GetAllWords(sentence);
                        if (allWords.Length <= 2)
                        {
                            continue;
                        }

                        // A sentence is only eligible when it has more than two non-stop
                        // words. (The previous check counted stop words that were absent
                        // from the sentence, which is practically always true.)
                        // ToLowerInvariant keeps the lookup independent of the current culture.
                        List<string> contentWords = allWords
                            .Where(w => !stopWords.Contains(w.ToLowerInvariant()))
                            .ToList();
                        if (contentWords.Count <= 2)
                        {
                            continue;
                        }

                        double weightedScore = 0;
                        foreach (string word in contentWords)
                        {
                            HashSet<int> idsContainingWord;
                            if (wordIDMapping.TryGetValue(Stemmer.GetStem(word), out idsContainingWord))
                            {
                                // The "+ 1" keeps the divisor non-zero when the reported frequency is 0.
                                weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                            }
                        }

                        // Damping factor approaches 1 as the sentence gets longer, so longer
                        // sentences keep more of their raw score.
                        weightedScore = weightedScore * (1 - (1 / Math.Pow(1.25, allWords.Length)));
                        double avgScore = weightedScore / allWords.Length;
                        if (avgScore > currMax)
                        {
                            currMax = avgScore;
                            bestSentence = sentence;
                        }
                    }

                    SentenceObj sentenceObj = new SentenceObj()
                    {
                        Author = child.author,
                        Sentence = bestSentence,
                        SentenceCommentTree = JsonConvert.SerializeObject(GetCommentTreeString(child)),
                        Id = child.id,
                        StoryId = this.id
                    };
                    sentenceScores[bestSentence] = currMax;
                    sentenceObjs[bestSentence] = sentenceObj;
                }
                catch (Exception ex)
                {
                    // Best effort: one child that fails to process must not abort the
                    // whole ranking, but the failure should not vanish without a trace.
                    System.Diagnostics.Trace.TraceWarning("GetTopSentences: skipped a comment: {0}", ex);
                }
            }

            // Drop blank sentences BEFORE taking the top N so that blanks do not use up
            // result slots. OrderByDescending is a stable sort, so entries with equal
            // child counts stay in score order.
            return sentenceScores
                .Where(entry => !string.IsNullOrWhiteSpace(entry.Key))
                .OrderByDescending(entry => entry.Value)
                .Take(N)
                .Select(entry => sentenceObjs[entry.Key])
                .OrderByDescending(obj => GetChildCount(GetNodeById(obj.Id)))
                .ToList();
        }
Пример #2
0
        /*
         * This method sentence-tokenizes every top-level comment.
         * The best sentences are those whose words occur in the
         * largest number of subtree items within the current
         * top-level comment.
         */
        /// <summary>
        /// Picks one representative sentence from each item in <c>children</c>
        /// and returns up to <paramref name="N"/> of the highest-scoring ones.
        /// </summary>
        /// <remarks>
        /// For every child, the stripped text is split into sentences and each
        /// eligible sentence is scored: every non-stop word whose stem appears in
        /// the child's word-to-id mapping contributes
        /// <c>ids.Count / (CommonWords.GetFrequency(word) + 1)</c>. The sum is
        /// damped for short sentences and divided by the sentence's word count.
        /// If no sentence of a child is eligible, its first sentence is kept with
        /// a score of <see cref="double.MinValue"/>, so it sorts last.
        /// Results are keyed by sentence text, so when two children produce the
        /// same best sentence the later child replaces the earlier one.
        /// </remarks>
        /// <param name="N">Maximum number of sentences to return.</param>
        /// <returns>
        /// The selected sentences, ordered by descending
        /// <c>GetChildCount(GetNodeById(Id))</c>.
        /// </returns>
        public List<SentenceObj> GetTopSentences(int N)
        {
            // Score and payload are stored together, only after the payload has been
            // built completely, so a failure while processing a child can never leave
            // a score behind without its matching SentenceObj.
            Dictionary<string, double> sentenceScores = new Dictionary<string, double>();
            Dictionary<string, SentenceObj> sentenceObjs = new Dictionary<string, SentenceObj>();

            foreach (children child in children)
            {
                try
                {
                    Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping(child);
                    List<string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(child.text));
                    if (currSentences == null || currSentences.Count == 0)
                    {
                        // Nothing to rank for this child; skip it explicitly instead of
                        // relying on an index-out-of-range exception.
                        continue;
                    }

                    string bestSentence = currSentences[0];
                    double currMax = double.MinValue;
                    foreach (string sentence in currSentences)
                    {
                        string[] allWords = GetAllWords(sentence);
                        if (allWords.Length <= 2)
                        {
                            continue;
                        }

                        // A sentence is only eligible when it has more than two non-stop
                        // words. (The previous check counted stop words that were absent
                        // from the sentence, which is practically always true.)
                        // ToLowerInvariant keeps the lookup independent of the current culture.
                        List<string> contentWords = allWords
                            .Where(w => !stopWords.Contains(w.ToLowerInvariant()))
                            .ToList();
                        if (contentWords.Count <= 2)
                        {
                            continue;
                        }

                        double weightedScore = 0;
                        foreach (string word in contentWords)
                        {
                            HashSet<int> idsContainingWord;
                            if (wordIDMapping.TryGetValue(Stemmer.GetStem(word), out idsContainingWord))
                            {
                                // The "+ 1" keeps the divisor non-zero when the reported frequency is 0.
                                weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                            }
                        }

                        // Damping factor approaches 1 as the sentence gets longer, so longer
                        // sentences keep more of their raw score.
                        weightedScore = weightedScore * (1 - (1 / Math.Pow(1.25, allWords.Length)));
                        double avgScore = weightedScore / allWords.Length;
                        if (avgScore > currMax)
                        {
                            currMax = avgScore;
                            bestSentence = sentence;
                        }
                    }

                    SentenceObj sentenceObj = new SentenceObj()
                    {
                        Author = child.author,
                        Sentence = bestSentence,
                        SentenceCommentTree = JsonConvert.SerializeObject(GetCommentTreeString(child)),
                        Id = child.id,
                        StoryId = this.id
                    };
                    sentenceScores[bestSentence] = currMax;
                    sentenceObjs[bestSentence] = sentenceObj;
                }
                catch (Exception ex)
                {
                    // Best effort: one child that fails to process must not abort the
                    // whole ranking, but the failure should not vanish without a trace.
                    System.Diagnostics.Trace.TraceWarning("GetTopSentences: skipped a comment: {0}", ex);
                }
            }

            // Drop blank sentences BEFORE taking the top N so that blanks do not use up
            // result slots. OrderByDescending is a stable sort, so entries with equal
            // child counts stay in score order.
            return sentenceScores
                .Where(entry => !string.IsNullOrWhiteSpace(entry.Key))
                .OrderByDescending(entry => entry.Value)
                .Take(N)
                .Select(entry => sentenceObjs[entry.Key])
                .OrderByDescending(obj => GetChildCount(GetNodeById(obj.Id)))
                .ToList();
        }