예제 #1
0
        private Dictionary <string, string> GetStemParentDictionary(string[] allWords)
        {
            Dictionary <string, string> stemParentDictionary = new Dictionary <string, string>();
            Dictionary <string, int>    parentWordCount      = new Dictionary <string, int>();

            foreach (string word in allWords)
            {
                string stem = Stemmer.GetStem(word);
                if (!parentWordCount.ContainsKey(word))
                {
                    parentWordCount[word] = 0;
                }
                parentWordCount[word] += 1;
                if (stemParentDictionary.ContainsKey(stem))
                {
                    if (parentWordCount[word] > parentWordCount[stemParentDictionary[stem]])
                    {
                        stemParentDictionary[stem] = word;
                    }
                }
                else
                {
                    stemParentDictionary[stem] = word;
                }
            }
            return(stemParentDictionary);
        }
예제 #2
0
        public void Verify_Basic()
        {
            var input   = "caresses";
            var stemmed = Stemmer.GetStem(input);

            Assert.AreEqual(stemmed, "caress");
        }
예제 #3
0
        public void TestStemming()
        {
            string word    = "caresses";
            string stemmed = Stemmer.GetStem(word);

            Assert.AreEqual(stemmed, "caress");
        }
예제 #4
0
        public Dictionary <string, int> GetTopNWordsDictionary(int N)
        {
            string[] ignoreWords = { "*" };

            Dictionary <string, int> wordCount  = new Dictionary <string, int>();
            StringBuilder            sbFullText = new StringBuilder();

            foreach (children child in this.children)
            {
                sbFullText.Append(child.SubtreeText);
                sbFullText.Append(" ");
            }
            string[] allWords = GetAllWords(sbFullText.ToString());
            wordCount = new Dictionary <string, int>();


            Dictionary <string, string> stemParent = new Dictionary <string, string>();

            foreach (string word in allWords)
            {
                try
                {
                    string stemmed = Stemmer.GetStem(word);
                    if (stemParent.ContainsKey(stemmed))
                    {
                        if (stemParent[stemmed].Length < word.Length)
                        {
                            stemParent[stemmed] = word;
                        }
                    }
                    else
                    {
                        stemParent[stemmed] = word;
                    }
                    if (stopWords.Contains(stemmed.ToLower()))
                    {
                        continue;
                    }
                    if (!wordCount.ContainsKey(stemmed) && !ignoreWords.Contains(stemmed))
                    {
                        wordCount[stemmed] = 1;
                    }
                    else
                    {
                        wordCount[stemmed] += 1;
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.ToString());
                }
            }
            wordCount = wordCount.OrderByDescending(x => x.Value).Take(N).ToDictionary(kvp => stemParent[kvp.Key], kvp => kvp.Value);
            return(wordCount);
        }
예제 #5
0
        Dictionary <string, HashSet <int> > GetWordIDMapping(children child)
        {
            Dictionary <string, HashSet <int> > wordIDMapping = new Dictionary <string, HashSet <int> >();

            string[] allWords = GetAllWords(child.text);

            foreach (string word in allWords)
            {
                if (stopWords.Contains(word.ToLower()))
                {
                    continue;
                }
                if (word.Length < 3 && word.Any(c => char.IsLower(c)))
                {
                    continue;
                }
                string stem = Stemmer.GetStem(word);
                if (!wordIDMapping.ContainsKey(stem))
                {
                    wordIDMapping[stem] = new HashSet <int>();
                }
                wordIDMapping[stem].Add(child.id);
            }
            foreach (children childitem in child.Children)
            {
                Dictionary <string, HashSet <int> > mapping = GetWordIDMapping(childitem);
                foreach (var kvp in mapping)
                {
                    if (wordIDMapping.ContainsKey(kvp.Key))
                    {
                        wordIDMapping[kvp.Key].UnionWith(kvp.Value);
                    }
                    else
                    {
                        wordIDMapping[kvp.Key] = kvp.Value;
                    }
                }
            }
            return(wordIDMapping);
        }
예제 #6
0
        /*
         * This method sentence-tokenizes all top level comments
         * The best sentences are those where the words in the sentence
         * occur in the most number of subtree items within the current
         * top level comment
         */
        public List <SentenceObj> GetTopSentences(int N)
        {
            List <SentenceObj>          topSentenceObjs      = new List <SentenceObj>();
            List <string>               topSentences         = new List <string>();
            Dictionary <string, double> sentenceScores       = new Dictionary <string, double>();
            Dictionary <string, string> sentenceAuthors      = new Dictionary <string, string>();
            Dictionary <string, string> sentenceCommentTrees = new Dictionary <string, string>();
            Dictionary <string, int>    sentenceIds          = new Dictionary <string, int>();

            foreach (children child in children)
            {
                try
                {
                    Dictionary <string, HashSet <int> > wordIDMapping = GetWordIDMapping(child);
                    string        text          = child.text;
                    List <string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(text));
                    string        bestSentence  = currSentences[0];
                    double        currMax       = double.MinValue;
                    foreach (string sentence in currSentences)
                    {
                        string[] allWords     = GetAllWords(sentence);
                        bool     goodSentence = (allWords.Length > 2) && (stopWords.Where(x => !allWords.Contains(x.ToLower())).Count() > 2);
                        if (goodSentence)
                        {
                            double weightedScore = 0;
                            int    totalIDCount  = 0;
                            foreach (string word in allWords)
                            {
                                if (!stopWords.Contains(word.ToLower()))
                                {
                                    string stemmedWord = Stemmer.GetStem(word);
                                    if (wordIDMapping.ContainsKey(stemmedWord))
                                    {
                                        HashSet <int> idsContainingWord = wordIDMapping[stemmedWord];
                                        totalIDCount  += idsContainingWord.Count;
                                        weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                                    }
                                }
                            }
                            //add some weighting so that longer sentences have more weight
                            weightedScore = weightedScore * (1 - (1 / (Math.Pow(1.25, allWords.Length))));
                            double avgScore = weightedScore / allWords.Length;
                            if (avgScore > currMax)
                            {
                                currMax      = avgScore;
                                bestSentence = sentence;
                            }
                        }
                    }
                    sentenceScores[bestSentence]       = currMax;
                    sentenceAuthors[bestSentence]      = child.author;
                    sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
                    sentenceIds[bestSentence]          = child.id;
                }
                catch (Exception ex)
                {
                }
            }
            topSentences = sentenceScores.OrderByDescending(x => x.Value).Take(N).Where(y => !string.IsNullOrWhiteSpace(y.Key)).Select(x => x.Key).ToList();
            foreach (var sent in topSentences)
            {
                SentenceObj sentenceObj = new SentenceObj()
                {
                    Author = sentenceAuthors[sent], Sentence = sent, SentenceCommentTree = sentenceCommentTrees[sent], Id = sentenceIds[sent], StoryId = this.id
                };
                topSentenceObjs.Add(sentenceObj);
            }
            topSentenceObjs = topSentenceObjs.OrderByDescending(x => GetChildCount(GetNodeById(x.Id))).ToList();
            return(topSentenceObjs);
        }
예제 #7
0
        public Dictionary <string, List <CommentObj> > GetNamedObjects(int N)
        {
            StringBuilder sbAllWords = new StringBuilder();

            foreach (children child in children)
            {
                sbAllWords.Append(child.SubtreeText);
                sbAllWords.Append(" ");
            }
            string[] allWords = GetAllWords(sbAllWords.ToString());
            Dictionary <string, string> stemParentDictionary = GetStemParentDictionary(allWords);
            List <string>         namedObjects = new List <string>();
            children              rootNode     = new children();
            List <HashSet <int> > rootChildIDs = new List <HashSet <int> >();

            foreach (children child in children)
            {
                GetChildIDHashSetList(child);
                HashSet <int> currChildIDs = new HashSet <int>();
                currChildIDs.Add(child.id);
                foreach (var item in child.ChildIDList)
                {
                    currChildIDs.UnionWith(item);
                }
                rootChildIDs.Add(currChildIDs);
            }
            rootNode.ChildIDList = rootChildIDs;
            NodeList             = new List <children>();
            NodeList.Add(rootNode);
            foreach (children child in children)
            {
                PopulateNodeList(child);
            }
            Dictionary <string, HashSet <int> > wordIDMapping = GetWordIDMapping();
            //Dictionary<string, double> WordTreeScore = new Dictionary<string, double>();
            Dictionary <string, List <children> > WordLCAList = new Dictionary <string, List <children> >();

            foreach (var kvp in wordIDMapping)
            {
                List <children> currLCAList = new List <children>();
                int             numLCAs     = 0;
                foreach (children node in NodeList)
                {
                    int numBranchesWithWord = 0;
                    foreach (var childIDBranch in node.ChildIDList)
                    {
                        if (childIDBranch.Intersect(kvp.Value).Count() > 0)
                        {
                            numBranchesWithWord += 1;
                        }
                    }
                    if ((numBranchesWithWord == 1 && node.ChildIDList.Count == 1) || numBranchesWithWord > 1)
                    {
                        currLCAList.Add(node);
                    }
                }
                WordLCAList[stemParentDictionary.ContainsKey(kvp.Key) ? stemParentDictionary[kvp.Key] : kvp.Key] = currLCAList;
            }
            namedObjects = WordLCAList
                           .OrderByDescending(x => x.Value.Count)
                           .Select(x => x.Key)
                           .Where(y => CommonWords.GetFrequency(y) < 1)
                           .Where(a => char.IsUpper(a[0]))
                           .Where(b => b.Length > 1)
                           .Where(z => !(z.EndsWith("n't") || z.EndsWith("'m") || (z.EndsWith("'ll")) || (z.EndsWith("'d")) || z.EndsWith("'ve") || z.EndsWith("'re") || z.EndsWith("'s")))
                           .Take(N)
                           .ToList();
            //namedObjects.Sort();
            Dictionary <string, List <CommentObj> > namedObjectDictionary = new Dictionary <string, List <CommentObj> >();

            foreach (string namedObject in namedObjects)
            {
                List <CommentObj> commentObjsForWord = new List <CommentObj>();
                string            stem        = Stemmer.GetStem(namedObject);
                HashSet <int>     idsWithWord = wordIDMapping[stem];
                foreach (int id in idsWithWord)
                {
                    children   child      = GetNodeById(id);
                    CommentObj commentObj = new CommentObj()
                    {
                        Id = id, Text = child.text
                    };
                    commentObjsForWord.Add(commentObj);
                }
                namedObjectDictionary[namedObject] = commentObjsForWord;
            }
            var ordered = namedObjectDictionary.Keys.OrderByDescending(x => namedObjectDictionary[x].Count).ToList().ToDictionary(x => x, x => namedObjectDictionary[x]);

            return(ordered);
        }