Beispiel #1
0
        /// <summary>
        /// Analyzes <paramref name="content"/> and produces a ranked keyword list.
        /// Every term receives an X2 score, the sum over the frequent terms g (excluding the
        /// last entry of G and the term itself) of (Fwg - nw * Pg)^2 / (nw * Pg). Each title is
        /// scored by summing the scores of the matched words it contains, and titles and terms
        /// are then merged into one ranking.
        /// </summary>
        /// <param name="content">Text to analyze; stored unchanged on the returned analysis.</param>
        /// <returns>
        /// The analysis carrying the content, word count, paragraphs, titles and keywords.
        /// Keywords are taken from a dictionary that is filled in descending rank order, and
        /// each <c>Rank</c> is the exact computed score (tied scores are no longer nudged apart).
        /// </returns>
        public KeywordAnalysis Analyze(string content)
        {
            KeywordAnalysis analysis = new KeywordAnalysis { Content = content };
            int wordCount = 0;
            var titles = TitleExtractor.Extract(content);
            var paragraphs = WordScraper.ScrapeToParagraphs(content, out wordCount);

            // Flatten the words of every sentence of every paragraph into a single list.
            List<Word> allWords = new List<Word>();
            paragraphs.ForEach(p => p.Sentences.ForEach(s => allWords.AddRange(s.Words)));

            analysis.WordCount = wordCount;
            analysis.Paragraphs = paragraphs;
            analysis.Titles = titles;

            int termTotal = 0;

            // Occurrence count per term (per the original note: two and three word segments
            // of each sentence are counted by the helper).
            Dictionary<string, int> termOccurrenceCounts = GetWordTermOccurence(paragraphs);

            Dictionary<string, int> termNw = new Dictionary<string, int>();
            Dictionary<string, decimal> termsX2 = new Dictionary<string, decimal>();

            // Yields termsG (the frequent terms) and an initialized termsX2.
            SortedDictionary<decimal, string> termsG = SortTermsIntoProbabilities(termOccurrenceCounts, ref termsX2, ref termTotal);

            // Fills termPg (returned) and termNw (by reference).
            Dictionary<string, decimal> termPg = FillTermPgNwCollections(paragraphs, termsG, ref termNw, ref termTotal);

            // Fills the termFwg collection, indexed as termFwg[w][g].
            Dictionary<string, Dictionary<string, decimal>> termFwg = FillTermFwgCollection(paragraphs, termsG);

            // SortedDictionary enumerates in ascending key order, so the last entry of
            // this array is the term with the largest key in G.
            string[] terms = new string[termsG.Count];
            termsG.Values.CopyTo(terms, 0);

            foreach (string w in terms)
            {
                decimal sumZ = 0;
                for (int i = 0; i < terms.Length - 1; i++) // every g except the last (MAX) one
                {
                    string g = terms[i];
                    if (w == g)
                    {
                        continue; // skip the diagonal
                    }

                    int nw = termNw[w];
                    decimal Pg = termPg[g];
                    decimal D = nw * Pg;
                    if (D != 0.0m) // a zero denominator contributes nothing
                    {
                        decimal Fwg = termFwg[w][g];
                        decimal T = Fwg - D;
                        sumZ += (T * T) / D;
                    }
                }
                termsX2[w] = sumZ;
            }

            // Map every stem back to a display term, visiting stems from highest to lowest X2.
            // OrderByDescending is a stable sort, so equal scores keep the enumeration order of
            // termsX2. This replaces the former trick of subtracting 0.00001 from a score until
            // it became a unique SortedDictionary key.
            Dictionary<string, decimal> preres = new Dictionary<string, decimal>();
            foreach (KeyValuePair<string, decimal> pair in termsX2.OrderByDescending(p => p.Value))
            {
                string term = GetTermFromStemTerm(allWords, pair.Key);

                // If several stems resolve to the same term, the one visited last wins
                // (unchanged from the original behavior).
                preres[term] = pair.Value;
            }

            // Post-process titles: a title's score is the sum of the scores of the known terms
            // matched in its text. Case-list words are left alone here.
            List<KeyValuePair<string, decimal>> candidates = new List<KeyValuePair<string, decimal>>();
            foreach (var title in titles)
            {
                decimal tscore = 0.0m;
                MatchCollection mc = WordScraper.WordReg.Matches(title.Text);
                foreach (Match m in mc)
                {
                    decimal termScore;
                    if (preres.TryGetValue(m.Value, out termScore))
                    {
                        tscore += termScore;
                    }
                }
                candidates.Add(new KeyValuePair<string, decimal>(title.Text, tscore));
            }

            // Mix the single terms in after the titles; on equal scores titles therefore stay
            // ahead of terms, and terms keep their preres order.
            candidates.AddRange(preres);

            // Highest score first. When the same text occurs more than once, only its
            // best-scored entry is kept. No cap is applied (the original had a disabled
            // 50-item limit), so every entry is returned.
            Dictionary<string, decimal> result = new Dictionary<string, decimal>();
            foreach (KeyValuePair<string, decimal> candidate in candidates.OrderByDescending(c => c.Value))
            {
                if (!result.ContainsKey(candidate.Key))
                {
                    result.Add(candidate.Key, candidate.Value);
                }
            }

            analysis.Keywords = from n in result select new Keyword { Word = n.Key, Rank = n.Value };
            return analysis;
        }
        /// <summary>
        /// Analyzes <paramref name="content"/> and produces a ranked keyword list.
        /// Every term receives an X2 score, the sum over the frequent terms g (excluding the
        /// last entry of G and the term itself) of (Fwg - nw * Pg)^2 / (nw * Pg). Each title is
        /// scored by summing the scores of the matched words it contains, and titles and terms
        /// are then merged into one ranking.
        /// </summary>
        /// <param name="content">Text to analyze; stored unchanged on the returned analysis.</param>
        /// <returns>
        /// The analysis carrying the content, word count, paragraphs, titles and keywords.
        /// Keywords are taken from a dictionary that is filled in descending rank order, and
        /// each <c>Rank</c> is the exact computed score (tied scores are no longer nudged apart).
        /// </returns>
        public KeywordAnalysis Analyze(string content)
        {
            KeywordAnalysis analysis = new KeywordAnalysis { Content = content };
            int wordCount = 0;
            var titles = TitleExtractor.Extract(content);
            var paragraphs = WordScraper.ScrapeToParagraphs(content, out wordCount);

            // Flatten the words of every sentence of every paragraph into a single list.
            List<Word> allWords = new List<Word>();
            paragraphs.ForEach(p => p.Sentences.ForEach(s => allWords.AddRange(s.Words)));

            analysis.WordCount = wordCount;
            analysis.Paragraphs = paragraphs;
            analysis.Titles = titles;

            int termTotal = 0;

            // Occurrence count per term (per the original note: two and three word segments
            // of each sentence are counted by the helper).
            Dictionary<string, int> termOccurrenceCounts = GetWordTermOccurence(paragraphs);

            Dictionary<string, int> termNw = new Dictionary<string, int>();
            Dictionary<string, decimal> termsX2 = new Dictionary<string, decimal>();

            // Yields termsG (the frequent terms) and an initialized termsX2.
            SortedDictionary<decimal, string> termsG = SortTermsIntoProbabilities(termOccurrenceCounts, ref termsX2, ref termTotal);

            // Fills termPg (returned) and termNw (by reference).
            Dictionary<string, decimal> termPg = FillTermPgNwCollections(paragraphs, termsG, ref termNw, ref termTotal);

            // Fills the termFwg collection, indexed as termFwg[w][g].
            Dictionary<string, Dictionary<string, decimal>> termFwg = FillTermFwgCollection(paragraphs, termsG);

            // SortedDictionary enumerates in ascending key order, so the last entry of
            // this array is the term with the largest key in G.
            string[] terms = new string[termsG.Count];
            termsG.Values.CopyTo(terms, 0);

            foreach (string w in terms)
            {
                decimal sumZ = 0;
                for (int i = 0; i < terms.Length - 1; i++) // every g except the last (MAX) one
                {
                    string g = terms[i];
                    if (w == g)
                    {
                        continue; // skip the diagonal
                    }

                    int nw = termNw[w];
                    decimal Pg = termPg[g];
                    decimal D = nw * Pg;
                    if (D != 0.0m) // a zero denominator contributes nothing
                    {
                        decimal Fwg = termFwg[w][g];
                        decimal T = Fwg - D;
                        sumZ += (T * T) / D;
                    }
                }
                termsX2[w] = sumZ;
            }

            // Map every stem back to a display term, visiting stems from highest to lowest X2.
            // OrderByDescending is a stable sort, so equal scores keep the enumeration order of
            // termsX2. This replaces the former trick of subtracting 0.00001 from a score until
            // it became a unique SortedDictionary key.
            Dictionary<string, decimal> preres = new Dictionary<string, decimal>();
            foreach (KeyValuePair<string, decimal> pair in termsX2.OrderByDescending(p => p.Value))
            {
                string term = GetTermFromStemTerm(allWords, pair.Key);

                // If several stems resolve to the same term, the one visited last wins
                // (unchanged from the original behavior).
                preres[term] = pair.Value;
            }

            // Post-process titles: a title's score is the sum of the scores of the known terms
            // matched in its text. Case-list words are left alone here.
            List<KeyValuePair<string, decimal>> candidates = new List<KeyValuePair<string, decimal>>();
            foreach (var title in titles)
            {
                decimal tscore = 0.0m;
                MatchCollection mc = WordScraper.WordReg.Matches(title.Text);
                foreach (Match m in mc)
                {
                    decimal termScore;
                    if (preres.TryGetValue(m.Value, out termScore))
                    {
                        tscore += termScore;
                    }
                }
                candidates.Add(new KeyValuePair<string, decimal>(title.Text, tscore));
            }

            // Mix the single terms in after the titles; on equal scores titles therefore stay
            // ahead of terms, and terms keep their preres order.
            candidates.AddRange(preres);

            // Highest score first. When the same text occurs more than once, only its
            // best-scored entry is kept. No cap is applied (the original had a disabled
            // 50-item limit), so every entry is returned.
            Dictionary<string, decimal> result = new Dictionary<string, decimal>();
            foreach (KeyValuePair<string, decimal> candidate in candidates.OrderByDescending(c => c.Value))
            {
                if (!result.ContainsKey(candidate.Key))
                {
                    result.Add(candidate.Key, candidate.Value);
                }
            }

            analysis.Keywords = from n in result select new Keyword { Word = n.Key, Rank = n.Value };
            return analysis;
        }