/// <summary>
/// Analyzes <paramref name="content"/>: extracts its titles and paragraphs, scores every term,
/// and stores the scored terms and titles as keywords on the returned analysis.
/// </summary>
/// <param name="content">
/// Text to analyze. It is passed unchanged to <c>TitleExtractor.Extract</c> and
/// <c>WordScraper.ScrapeToParagraphs</c>.
/// </param>
/// <returns>
/// A <see cref="KeywordAnalysis"/> with <c>Content</c>, <c>WordCount</c>, <c>Paragraphs</c>,
/// <c>Titles</c> and <c>Keywords</c> populated. Keywords are added from the highest score to the
/// lowest, one entry per distinct term or title text; no upper limit is applied.
/// </returns>
public KeywordAnalysis Analyze(string content)
{
    // SortedDictionary cannot hold duplicate keys, so a colliding score is lowered by this step
    // until it is unique. NOTE(review): this slightly perturbs tied scores, and the perturbed
    // value is what ends up in Keyword.Rank; kept as-is because callers may rely on it.
    const decimal TieBreakStep = 0.00001m;

    KeywordAnalysis analysis = new KeywordAnalysis { Content = content };

    int wordCount = 0;
    var titles = TitleExtractor.Extract(content);
    var paragraphs = WordScraper.ScrapeToParagraphs(content, out wordCount);

    // Flatten every word of every sentence into one list; used below to map stem terms back to terms.
    List<Word> allWords = new List<Word>();
    paragraphs.ForEach(p => p.Sentences.ForEach(s => allWords.AddRange(s.Words)));

    analysis.WordCount = wordCount;
    analysis.Paragraphs = paragraphs;
    analysis.Titles = titles;

    int termTotal = 0;

    // Occurrence counts per term (per the original note: two- and three-word segments of each sentence).
    Dictionary<string, int> termOccurrenceCounts = GetWordTermOccurence(paragraphs);
    Dictionary<string, int> termNw = new Dictionary<string, int>();
    Dictionary<string, decimal> termsX2 = new Dictionary<string, decimal>();

    // Yields termsG (the frequent terms, keyed by a decimal) and an initialized termsX2.
    SortedDictionary<decimal, string> termsG = SortTermsIntoProbabilities(termOccurrenceCounts, ref termsX2, ref termTotal);

    // Fills termNw (by ref) and returns termPg.
    Dictionary<string, decimal> termPg = FillTermPgNwCollections(paragraphs, termsG, ref termNw, ref termTotal);

    // Per-term-pair values, indexed as termFwg[w][g].
    Dictionary<string, Dictionary<string, decimal>> termFwg = FillTermFwgCollection(paragraphs, termsG);

    // termsG is sorted ascending by key, so the last element of terms carries the largest key.
    string[] terms = new string[termsG.Count];
    termsG.Values.CopyTo(terms, 0);

    // For each term w, accumulate (Fwg - nw*Pg)^2 / (nw*Pg) over every other term g,
    // skipping the last (largest-key) entry of terms.
    foreach (string w in terms)
    {
        decimal sumZ = 0;
        for (int i = 0; i < terms.Length - 1; i++)
        {
            string g = terms[i];
            if (w == g)
            {
                continue; // diagonal: a term is not compared with itself
            }

            int nw = termNw[w];
            decimal pg = termPg[g];
            decimal expected = nw * pg;
            if (expected != 0.0m) // avoid dividing by zero
            {
                decimal fwg = termFwg[w][g];
                decimal delta = fwg - expected;
                sumZ += (delta * delta) / expected;
            }
        }

        termsX2[w] = sumZ;
    }

    // Order the stem terms by X2 score (ascending), breaking ties with TieBreakStep.
    SortedDictionary<decimal, string> sortedX2 = new SortedDictionary<decimal, string>();
    foreach (KeyValuePair<string, decimal> pair in termsX2)
    {
        decimal x2 = pair.Value;
        while (sortedX2.ContainsKey(x2))
        {
            x2 -= TieBreakStep;
        }

        sortedX2.Add(x2, pair.Key);
    }

    // Walk the stem terms from highest to lowest X2 and map each back to its term. When several
    // stem terms resolve to the same term, the one visited last (lowest X2) supplies the score.
    Dictionary<string, decimal> preres = new Dictionary<string, decimal>();
    foreach (string stemTerm in sortedX2.Values.Reverse())
    {
        string term = GetTermFromStemTerm(allWords, stemTerm);
        preres[term] = termsX2[stemTerm];
    }

    // Score each title as the sum of the scores of the words matched in its text.
    // Case-list words get no extra post-processing here; per the original note they were already
    // split and added to the sentence end for ranking.
    SortedDictionary<decimal, string> tsort = new SortedDictionary<decimal, string>();
    foreach (var title in titles)
    {
        decimal tscore = 0.0m;
        MatchCollection mc = WordScraper.WordReg.Matches(title.Text);
        foreach (Match m in mc)
        {
            decimal wordScore;
            if (preres.TryGetValue(m.Value, out wordScore))
            {
                tscore += wordScore;
            }
        }

        while (tsort.ContainsKey(tscore))
        {
            tscore -= TieBreakStep;
        }

        tsort.Add(tscore, title.Text);
    }

    // Merge the individual term scores into the same sorted collection as the titles.
    foreach (KeyValuePair<string, decimal> pre in preres)
    {
        decimal x = pre.Value;
        while (tsort.ContainsKey(x))
        {
            x -= TieBreakStep;
        }

        tsort.Add(x, pre.Key);
    }

    // Highest score first; a text that appears both as a title and as a term keeps its highest score.
    // No cap is applied (the former top-50 cut-off was disabled).
    Dictionary<string, decimal> result = new Dictionary<string, decimal>();
    foreach (KeyValuePair<decimal, string> entry in tsort.Reverse())
    {
        if (!result.ContainsKey(entry.Value))
        {
            result.Add(entry.Value, entry.Key);
        }
    }

    // Materialize once so every enumeration of Keywords sees the same Keyword instances
    // instead of re-running the projection and creating new objects each time.
    analysis.Keywords = result
        .Select(n => new Keyword { Word = n.Key, Rank = n.Value })
        .ToList();

    return analysis;
}
/// <summary>
/// Analyzes <paramref name="content"/>: extracts its titles and paragraphs, scores every term,
/// and stores the scored terms and titles as keywords on the returned analysis.
/// </summary>
/// <param name="content">
/// Text to analyze. It is passed unchanged to <c>TitleExtractor.Extract</c> and
/// <c>WordScraper.ScrapeToParagraphs</c>.
/// </param>
/// <returns>
/// A <see cref="KeywordAnalysis"/> with <c>Content</c>, <c>WordCount</c>, <c>Paragraphs</c>,
/// <c>Titles</c> and <c>Keywords</c> populated. Keywords are added from the highest score to the
/// lowest, one entry per distinct term or title text; no upper limit is applied.
/// </returns>
public KeywordAnalysis Analyze(string content)
{
    // SortedDictionary cannot hold duplicate keys, so a colliding score is lowered by this step
    // until it is unique. NOTE(review): this slightly perturbs tied scores, and the perturbed
    // value is what ends up in Keyword.Rank; kept as-is because callers may rely on it.
    const decimal TieBreakStep = 0.00001m;

    KeywordAnalysis analysis = new KeywordAnalysis { Content = content };

    int wordCount = 0;
    var titles = TitleExtractor.Extract(content);
    var paragraphs = WordScraper.ScrapeToParagraphs(content, out wordCount);

    // Flatten every word of every sentence into one list; used below to map stem terms back to terms.
    List<Word> allWords = new List<Word>();
    paragraphs.ForEach(p => p.Sentences.ForEach(s => allWords.AddRange(s.Words)));

    analysis.WordCount = wordCount;
    analysis.Paragraphs = paragraphs;
    analysis.Titles = titles;

    int termTotal = 0;

    // Occurrence counts per term (per the original note: two- and three-word segments of each sentence).
    Dictionary<string, int> termOccurrenceCounts = GetWordTermOccurence(paragraphs);
    Dictionary<string, int> termNw = new Dictionary<string, int>();
    Dictionary<string, decimal> termsX2 = new Dictionary<string, decimal>();

    // Yields termsG (the frequent terms, keyed by a decimal) and an initialized termsX2.
    SortedDictionary<decimal, string> termsG = SortTermsIntoProbabilities(termOccurrenceCounts, ref termsX2, ref termTotal);

    // Fills termNw (by ref) and returns termPg.
    Dictionary<string, decimal> termPg = FillTermPgNwCollections(paragraphs, termsG, ref termNw, ref termTotal);

    // Per-term-pair values, indexed as termFwg[w][g].
    Dictionary<string, Dictionary<string, decimal>> termFwg = FillTermFwgCollection(paragraphs, termsG);

    // termsG is sorted ascending by key, so the last element of terms carries the largest key.
    string[] terms = new string[termsG.Count];
    termsG.Values.CopyTo(terms, 0);

    // For each term w, accumulate (Fwg - nw*Pg)^2 / (nw*Pg) over every other term g,
    // skipping the last (largest-key) entry of terms.
    foreach (string w in terms)
    {
        decimal sumZ = 0;
        for (int i = 0; i < terms.Length - 1; i++)
        {
            string g = terms[i];
            if (w == g)
            {
                continue; // diagonal: a term is not compared with itself
            }

            int nw = termNw[w];
            decimal pg = termPg[g];
            decimal expected = nw * pg;
            if (expected != 0.0m) // avoid dividing by zero
            {
                decimal fwg = termFwg[w][g];
                decimal delta = fwg - expected;
                sumZ += (delta * delta) / expected;
            }
        }

        termsX2[w] = sumZ;
    }

    // Order the stem terms by X2 score (ascending), breaking ties with TieBreakStep.
    SortedDictionary<decimal, string> sortedX2 = new SortedDictionary<decimal, string>();
    foreach (KeyValuePair<string, decimal> pair in termsX2)
    {
        decimal x2 = pair.Value;
        while (sortedX2.ContainsKey(x2))
        {
            x2 -= TieBreakStep;
        }

        sortedX2.Add(x2, pair.Key);
    }

    // Walk the stem terms from highest to lowest X2 and map each back to its term. When several
    // stem terms resolve to the same term, the one visited last (lowest X2) supplies the score.
    Dictionary<string, decimal> preres = new Dictionary<string, decimal>();
    foreach (string stemTerm in sortedX2.Values.Reverse())
    {
        string term = GetTermFromStemTerm(allWords, stemTerm);
        preres[term] = termsX2[stemTerm];
    }

    // Score each title as the sum of the scores of the words matched in its text.
    // Case-list words get no extra post-processing here; per the original note they were already
    // split and added to the sentence end for ranking.
    SortedDictionary<decimal, string> tsort = new SortedDictionary<decimal, string>();
    foreach (var title in titles)
    {
        decimal tscore = 0.0m;
        MatchCollection mc = WordScraper.WordReg.Matches(title.Text);
        foreach (Match m in mc)
        {
            decimal wordScore;
            if (preres.TryGetValue(m.Value, out wordScore))
            {
                tscore += wordScore;
            }
        }

        while (tsort.ContainsKey(tscore))
        {
            tscore -= TieBreakStep;
        }

        tsort.Add(tscore, title.Text);
    }

    // Merge the individual term scores into the same sorted collection as the titles.
    foreach (KeyValuePair<string, decimal> pre in preres)
    {
        decimal x = pre.Value;
        while (tsort.ContainsKey(x))
        {
            x -= TieBreakStep;
        }

        tsort.Add(x, pre.Key);
    }

    // Highest score first; a text that appears both as a title and as a term keeps its highest score.
    // No cap is applied (the former top-50 cut-off was disabled).
    Dictionary<string, decimal> result = new Dictionary<string, decimal>();
    foreach (KeyValuePair<decimal, string> entry in tsort.Reverse())
    {
        if (!result.ContainsKey(entry.Value))
        {
            result.Add(entry.Value, entry.Key);
        }
    }

    // Materialize once so every enumeration of Keywords sees the same Keyword instances
    // instead of re-running the projection and creating new objects each time.
    analysis.Keywords = result
        .Select(n => new Keyword { Word = n.Key, Rank = n.Value })
        .ToList();

    return analysis;
}