/// <summary>
/// Handles a selection change of the comparison-language combo box.
/// Clears the n-gram, word and charset list views and, when a language is
/// selected, refills them by comparing the text in <c>tbSource</c> against the
/// token table stored in <c>m_Model</c> for that language.
/// </summary>
/// <remarks>
/// Every list row shows the token, the absolute rank distance between the test
/// text and the comparison table, and the token's occurrence count in the test
/// text. Only tokens known to the comparison table (rank != -1) are listed.
/// NOTE(review): the original code prepared a <c>TokenStats</c> for unknown
/// tokens but never added it to the list; that behaviour is kept here -
/// confirm whether unknown tokens were meant to be displayed.
/// </remarks>
/// <param name="sender">Source of the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void cbLangCompare_SelectedIndexChanged(object sender, EventArgs e)
{
    this.lvAnalyzeCharsetResult.BeginUpdate();
    this.lvAnalyzeNGramsResult.BeginUpdate();
    this.lvAnalyzeWordResult.BeginUpdate();

    // try/finally guarantees that redrawing is re-enabled even if building
    // the token table or any scoring call throws.
    try
    {
        this.lvAnalyzeCharsetResult.Items.Clear();
        this.lvAnalyzeNGramsResult.Items.Clear();
        this.lvAnalyzeWordResult.Items.Clear();

        if (this.cbLangCompare.SelectedIndex != -1)
        {
            TokenTable testTable = new TokenTable(this.tbSource.Text);
            ITokenTable compareTable = this.m_Model[((CultureInfo)cbLangCompare.SelectedItem).TwoLetterISOLanguageName];

            // ---- n-grams -------------------------------------------------
            List<TokenStats> scores = new List<TokenStats>();
            int score = compareTable.Count * testTable.Count;
            foreach (ITokenStats test in testTable.Values)
            {
                int otherRank = compareTable.RankOf(test.Token);
                if (otherRank == -1)
                {
                    // token unknown to the comparison table: subtract the
                    // table size as penalty, nothing to display
                    score -= compareTable.Count;
                    continue;
                }

                int distance = System.Math.Abs(test.Rank - otherRank);
                TokenStats entry = new TokenStats(test.Token);
                entry.Occurences = distance;
                // the Rank field is (ab)used to carry the occurrence count
                entry.Rank = test.Occurences;
                score -= distance;
                scores.Add(entry);
            }

            tbSumTokens.Text = score.ToString();
            scores.Sort();
            // rows are added in reverse of the TokenStats sort order
            for (int i = scores.Count - 1; i >= 0; i--)
            {
                lvAnalyzeNGramsResult.Items.Add(CreateScoreItem(scores[i]));
            }

            // ---- words ---------------------------------------------------
            scores = new List<TokenStats>();
            foreach (ITokenStats test in testTable.WordTable.Values)
            {
                int otherRank = compareTable.WordTable.RankOf(test.Token);
                if (otherRank == -1)
                {
                    continue;
                }

                TokenStats entry = new TokenStats(test.Token);
                entry.Occurences = System.Math.Abs(test.Rank - otherRank);
                entry.Rank = test.Occurences;
                scores.Add(entry);
            }

            // the displayed word score comes from the table itself
            int hits = 0;
            Double wsScore = compareTable.WordComparisonScore(testTable, 0, ref hits);
            tbSumWords.Text = wsScore.ToString("0.00") + " (" + hits + ")";
            scores.Sort();
            for (int i = scores.Count - 1; i >= 0; i--)
            {
                lvAnalyzeWordResult.Items.Add(CreateScoreItem(scores[i]));
            }

            // ---- charset -------------------------------------------------
            scores = new List<TokenStats>();
            score = compareTable.CharsetTable.Count * testTable.CharsetTable.Count;
            foreach (ITokenStats test in testTable.CharsetTable.Values)
            {
                int otherRank = compareTable.CharsetTable.RankOf(test.Token);
                if (otherRank == -1)
                {
                    // unlike the n-gram pass, unknown characters are not
                    // penalised here (behaviour of the original code)
                    continue;
                }

                int distance = System.Math.Abs(test.Rank - otherRank);
                TokenStats entry = new TokenStats(test.Token);
                entry.Occurences = distance;
                entry.Rank = test.Occurences;
                score -= distance;
                scores.Add(entry);
            }

            textBox1.Text = score.ToString();
            scores.Sort();
            for (int i = scores.Count - 1; i >= 0; i--)
            {
                lvAnalyzeCharsetResult.Items.Add(CreateScoreItem(scores[i]));
            }
        }
    }
    finally
    {
        this.lvAnalyzeNGramsResult.EndUpdate();
        this.lvAnalyzeWordResult.EndUpdate();
        this.lvAnalyzeCharsetResult.EndUpdate();
    }
}

/// <summary>
/// Builds one list row for a comparison entry: token text followed by the
/// <c>Occurences</c> and <c>Rank</c> values as sub items.
/// </summary>
private static ListViewItem CreateScoreItem(TokenStats stats)
{
    ListViewItem item = new ListViewItem(stats.Token);
    item.SubItems.Add(stats.Occurences.ToString());
    item.SubItems.Add(stats.Rank.ToString());
    return item;
}
/// <summary>
/// Classifies <paramref name="text"/> against all enabled category tables of
/// this model and returns the best matching categories with relative scores.
/// </summary>
/// <remarks>
/// The classification runs in stages:
/// 1. charset comparison as a first filter,
/// 2. word comparison on the categories that survived the filter
///    (falling back to the charset scores if no word score qualifies),
/// 3. a full comparison pass to disambiguate when there is not exactly one
///    candidate,
/// 4. truncation to <paramref name="maxResults"/> and re-weighting so the
///    returned scores are percentages relative to each other.
/// The scores are relative confidences, not absolute ones.
/// NOTE(review): several steps keep the leading entries after
/// <c>Sort()</c>; this presumes <c>TokenVoter</c> sorts best-first - confirm
/// against its <c>IComparable</c> implementation.
/// </remarks>
/// <param name="text">The text to classify.</param>
/// <param name="maxResults">Maximum number of categories to return.</param>
/// <returns>
/// A sorted category list; empty when no category produced a usable score.
/// </returns>
public ICategoryList ClassifyText(string text, int maxResults)
{
#if DIALOGUEMASTER
    long startTime = DateTime.Now.Ticks;
#endif
    ListDictionary results = new ListDictionary();
    Dictionary<string, double> scores = new Dictionary<string, double>();
    TokenTable tblTest = new TokenTable(text);
    double maxScore = 0;
    double threshold = 0;
    List<TokenVoter> charsetVoters = new List<TokenVoter>();

    // collect stats based on charset (first filter); the threshold rises
    // with the best score seen so far
    foreach (string category in this.Keys)
    {
        ITokenTable catTable = this[category];
        if (!catTable.Enabled)
        {
            continue;
        }

        double score = catTable.CharsetTable.CharsetComparisonScore(tblTest.CharsetTable, threshold);
        if (score > maxScore)
        {
            maxScore = score;
            threshold = (maxScore * this.m_Threshold);
        }

        if (score > threshold)
        {
            charsetVoters.Add(new TokenVoter(category, score));
        }
    }

    // Chinese does not have a "charset"... so to be sure it is added as a
    // candidate whenever the filter left fewer than three voters
    if ((charsetVoters.Count < 3) && (this.Keys.Contains("zh")))
    {
        charsetVoters.Add(new TokenVoter("zh"));
    }

    charsetVoters.Sort();

    // drop voters that were accepted early but fall below the final threshold
    for (int i = charsetVoters.Count - 1; i > -1; i--)
    {
        if (charsetVoters[i].Score < threshold)
        {
            charsetVoters.RemoveAt(i);
        }
    }

    // collect word scores for each remaining table
    maxScore = 0;
    int maxWordHits = 0;
    threshold = 0;
    foreach (TokenVoter charVoter in charsetVoters)
    {
        ITokenTable catTable = this[charVoter.Category];
        if (!catTable.Enabled)
        {
            continue;
        }

        int hits = 0;
        double score = catTable.WordComparisonScore(tblTest, threshold, ref hits);
        if (hits > maxWordHits)
        {
            maxWordHits = hits;
        }

        if (score > maxScore)
        {
            maxScore = score;
            threshold = (maxScore * this.m_Threshold);
        }

        if (score > threshold)
        {
            scores.Add(charVoter.Category, score);
        }
    }

    double sumScore = 0;
    List<TokenVoter> voters = new List<TokenVoter>();

    // No usable word score: take the voters from the closest charset instead.
    // The Count check prevents an ArgumentOutOfRangeException when the
    // charset filter left no candidate at all (e.g. empty text or model);
    // in that case the method falls through and returns an empty list.
    if (scores.Count == 0 && charsetVoters.Count > 0)
    {
        maxScore = charsetVoters[0].Score;
        foreach (TokenVoter v in charsetVoters)
        {
            scores.Add(v.Category, v.Score);
        }
    }

    threshold = (maxScore * m_Threshold);

    // copy the qualifying scores to the voters list, scaled relative to the
    // best score (and to the best word-hit count when there were hits)
    foreach (string key in scores.Keys)
    {
        double score = scores[key];
        if (score > threshold)
        {
            score /= maxScore;
            if (maxWordHits > 0)
            {
                score /= maxWordHits;
            }

            score *= 100;
            sumScore += score;
            voters.Add(new TokenVoter(key, score));
        }
    }

    if (voters.Count > 1 && sumScore > 0)
    {
        voters.Sort();

        // cleanup voters and rebalance if there are more than m_MaxVoters:
        // the score of the first dropped voter becomes the new zero line
        if (voters.Count > m_MaxVoters)
        {
            sumScore = 0;
            for (int i = 0; i < m_MaxVoters; i++)
            {
                voters[i].Score -= voters[m_MaxVoters].Score;
                sumScore += voters[i].Score;
            }

            voters.RemoveRange(m_MaxVoters, voters.Count - m_MaxVoters);
        }
    }

    // now normalize results..
    // the results are not an absolute confidence but relative
    if (voters.Count == 1)
    {
        // only one voter, so we are 100% sure that's the one!
        results.Add(voters[0].Category, new ScoreHolder(100));
    }
    else
    {
        for (int i = 0; i < voters.Count; i++)
        {
            TokenVoter stats = voters[i];
            double percScore = sumScore > 0 ? (stats.Score / sumScore) * 100 : 0;
            results.Add(stats.Category, new ScoreHolder(percScore));
        }

        // if we have more than one possible result we try to disambiguate
        // by running the full comparison on the candidates
        if ((results.Count == 0) || (results.Count > 1))
        {
            scores.Clear();
            threshold = 0;

            foreach (string category in results.Keys)
            {
                ITokenTable catTable = (ITokenTable)this[category];
                double score = catTable.ComparisonScore(tblTest, threshold);
                if (score > 0)
                {
                    scores.Add(category, score);
                }
            }

            // got results?
            if (scores.Count > 0)
            {
                sumScore = 0;
                voters.Clear();
                foreach (string key in scores.Keys)
                {
                    sumScore += scores[key];
                    voters.Add(new TokenVoter(key, scores[key]));
                }

                voters.Sort();

                if (voters.Count == 1)
                {
                    // only one category scored: DevideScore(0.75) is applied
                    // to every other candidate
                    foreach (string category in results.Keys)
                    {
                        if (category != voters[0].Category)
                        {
                            ((ScoreHolder)results[category]).DevideScore(0.75);
                        }
                    }
                }
                else
                {
                    // sumScore > 0 here because only positive scores were kept
                    for (int i = 0; i < voters.Count; i++)
                    {
                        TokenVoter stats = voters[i];
                        double percScore = (stats.Score / sumScore) * 200;
                        ((ScoreHolder)results[stats.Category]).AddScore(percScore);
                    }

                    foreach (string category in results.Keys)
                    {
                        ((ScoreHolder)results[category]).DevideScore(0.75);
                    }
                }
            }
        }
    }

    // now build a proper result..
    voters.Clear();
    foreach (string key in results.Keys)
    {
        voters.Add(new TokenVoter(key, ((ScoreHolder)results[key]).Score));
    }

    voters.Sort();

    // reduce to maximum results
    if (voters.Count > maxResults)
    {
        voters.RemoveRange(maxResults, voters.Count - maxResults);
    }

    // re-weight so the remaining scores are percentages of their sum
    double dSumScore = 0;
    foreach (TokenVoter voter in voters)
    {
        dSumScore += voter.Score;
    }

    results.Clear();
    foreach (TokenVoter voter in voters)
    {
        // a zero sum would turn every score into NaN; report 0 instead
        double weighted = dSumScore != 0 ? (voter.Score / dSumScore) * 100 : 0;
        results.Add(voter.Category, new ScoreHolder(weighted));
    }

    CategoryList result = new CategoryList();
    foreach (string category in results.Keys)
    {
        result.Add(new Category(category, ((ScoreHolder)results[category]).Score));
    }

    result.Sort();

#if DIALOGUEMASTER
    if (UseCounters)
    {
        m_Counters.Classifications.Increment();
        m_Counters.ClassificationsPerSecond.Increment();
        m_Counters.ComparisonTime.IncrementBy(DateTime.Now.Ticks - startTime);
        m_Counters.ComparisonTimeBase.Increment();
    }
#endif

    tblTest.Clear();
    return (result);
}