/// <summary>
/// Refreshes the n-gram, word, charset and similarity list views (and the three
/// "max rank" text boxes) for the language currently selected in <c>cbTables</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event arguments (not used).</param>
private void cbTables_SelectedIndexChanged(object sender, EventArgs e)
{
    // SelectedItem is null while nothing is selected; in that case (or when the
    // model entry is not a TokenTable) the views are simply emptied.
    CultureInfo culture = cbTables.SelectedItem as CultureInfo;
    string lang = null;
    ITokenTable table = null;
    if (culture != null)
    {
        lang = culture.TwoLetterISOLanguageName;
        table = this.m_Model[lang] as TokenTable;
    }

    int maxRank = 0;
    int maxWordRank = 0;
    int maxCharRank = 0;

    lvNGrams.BeginUpdate();
    lvWords.BeginUpdate();
    listView3.BeginUpdate();
    lvCharset.BeginUpdate();
    try
    {
        lvNGrams.Items.Clear();
        lvWords.Items.Clear();
        listView3.Items.Clear();
        lvCharset.Items.Clear();

        if (table != null)
        {
            maxRank = FillTokenListView(lvNGrams, table, true);
            maxWordRank = FillTokenListView(lvWords, table.WordTable, false);
            maxCharRank = FillTokenListView(lvCharset, table.CharsetTable, false);

            // Score the selected table against every table of the model.
            double maxScore = 0;
            List<TableVoter> tables = new List<TableVoter>();
            foreach (string key in this.m_Model.Keys)
            {
                TableVoter tableVoter = new TableVoter(key, table.ComparisonScore(this.m_Model[key], 0));
                maxScore = Math.Max(maxScore, tableVoter.Score);
                tables.Add(tableVoter);
            }

            tables.Sort();

            foreach (TableVoter voter in tables)
            {
                if (voter.Language == lang)
                {
                    continue;
                }

                // Normalise against the best score; skip the division when no
                // table scored above zero (it would yield NaN/Infinity).
                if (maxScore > 0)
                {
                    voter.Score /= maxScore;
                }

                voter.Score = Math.Max(0, 100 - (voter.Score * 150));

                // from 90 on there is not enough similarity to be wrongly detected
                if (voter.Score > 90)
                {
                    continue;
                }

                ListViewItem item = new ListViewItem(voter.Language);
                item.SubItems.Add(GetLanguageCulture(voter.Language).DisplayName);
                item.SubItems.Add(voter.Score.ToString("0.00"));
                listView3.Items.Add(item);
            }
        }

        this.tbTokenRanks.Text = maxRank.ToString();
        this.tbWordRanks.Text = maxWordRank.ToString();
        this.tbCharRanks.Text = maxCharRank.ToString();
    }
    finally
    {
        // Always re-enable painting, even if filling a view failed.
        lvNGrams.EndUpdate();
        lvWords.EndUpdate();
        listView3.EndUpdate();
        lvCharset.EndUpdate();
    }
}

/// <summary>
/// Adds one row per token of <paramref name="tokens"/> to <paramref name="view"/>
/// (position, token, rank, occurrences and optionally a hex dump of the token).
/// </summary>
/// <param name="view">List view that receives the rows.</param>
/// <param name="tokens">Token table to display; <c>null</c> adds nothing.</param>
/// <param name="includeHex">Whether to append the hex-dump column.</param>
/// <returns>The highest rank found in the table, or 0 when there are no tokens.</returns>
private static int FillTokenListView(ListView view, ITokenTable tokens, bool includeHex)
{
    int highestRank = 0;
    if (tokens == null)
    {
        return highestRank;
    }

    foreach (string key in tokens.Keys)
    {
        ITokenStats stats = tokens[key];
        highestRank = Math.Max(highestRank, stats.Rank);

        ListViewItem item = new ListViewItem(stats.Position.ToString());
        item.SubItems.Add(stats.Token);
        item.SubItems.Add(stats.Rank.ToString());
        item.SubItems.Add(stats.Occurences.ToString());
        if (includeHex)
        {
            item.SubItems.Add(FormatTokenAsHex(stats.Token));
        }

        view.Items.Add(item);
    }

    return highestRank;
}

/// <summary>
/// Renders each character of <paramref name="token"/> as space-separated lowercase
/// hex: two digits for code units up to 0xFF, four digits otherwise. Digits are
/// zero-padded so that e.g. U+0105 reads "0105" rather than the ambiguous "15".
/// </summary>
private static string FormatTokenAsHex(string token)
{
    StringBuilder hex = new StringBuilder();
    foreach (char c in token)
    {
        if (hex.Length > 0)
        {
            hex.Append(' ');
        }

        hex.Append(((int)c).ToString(c > 255 ? "x4" : "x2", CultureInfo.InvariantCulture));
    }

    return hex.ToString();
}
/// <summary>
/// Classifies <paramref name="text"/> against the enabled category tables and returns
/// the best matching categories with relative (not absolute) confidence percentages.
/// </summary>
/// <remarks>
/// The classification runs in three passes: a charset comparison that pre-filters the
/// categories, a word comparison on the surviving categories, and - when more than one
/// candidate remains - a full table comparison used to disambiguate them.
/// </remarks>
/// <param name="text">The text to classify.</param>
/// <param name="maxResults">Maximum number of categories to return.</param>
/// <returns>
/// A sorted category list whose scores sum to 100 when any candidate has a positive
/// score; an empty list when no category matches at all.
/// </returns>
public ICategoryList ClassifyText(string text, int maxResults)
{
#if DIALOGUEMASTER
    // UTC so that a daylight-saving switch cannot distort the measured duration.
    long startTime = DateTime.UtcNow.Ticks;
#endif
    ListDictionary results = new ListDictionary();
    Dictionary<string, double> scores = new Dictionary<string, double>();
    TokenTable tblTest = new TokenTable(text);
    double maxScore = 0;
    double threshold = 0;
    List<TokenVoter> charsetVoters = new List<TokenVoter>();

    // collect stats based on charset (first filter)
    foreach (string category in this.Keys)
    {
        ITokenTable catTable = this[category];
        if (!catTable.Enabled)
        {
            continue;
        }

        double score = catTable.CharsetTable.CharsetComparisonScore(tblTest.CharsetTable, threshold);
        if (score > maxScore)
        {
            maxScore = score;
            threshold = maxScore * this.m_Threshold;
        }

        if (score > threshold)
        {
            charsetVoters.Add(new TokenVoter(category, score));
        }
    }

    // chinese does not have a "Charset"... so to be sure it is kept as a candidate.
    // It is only added when not already present: a duplicate category would make
    // scores.Add() below throw an ArgumentException.
    if ((charsetVoters.Count < 3) && this.Keys.Contains("zh"))
    {
        bool hasChinese = false;
        foreach (TokenVoter existing in charsetVoters)
        {
            if (existing.Category == "zh")
            {
                hasChinese = true;
                break;
            }
        }

        if (!hasChinese)
        {
            charsetVoters.Add(new TokenVoter("zh"));
        }
    }

    charsetVoters.Sort();

    // Drop everything below the final threshold (it may have risen during the scan).
    for (int i = charsetVoters.Count - 1; i >= 0; i--)
    {
        if (charsetVoters[i].Score < threshold)
        {
            charsetVoters.RemoveAt(i);
        }
    }

    // collect word scores for each table that survived the charset filter
    maxScore = 0;
    threshold = 0;
    int maxWordHits = 0;
    foreach (TokenVoter charVoter in charsetVoters)
    {
        ITokenTable catTable = this[charVoter.Category];
        if (!catTable.Enabled)
        {
            continue;
        }

        int hits = 0;
        double score = catTable.WordComparisonScore(tblTest, threshold, ref hits);
        if (hits > maxWordHits)
        {
            maxWordHits = hits;
        }

        if (score > maxScore)
        {
            maxScore = score;
            threshold = maxScore * this.m_Threshold;
        }

        if (score > threshold)
        {
            scores.Add(charVoter.Category, score);
        }
    }

    double sumScore = 0;
    List<TokenVoter> voters = new List<TokenVoter>();

    // No word-level match: fall back to the charset voters, if there are any.
    // NOTE(review): taking element 0 as the maximum assumes TokenVoter sorts
    // best-score-first - confirm against TokenVoter.CompareTo.
    if ((scores.Count == 0) && (charsetVoters.Count > 0))
    {
        maxScore = charsetVoters[0].Score;
        foreach (TokenVoter charVoter in charsetVoters)
        {
            scores.Add(charVoter.Category, charVoter.Score);
        }
    }

    threshold = maxScore * m_Threshold;

    // copy the scores above the threshold to the voters list, scaled to the best score
    foreach (KeyValuePair<string, double> entry in scores)
    {
        double score = entry.Value;
        if (score > threshold)
        {
            score /= maxScore;
            if (maxWordHits > 0)
            {
                score /= maxWordHits;
            }

            score *= 100;
            sumScore += score;
            voters.Add(new TokenVoter(entry.Key, score));
        }
    }

    if ((voters.Count > 1) && (sumScore > 0))
    {
        voters.Sort();

        // cleanup voters and rebalance if there are more than m_MaxVoters voters:
        // the score of the first dropped voter is subtracted from every kept one.
        if (voters.Count > m_MaxVoters)
        {
            double cutoffScore = voters[m_MaxVoters].Score;
            sumScore = 0;
            for (int i = 0; i < m_MaxVoters; i++)
            {
                voters[i].Score -= cutoffScore;
                sumScore += voters[i].Score;
            }

            voters.RemoveRange(m_MaxVoters, voters.Count - m_MaxVoters);
        }
    }

    // now normalize results..
    // the results are not an absolute confidence but relative
    if (voters.Count == 1)
    {
        // only one voter, so we are 100% sure that's the one!
        results.Add(voters[0].Category, new ScoreHolder(100));
    }
    else
    {
        foreach (TokenVoter voter in voters)
        {
            double percScore = sumScore > 0 ? (voter.Score / sumScore) * 100 : 0;
            results.Add(voter.Category, new ScoreHolder(percScore));
        }

        // if we have more than one possible result we will try to
        // disambiguate it by checking for very common words
        if (results.Count > 1)
        {
            scores.Clear();
            threshold = 0;

            // collect scores for each remaining candidate table
            foreach (string category in results.Keys)
            {
                ITokenTable catTable = this[category];
                double score = catTable.ComparisonScore(tblTest, threshold);
                if (score > 0)
                {
                    scores.Add(category, score);
                }
            }

            // got results?
            if (scores.Count > 0)
            {
                sumScore = 0;
                voters.Clear();
                foreach (KeyValuePair<string, double> entry in scores)
                {
                    sumScore += entry.Value;
                    voters.Add(new TokenVoter(entry.Key, entry.Value));
                }

                voters.Sort();

                if (voters.Count == 1)
                {
                    // only one table matched: DevideScore(0.75) is applied to all others
                    foreach (string category in results.Keys)
                    {
                        if (category != voters[0].Category)
                        {
                            ((ScoreHolder)results[category]).DevideScore(0.75);
                        }
                    }
                }
                else
                {
                    // sumScore > 0 here: only strictly positive scores were collected
                    foreach (TokenVoter voter in voters)
                    {
                        double percScore = (voter.Score / sumScore) * 200;
                        ((ScoreHolder)results[voter.Category]).AddScore(percScore);
                    }

                    foreach (string category in results.Keys)
                    {
                        ((ScoreHolder)results[category]).DevideScore(0.75);
                    }
                }
            }
        }
    }

    // now build a proper result..
    voters.Clear();
    foreach (string key in results.Keys)
    {
        voters.Add(new TokenVoter(key, ((ScoreHolder)results[key]).Score));
    }

    voters.Sort();

    // reduce to maximum results
    if (voters.Count > maxResults)
    {
        voters.RemoveRange(maxResults, voters.Count - maxResults);
    }

    // re-weight so the remaining scores add up to 100; when every remaining
    // score is zero the scores stay 0 instead of turning into NaN.
    double dSumScore = 0;
    foreach (TokenVoter voter in voters)
    {
        dSumScore += voter.Score;
    }

    results.Clear();
    foreach (TokenVoter voter in voters)
    {
        double weighted = dSumScore > 0 ? (voter.Score / dSumScore) * 100 : 0;
        results.Add(voter.Category, new ScoreHolder(weighted));
    }

    CategoryList result = new CategoryList();
    foreach (string category in results.Keys)
    {
        result.Add(new Category(category, ((ScoreHolder)results[category]).Score));
    }

    result.Sort();

#if DIALOGUEMASTER
    if (UseCounters)
    {
        m_Counters.Classifications.Increment();
        m_Counters.ClassificationsPerSecond.Increment();
        m_Counters.ComparisonTime.IncrementBy(DateTime.UtcNow.Ticks - startTime);
        m_Counters.ComparisonTimeBase.Increment();
    }
#endif

    tblTest.Clear();
    return result;
}